      1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "X86ISelLowering.h"
     16 #include "Utils/X86ShuffleDecode.h"
     17 #include "X86CallingConv.h"
     18 #include "X86FrameLowering.h"
     19 #include "X86InstrBuilder.h"
     20 #include "X86MachineFunctionInfo.h"
     21 #include "X86ShuffleDecodeConstantPool.h"
     22 #include "X86TargetMachine.h"
     23 #include "X86TargetObjectFile.h"
     24 #include "llvm/ADT/SmallBitVector.h"
     25 #include "llvm/ADT/SmallSet.h"
     26 #include "llvm/ADT/Statistic.h"
     27 #include "llvm/ADT/StringExtras.h"
     28 #include "llvm/ADT/StringSwitch.h"
     29 #include "llvm/Analysis/EHPersonalities.h"
     30 #include "llvm/CodeGen/IntrinsicLowering.h"
     31 #include "llvm/CodeGen/MachineFrameInfo.h"
     32 #include "llvm/CodeGen/MachineFunction.h"
     33 #include "llvm/CodeGen/MachineInstrBuilder.h"
     34 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     35 #include "llvm/CodeGen/MachineModuleInfo.h"
     36 #include "llvm/CodeGen/MachineRegisterInfo.h"
     37 #include "llvm/CodeGen/WinEHFuncInfo.h"
     38 #include "llvm/IR/CallSite.h"
     39 #include "llvm/IR/CallingConv.h"
     40 #include "llvm/IR/Constants.h"
     41 #include "llvm/IR/DerivedTypes.h"
     42 #include "llvm/IR/Function.h"
     43 #include "llvm/IR/GlobalAlias.h"
     44 #include "llvm/IR/GlobalVariable.h"
     45 #include "llvm/IR/Instructions.h"
     46 #include "llvm/IR/Intrinsics.h"
     47 #include "llvm/MC/MCAsmInfo.h"
     48 #include "llvm/MC/MCContext.h"
     49 #include "llvm/MC/MCExpr.h"
     50 #include "llvm/MC/MCSymbol.h"
     51 #include "llvm/Support/CommandLine.h"
     52 #include "llvm/Support/Debug.h"
     53 #include "llvm/Support/ErrorHandling.h"
     54 #include "llvm/Support/MathExtras.h"
     55 #include "llvm/Target/TargetOptions.h"
     56 #include "X86IntrinsicsInfo.h"
     57 #include <bitset>
     58 #include <numeric>
     59 #include <cctype>
     60 using namespace llvm;
     61 
     62 #define DEBUG_TYPE "x86-isel"
     63 
     64 STATISTIC(NumTailCalls, "Number of tail calls");
     65 
     66 static cl::opt<bool> ExperimentalVectorWideningLegalization(
     67     "x86-experimental-vector-widening-legalization", cl::init(false),
     68     cl::desc("Enable an experimental vector type legalization through widening "
     69              "rather than promotion."),
     70     cl::Hidden);
     71 
     72 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     73                                      const X86Subtarget &STI)
     74     : TargetLowering(TM), Subtarget(STI) {
     75   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
     76   X86ScalarSSEf64 = Subtarget.hasSSE2();
     77   X86ScalarSSEf32 = Subtarget.hasSSE1();
     78   MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
     79 
     80   // Set up the TargetLowering object.
     81 
     82   // X86 is weird. It always uses i8 for shift amounts and setcc results.
     83   setBooleanContents(ZeroOrOneBooleanContent);
     84   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
     85   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
     86 
     87   // For 64-bit, since we have so many registers, use the ILP scheduler.
     88   // For 32-bit, use the register pressure specific scheduling.
     89   // For Atom, always use ILP scheduling.
     90   if (Subtarget.isAtom())
     91     setSchedulingPreference(Sched::ILP);
     92   else if (Subtarget.is64Bit())
     93     setSchedulingPreference(Sched::ILP);
     94   else
     95     setSchedulingPreference(Sched::RegPressure);
     96   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
     97   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
     98 
     99   // Bypass expensive divides on Atom when compiling with O2.
    100   if (TM.getOptLevel() >= CodeGenOpt::Default) {
    101     if (Subtarget.hasSlowDivide32())
    102       addBypassSlowDiv(32, 8);
    103     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
    104       addBypassSlowDiv(64, 16);
    105   }
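           // (The addBypassSlowDiv calls above let codegen insert a run-time check
           // and use the much cheaper 8-/16-bit divide when both operands happen to
           // fit in the narrower width.)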
    106 
    107   if (Subtarget.isTargetKnownWindowsMSVC()) {
    108     // Setup Windows compiler runtime calls.
    109     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    110     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    111     setLibcallName(RTLIB::SREM_I64, "_allrem");
    112     setLibcallName(RTLIB::UREM_I64, "_aullrem");
    113     setLibcallName(RTLIB::MUL_I64, "_allmul");
    114     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    115     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    116     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    117     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    118     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    119   }
    120 
    121   if (Subtarget.isTargetDarwin()) {
    122     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    123     setUseUnderscoreSetJmp(false);
    124     setUseUnderscoreLongJmp(false);
    125   } else if (Subtarget.isTargetWindowsGNU()) {
     126     // The MS runtime is weird: it exports _setjmp, but only plain longjmp (no underscore).
    127     setUseUnderscoreSetJmp(true);
    128     setUseUnderscoreLongJmp(false);
    129   } else {
    130     setUseUnderscoreSetJmp(true);
    131     setUseUnderscoreLongJmp(true);
    132   }
    133 
    134   // Set up the register classes.
    135   addRegisterClass(MVT::i8, &X86::GR8RegClass);
    136   addRegisterClass(MVT::i16, &X86::GR16RegClass);
    137   addRegisterClass(MVT::i32, &X86::GR32RegClass);
    138   if (Subtarget.is64Bit())
    139     addRegisterClass(MVT::i64, &X86::GR64RegClass);
    140 
    141   for (MVT VT : MVT::integer_valuetypes())
    142     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    143 
    144   // We don't accept any truncstore of integer registers.
    145   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    146   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    147   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    148   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    149   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
    150   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
    151 
    152   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    153 
    154   // SETOEQ and SETUNE require checking two conditions.
    155   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
    156   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
    157   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
    158   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
    159   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
    160   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
    161 
    162   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    163   // operation.
    164   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
    165   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
    166   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
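           // For example, a u8/u16 -> f32 conversion just zero-extends the operand
           // to i32 and reuses the signed conversion path, which is exact because
           // every u8/u16 value is a non-negative i32.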
    167 
    168   if (Subtarget.is64Bit()) {
    169     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
    170       // f32/f64 are legal, f80 is custom.
    171       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
    172     else
    173       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
    174     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    175   } else if (!Subtarget.useSoftFloat()) {
    176     // We have an algorithm for SSE2->double, and we turn this into a
    177     // 64-bit FILD followed by conditional FADD for other targets.
    178     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    179     // We have an algorithm for SSE2, and we turn this into a 64-bit
    180     // FILD or VCVTUSI2SS/SD for other targets.
    181     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    182   }
    183 
    184   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    185   // this operation.
    186   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    187   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
    188 
    189   if (!Subtarget.useSoftFloat()) {
    190     // SSE has no i16 to fp conversion, only i32
    191     if (X86ScalarSSEf32) {
    192       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    193       // f32 and f64 cases are Legal, f80 case is not
    194       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    195     } else {
    196       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    197       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    198     }
    199   } else {
    200     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    201     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
    202   }
    203 
     204   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
    205   // this operation.
    206   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
    207   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
    208 
    209   if (!Subtarget.useSoftFloat()) {
    210     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    211     // are Legal, f80 is custom lowered.
    212     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    213     setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    214 
    215     if (X86ScalarSSEf32) {
    216       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    217       // f32 and f64 cases are Legal, f80 case is not
    218       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    219     } else {
    220       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    221       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    222     }
    223   } else {
    224     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    225     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
    226     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
    227   }
    228 
    229   // Handle FP_TO_UINT by promoting the destination to a larger signed
    230   // conversion.
    231   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
    232   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
    233   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
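           // For example, f32 -> u16 is lowered as f32 -> s32 followed by a
           // truncate, which is safe because the whole u16 range fits in a signed
           // i32.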
    234 
    235   if (Subtarget.is64Bit()) {
    236     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    237       // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
    238       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    239       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
    240     } else {
    241       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
    242       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
    243     }
    244   } else if (!Subtarget.useSoftFloat()) {
    245     // Since AVX is a superset of SSE3, only check for SSE here.
    246     if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
    247       // Expand FP_TO_UINT into a select.
    248       // FIXME: We would like to use a Custom expander here eventually to do
    249       // the optimal thing for SSE vs. the default expansion in the legalizer.
    250       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    251     else
    252       // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
    253       // With SSE3 we can use fisttpll to convert to a signed i64; without
    254       // SSE, we're stuck with a fistpll.
    255       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    256 
    257     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
    258   }
    259 
     260   // TODO: when we have SSE, these could be more efficient by using movd/movq.
    261   if (!X86ScalarSSEf64) {
    262     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    263     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    264     if (Subtarget.is64Bit()) {
    265       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
    266       // Without SSE, i64->f64 goes through memory.
    267       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    268     }
    269   } else if (!Subtarget.is64Bit())
    270     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
    271 
    272   // Scalar integer divide and remainder are lowered to use operations that
    273   // produce two results, to match the available instructions. This exposes
    274   // the two-result form to trivial CSE, which is able to combine x/y and x%y
    275   // into a single instruction.
    276   //
    277   // Scalar integer multiply-high is also lowered to use two-result
    278   // operations, to match the available instructions. However, plain multiply
    279   // (low) operations are left as Legal, as there are single-result
    280   // instructions for this in x86. Using the two-result multiply instructions
    281   // when both high and low results are needed must be arranged by dagcombine.
    282   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    283     setOperationAction(ISD::MULHS, VT, Expand);
    284     setOperationAction(ISD::MULHU, VT, Expand);
    285     setOperationAction(ISD::SDIV, VT, Expand);
    286     setOperationAction(ISD::UDIV, VT, Expand);
    287     setOperationAction(ISD::SREM, VT, Expand);
    288     setOperationAction(ISD::UREM, VT, Expand);
    289 
     290     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependencies.
    291     setOperationAction(ISD::ADDC, VT, Custom);
    292     setOperationAction(ISD::ADDE, VT, Custom);
    293     setOperationAction(ISD::SUBC, VT, Custom);
    294     setOperationAction(ISD::SUBE, VT, Custom);
    295   }
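           // For example, when a function computes both x / y and x % y, each of
           // the two ops expands to an ISD::SDIVREM node, CSE merges them, and the
           // result selects to a single IDIV/DIV (quotient in EAX, remainder in EDX
           // for the i32 case).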
    296 
    297   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
    298   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
    299   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
    300                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
    301     setOperationAction(ISD::BR_CC,     VT, Expand);
    302     setOperationAction(ISD::SELECT_CC, VT, Expand);
    303   }
    304   if (Subtarget.is64Bit())
    305     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    306   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
    307   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
    308   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
    309   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
    310 
    311   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
    312   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
    313   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    314   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
    315 
     316   // Promote the i8 variants and force them up to i32, which has a shorter
    317   // encoding.
    318   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
    319   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    320   if (!Subtarget.hasBMI()) {
    321     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    322     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    323     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
    324     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
    325     if (Subtarget.is64Bit()) {
    326       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    327       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    328     }
    329   }
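           // Without BMI there is no TZCNT, and plain BSF leaves its destination
           // undefined for a zero input, so CTTZ is custom lowered to add a CMOV
           // supplying the bit width for zero; CTTZ_ZERO_UNDEF maps directly onto
           // BSF.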
    330 
    331   if (Subtarget.hasLZCNT()) {
    332     // When promoting the i8 variants, force them to i32 for a shorter
    333     // encoding.
    334     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
    335     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    336   } else {
    337     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    338     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    339     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    340     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    341     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    342     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    343     if (Subtarget.is64Bit()) {
    344       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
    345       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    346     }
    347   }
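           // Without LZCNT, CTLZ is custom lowered with BSR: the index of the
           // highest set bit is XORed with (bit width - 1) to form the leading-zero
           // count, and the plain CTLZ form additionally guards the zero input with
           // a CMOV.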
    348 
    349   // Special handling for half-precision floating point conversions.
    350   // If we don't have F16C support, then lower half float conversions
    351   // into library calls.
    352   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    353     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    354     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    355   }
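           // With the Expand action these become calls to the soft-float half
           // conversion helpers (__gnu_h2f_ieee / __gnu_f2h_ieee by default).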
    356 
    357   // There's never any support for operations beyond MVT::f32.
    358   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
    359   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
    360   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    361   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
    362 
    363   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
    364   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
    365   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
    366   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
    367   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
    368   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
    369 
    370   if (Subtarget.hasPOPCNT()) {
    371     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    372   } else {
    373     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    374     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    375     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    376     if (Subtarget.is64Bit())
    377       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    378   }
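           // Without POPCNT the Expand action falls back to the legalizer's generic
           // shift-and-mask bit-counting expansion.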
    379 
    380   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    381 
    382   if (!Subtarget.hasMOVBE())
    383     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
    384 
    385   // These should be promoted to a larger select which is supported.
    386   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
    387   // X86 wants to expand cmov itself.
    388   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    389     setOperationAction(ISD::SELECT, VT, Custom);
    390     setOperationAction(ISD::SETCC, VT, Custom);
    391   }
    392   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    393     if (VT == MVT::i64 && !Subtarget.is64Bit())
    394       continue;
    395     setOperationAction(ISD::SELECT, VT, Custom);
    396     setOperationAction(ISD::SETCC,  VT, Custom);
    397     setOperationAction(ISD::SETCCE, VT, Custom);
    398   }
    399   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
     400   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are supported here NOT to implement SjLj
     401   // exception handling, but as a light-weight setjmp/longjmp replacement used
     402   // for continuations, user-level threading, etc. As a result, no other SjLj
     403   // exception interfaces are implemented; please don't build your own
     404   // exception handling on top of them.
    405   // LLVM/Clang supports zero-cost DWARF exception handling.
    406   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
    407   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
    408   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
    409   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    410     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
    411 
    412   // Darwin ABI issue.
    413   for (auto VT : { MVT::i32, MVT::i64 }) {
    414     if (VT == MVT::i64 && !Subtarget.is64Bit())
    415       continue;
    416     setOperationAction(ISD::ConstantPool    , VT, Custom);
    417     setOperationAction(ISD::JumpTable       , VT, Custom);
    418     setOperationAction(ISD::GlobalAddress   , VT, Custom);
    419     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    420     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
    421     setOperationAction(ISD::BlockAddress    , VT, Custom);
    422   }
     423   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
    424   for (auto VT : { MVT::i32, MVT::i64 }) {
    425     if (VT == MVT::i64 && !Subtarget.is64Bit())
    426       continue;
    427     setOperationAction(ISD::SHL_PARTS, VT, Custom);
    428     setOperationAction(ISD::SRA_PARTS, VT, Custom);
    429     setOperationAction(ISD::SRL_PARTS, VT, Custom);
    430   }
    431 
    432   if (Subtarget.hasSSE1())
    433     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
    434 
    435   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
    436 
    437   // Expand certain atomics
    438   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    439     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    440     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    441     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    442     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    443     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    444     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    445     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    446   }
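           // The custom handling lets, for example, an atomic sub be rewritten as
           // an atomic add of the negated value, and lets read-modify-write updates
           // whose result is unused be selected as single LOCK-prefixed
           // instructions.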
    447 
    448   if (Subtarget.hasCmpxchg16b()) {
    449     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
    450   }
    451 
    452   // FIXME - use subtarget debug flags
    453   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
    454       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
    455       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    456     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
    457   }
    458 
    459   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
    460   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
    461 
    462   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
    463   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
    464 
    465   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    466   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
    467 
    468   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    469   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    470   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    471   bool Is64Bit = Subtarget.is64Bit();
    472   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
    473   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
    474 
    475   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    476   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
    477 
    478   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
    479 
    480   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
    481   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
    482   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
    483 
    484   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    485     // f32 and f64 use SSE.
    486     // Set up the FP register classes.
    487     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    488     addRegisterClass(MVT::f64, &X86::FR64RegClass);
    489 
    490     for (auto VT : { MVT::f32, MVT::f64 }) {
    491       // Use ANDPD to simulate FABS.
    492       setOperationAction(ISD::FABS, VT, Custom);
    493 
    494       // Use XORP to simulate FNEG.
    495       setOperationAction(ISD::FNEG, VT, Custom);
    496 
    497       // Use ANDPD and ORPD to simulate FCOPYSIGN.
    498       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    499 
    500       // We don't support sin/cos/fmod
    501       setOperationAction(ISD::FSIN   , VT, Expand);
    502       setOperationAction(ISD::FCOS   , VT, Expand);
    503       setOperationAction(ISD::FSINCOS, VT, Expand);
    504     }
    505 
     506     // Lower these to a MOVMSK plus an AND.
    507     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    508     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    509 
    510     // Expand FP immediates into loads from the stack, except for the special
    511     // cases we handle.
    512     addLegalFPImmediate(APFloat(+0.0)); // xorpd
    513     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    514   } else if (UseX87 && X86ScalarSSEf32) {
    515     // Use SSE for f32, x87 for f64.
    516     // Set up the FP register classes.
    517     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    518     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    519 
    520     // Use ANDPS to simulate FABS.
    521     setOperationAction(ISD::FABS , MVT::f32, Custom);
    522 
    523     // Use XORP to simulate FNEG.
    524     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    525 
    526     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    527 
    528     // Use ANDPS and ORPS to simulate FCOPYSIGN.
    529     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    530     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    531 
    532     // We don't support sin/cos/fmod
    533     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    534     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    535     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    536 
    537     // Special cases we handle for FP constants.
    538     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    539     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    540     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    541     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    542     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    543 
    544     if (!TM.Options.UnsafeFPMath) {
    545       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    546       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    547       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    548     }
    549   } else if (UseX87) {
    550     // f32 and f64 in x87.
    551     // Set up the FP register classes.
    552     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    553     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
    554 
    555     for (auto VT : { MVT::f32, MVT::f64 }) {
    556       setOperationAction(ISD::UNDEF,     VT, Expand);
    557       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    558 
    559       if (!TM.Options.UnsafeFPMath) {
    560         setOperationAction(ISD::FSIN   , VT, Expand);
    561         setOperationAction(ISD::FCOS   , VT, Expand);
    562         setOperationAction(ISD::FSINCOS, VT, Expand);
    563       }
    564     }
    565     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    566     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    567     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    568     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    569     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    570     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    571     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    572     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    573   }
    574 
    575   // We don't support FMA.
    576   setOperationAction(ISD::FMA, MVT::f64, Expand);
    577   setOperationAction(ISD::FMA, MVT::f32, Expand);
    578 
     579   // Long double always uses X87; f128 is handled in SSE (FR128) registers when available.
    580   if (UseX87) {
    581     if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
    582       addRegisterClass(MVT::f128, &X86::FR128RegClass);
    583       ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
    584       setOperationAction(ISD::FABS , MVT::f128, Custom);
    585       setOperationAction(ISD::FNEG , MVT::f128, Custom);
    586       setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    587     }
    588 
    589     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    590     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    591     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    592     {
    593       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    594       addLegalFPImmediate(TmpFlt);  // FLD0
    595       TmpFlt.changeSign();
    596       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
    597 
    598       bool ignored;
    599       APFloat TmpFlt2(+1.0);
    600       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
    601                       &ignored);
    602       addLegalFPImmediate(TmpFlt2);  // FLD1
    603       TmpFlt2.changeSign();
    604       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    605     }
    606 
    607     if (!TM.Options.UnsafeFPMath) {
    608       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
    609       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
    610       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    611     }
    612 
    613     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    614     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    615     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    616     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    617     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    618     setOperationAction(ISD::FMA, MVT::f80, Expand);
    619   }
    620 
    621   // Always use a library call for pow.
    622   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
    623   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    624   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
    625 
    626   setOperationAction(ISD::FLOG, MVT::f80, Expand);
    627   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
    628   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    629   setOperationAction(ISD::FEXP, MVT::f80, Expand);
    630   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
    631   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
    632   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
    633 
    634   // Some FP actions are always expanded for vector types.
    635   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
    636                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    637     setOperationAction(ISD::FSIN,      VT, Expand);
    638     setOperationAction(ISD::FSINCOS,   VT, Expand);
    639     setOperationAction(ISD::FCOS,      VT, Expand);
    640     setOperationAction(ISD::FREM,      VT, Expand);
    641     setOperationAction(ISD::FPOWI,     VT, Expand);
    642     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    643     setOperationAction(ISD::FPOW,      VT, Expand);
    644     setOperationAction(ISD::FLOG,      VT, Expand);
    645     setOperationAction(ISD::FLOG2,     VT, Expand);
    646     setOperationAction(ISD::FLOG10,    VT, Expand);
    647     setOperationAction(ISD::FEXP,      VT, Expand);
    648     setOperationAction(ISD::FEXP2,     VT, Expand);
    649   }
    650 
    651   // First set operation action for all vector types to either promote
    652   // (for widening) or expand (for scalarization). Then we will selectively
    653   // turn on ones that can be effectively codegen'd.
    654   for (MVT VT : MVT::vector_valuetypes()) {
    655     setOperationAction(ISD::SDIV, VT, Expand);
    656     setOperationAction(ISD::UDIV, VT, Expand);
    657     setOperationAction(ISD::SREM, VT, Expand);
    658     setOperationAction(ISD::UREM, VT, Expand);
    659     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    660     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    661     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    662     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    663     setOperationAction(ISD::FMA,  VT, Expand);
    664     setOperationAction(ISD::FFLOOR, VT, Expand);
    665     setOperationAction(ISD::FCEIL, VT, Expand);
    666     setOperationAction(ISD::FTRUNC, VT, Expand);
    667     setOperationAction(ISD::FRINT, VT, Expand);
    668     setOperationAction(ISD::FNEARBYINT, VT, Expand);
    669     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    670     setOperationAction(ISD::MULHS, VT, Expand);
    671     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    672     setOperationAction(ISD::MULHU, VT, Expand);
    673     setOperationAction(ISD::SDIVREM, VT, Expand);
    674     setOperationAction(ISD::UDIVREM, VT, Expand);
    675     setOperationAction(ISD::CTPOP, VT, Expand);
    676     setOperationAction(ISD::CTTZ, VT, Expand);
    677     setOperationAction(ISD::CTLZ, VT, Expand);
    678     setOperationAction(ISD::ROTL, VT, Expand);
    679     setOperationAction(ISD::ROTR, VT, Expand);
    680     setOperationAction(ISD::BSWAP, VT, Expand);
    681     setOperationAction(ISD::SETCC, VT, Expand);
    682     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    683     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    684     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    685     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    686     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    687     setOperationAction(ISD::TRUNCATE, VT, Expand);
    688     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    689     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    690     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    691     setOperationAction(ISD::SELECT_CC, VT, Expand);
    692     for (MVT InnerVT : MVT::vector_valuetypes()) {
    693       setTruncStoreAction(InnerVT, VT, Expand);
    694 
    695       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
    696       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
    697 
    698       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
     699       // types; we have to deal with them whether we ask for Expansion or not.
    700       // Setting Expand causes its own optimisation problems though, so leave
    701       // them legal.
    702       if (VT.getVectorElementType() == MVT::i1)
    703         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    704 
    705       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
    706       // split/scalarized right now.
    707       if (VT.getVectorElementType() == MVT::f16)
    708         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    709     }
    710   }
    711 
     712   // FIXME: In order to prevent SSE instructions from being expanded to MMX ones
    713   // with -msoft-float, disable use of MMX as well.
    714   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    715     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
     716     // No operations on x86mmx are supported; everything uses intrinsics.
    717   }
    718 
    719   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    720     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
    721 
    722     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    723     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    724     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    725     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    726     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    727     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    728     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    729     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
    730   }
    731 
    732   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    733     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
    734 
    735     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    736     // registers cannot be used even for integer operations.
    737     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    738     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    739     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    740     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
    741 
    742     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
    743     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    744     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    745     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
    746     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
    747     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
    748     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
    749     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    750     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    751     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    752     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    753     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    754 
    755     setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
    756     setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
    757     setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
    758     setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
    759 
    760     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    761     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    762     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    763     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
    764 
    765     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    766     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    767     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    768     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    769     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    770 
    771     setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
    772     setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
    773     setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
    774     setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
    775 
    776     setOperationAction(ISD::CTTZ,               MVT::v16i8, Custom);
    777     setOperationAction(ISD::CTTZ,               MVT::v8i16, Custom);
    778     setOperationAction(ISD::CTTZ,               MVT::v4i32, Custom);
    779     // ISD::CTTZ v2i64 - scalarization is faster.
    780 
    781     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    782     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
    783       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
    784       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
    785       setOperationAction(ISD::VSELECT,            VT, Custom);
    786       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    787     }
    788 
    789     // We support custom legalizing of sext and anyext loads for specific
    790     // memory vector types which we can load as a scalar (or sequence of
    791     // scalars) and extend in-register to a legal 128-bit vector type. For sext
    792     // loads these must work with a single scalar load.
    793     for (MVT VT : MVT::integer_vector_valuetypes()) {
    794       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
    795       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
    796       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
    797       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
    798       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
    799       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
    800       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
    801       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
    802       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    803     }
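             // For example, a sextload of v4i8 to v4i32 can be done with one 32-bit
             // scalar load followed by an in-register sign extension (PMOVSXBD on
             // SSE4.1, or unpacks plus arithmetic shifts on plain SSE2) instead of
             // four scalar loads.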
    804 
    805     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
    806       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
    807       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
    808       setOperationAction(ISD::VSELECT,            VT, Custom);
    809 
    810       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
    811         continue;
    812 
    813       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
    814       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    815     }
    816 
    817     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    818     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
    819       setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
    820       setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
    821       setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
    822       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
    823       setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    824     }
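             // For example, a v4i32 AND is bitcast to v2i64, performed there, and
             // bitcast back, so a single PAND pattern covers all 128-bit integer
             // types.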
    825 
    826     // Custom lower v2i64 and v2f64 selects.
    827     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    828     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
    829 
    830     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    831     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    832 
    833     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
    834 
    835     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
    836     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
     837     // As there is no 64-bit GPR available, we need to build a special custom
    838     // sequence to convert from v2i32 to v2f32.
    839     if (!Subtarget.is64Bit())
    840       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
    841 
    842     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    843     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
    844 
    845     for (MVT VT : MVT::fp_vector_valuetypes())
    846       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
    847 
    848     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
    849     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
    850     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
    851 
    852     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    853     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    854     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
    855 
    856     for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
    857       setOperationAction(ISD::SRL, VT, Custom);
    858       setOperationAction(ISD::SHL, VT, Custom);
    859       setOperationAction(ISD::SRA, VT, Custom);
    860     }
    861 
    862     // In the customized shift lowering, the legal cases in AVX2 will be
    863     // recognized.
    864     for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
    865       setOperationAction(ISD::SRL, VT, Custom);
    866       setOperationAction(ISD::SHL, VT, Custom);
    867       setOperationAction(ISD::SRA, VT, Custom);
    868     }
    869   }
    870 
    871   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    872     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
    873     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
    874     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
    875     // ISD::CTLZ v4i32 - scalarization is faster.
    876     // ISD::CTLZ v2i64 - scalarization is faster.
    877   }
    878 
    879   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    880     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
    881       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
    882       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
    883       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
    884       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
    885       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
    886     }
    887 
    888     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
    889     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
    890     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
    891     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
    892     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
    893     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
    894     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
    895     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
    896 
    897     // FIXME: Do we need to handle scalar-to-vector here?
    898     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
    899 
    900     // We directly match byte blends in the backend as they match the VSELECT
    901     // condition form.
    902     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
    903 
    904     // SSE41 brings specific instructions for doing vector sign extend even in
    905     // cases where we don't have SRA.
    906     for (MVT VT : MVT::integer_vector_valuetypes()) {
    907       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
    908       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
    909       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    910     }
    911 
    912     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    913     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
    914     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
    915     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
    916     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    917     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    918     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    919 
    920     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
    921     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
    922     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
    923     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    924     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    925     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    926 
    927     // i8 vectors are custom because the source register and source
     928     // memory operand types are not the same width.
    929     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    930   }
    931 
    932   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    933     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
    934                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
    935       setOperationAction(ISD::ROTL, VT, Custom);
    936 
    937     // XOP can efficiently perform BITREVERSE with VPPERM.
    938     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
    939       setOperationAction(ISD::BITREVERSE, VT, Custom);
    940 
    941     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
    942                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
    943       setOperationAction(ISD::BITREVERSE, VT, Custom);
    944   }
    945 
    946   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    947     bool HasInt256 = Subtarget.hasInt256();
    948 
    949     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
    950     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    951     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
    952     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
    953     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
    954     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
    955 
    956     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
    957       setOperationAction(ISD::FFLOOR,     VT, Legal);
    958       setOperationAction(ISD::FCEIL,      VT, Legal);
    959       setOperationAction(ISD::FTRUNC,     VT, Legal);
    960       setOperationAction(ISD::FRINT,      VT, Legal);
    961       setOperationAction(ISD::FNEARBYINT, VT, Legal);
    962       setOperationAction(ISD::FNEG,       VT, Custom);
    963       setOperationAction(ISD::FABS,       VT, Custom);
    964     }
    965 
    966     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    967     // even though v8i16 is a legal type.
    968     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
    969     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
    970     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
    971 
    972     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
    973     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
    974     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
    975 
    976     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
    977     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
    978 
    979     for (MVT VT : MVT::fp_vector_valuetypes())
    980       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
    981 
    982     for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
    983       setOperationAction(ISD::SRL, VT, Custom);
    984       setOperationAction(ISD::SHL, VT, Custom);
    985       setOperationAction(ISD::SRA, VT, Custom);
    986     }
    987 
    988     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
    989     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
    990     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
    991     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
    992 
    993     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
    994     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
    995     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
    996 
    997     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
    998     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
    999     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
   1000     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
   1001     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
   1002     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
   1003     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
   1004     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
   1005     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
   1006     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
   1007     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
   1008     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
   1009     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
   1010 
   1011     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
   1012       setOperationAction(ISD::CTPOP,           VT, Custom);
   1013       setOperationAction(ISD::CTTZ,            VT, Custom);
   1014     }
   1015 
   1016     // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
   1017     // as we end up splitting the 256-bit vectors.
   1018     for (auto VT : { MVT::v32i8, MVT::v16i16 })
   1019       setOperationAction(ISD::CTLZ,            VT, Custom);
   1020 
   1021     if (HasInt256)
   1022       for (auto VT : { MVT::v8i32, MVT::v4i64 })
   1023         setOperationAction(ISD::CTLZ,          VT, Custom);
   1024 
   1025     if (Subtarget.hasAnyFMA()) {
   1026       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
   1027                        MVT::v2f64, MVT::v4f64 })
   1028         setOperationAction(ISD::FMA, VT, Legal);
   1029     }
   1030 
   1031     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
   1032       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
   1033       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
   1034     }
   1035 
   1036     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
   1037     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
   1038     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
   1039     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
   1040 
   1041     setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
   1042     setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
   1043 
   1044     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
   1045     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
   1046     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
   1047     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
   1048 
   1049     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
   1050       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
   1051       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
   1052       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
   1053       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
   1054     }
   1055 
   1056     if (HasInt256) {
   1057       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
   1058       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
   1059       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
   1060 
    1061       // The custom lowering of UINT_TO_FP for v8i32 becomes interesting
    1062       // when we have a 256-bit-wide blend with immediate.
   1063       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
   1064 
   1065       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
   1066       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
   1067       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
   1068       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
   1069       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
   1070       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
   1071       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
   1072 
   1073       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
   1074       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
   1075       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
   1076       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
   1077       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
   1078       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
   1079     }
   1080 
    1081     // In the custom shift lowering, the cases that are legal with AVX2 will be
    1082     // recognized.
   1083     for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
   1084       setOperationAction(ISD::SRL, VT, Custom);
   1085       setOperationAction(ISD::SHL, VT, Custom);
   1086       setOperationAction(ISD::SRA, VT, Custom);
   1087     }
   1088 
   1089     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
   1090                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
   1091       setOperationAction(ISD::MLOAD,  VT, Legal);
   1092       setOperationAction(ISD::MSTORE, VT, Legal);
   1093     }
   1094 
   1095     // Extract subvector is special because the value type
   1096     // (result) is 128-bit but the source is 256-bit wide.
   1097     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
   1098                      MVT::v4f32, MVT::v2f64 }) {
   1099       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1100     }
   1101 
   1102     // Custom lower several nodes for 256-bit types.
   1103     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
   1104                     MVT::v8f32, MVT::v4f64 }) {
   1105       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1106       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1107       setOperationAction(ISD::VSELECT,            VT, Custom);
   1108       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1109       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1110       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
   1111       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
   1112       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
   1113     }
   1114 
   1115     if (HasInt256)
   1116       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1117 
   1118     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
   1119     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
   1120       setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
   1121       setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
   1122       setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
   1123       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
   1124       setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
   1125     }
   1126   }
   1127 
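           // AVX-512 foundation: 512-bit vectors in the ZMM (VR512) register class,
           // plus i1 mask vectors carried in the k-registers (VK1/VK8/VK16).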
   1128   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
   1129     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
   1130     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
   1131     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
   1132     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
   1133 
   1134     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
   1135     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
   1136     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
   1137 
   1138     for (MVT VT : MVT::fp_vector_valuetypes())
   1139       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
   1140 
   1141     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
   1142       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
   1143       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
   1144       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
   1145       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
   1146       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
   1147       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
   1148     }
   1149     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
   1150     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
   1151     setOperationAction(ISD::SETCCE,             MVT::i1,    Custom);
   1152     setOperationAction(ISD::SELECT_CC,          MVT::i1,    Expand);
   1153     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
   1154     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
   1155     setOperationAction(ISD::AND,                MVT::i1,    Legal);
   1156     setOperationAction(ISD::SUB,                MVT::i1,    Custom);
   1157     setOperationAction(ISD::ADD,                MVT::i1,    Custom);
   1158     setOperationAction(ISD::MUL,                MVT::i1,    Custom);
   1159 
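             // Extending loads from i1 mask vectors, and truncating stores to them,
             // are custom lowered.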
   1160     for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
   1161                    MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
   1162                    MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
   1163       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   1164       setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
   1165       setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
   1166       setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
   1167       setTruncStoreAction(VT, MaskVT, Custom);
   1168     }
   1169 
   1170     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
   1171       setOperationAction(ISD::FNEG,  VT, Custom);
   1172       setOperationAction(ISD::FABS,  VT, Custom);
   1173       setOperationAction(ISD::FMA,   VT, Legal);
   1174     }
   1175 
   1176     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
   1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
   1178     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
   1179     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
   1180     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
   1181     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
   1182     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
   1183     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
   1184     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
   1185     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
   1186     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
   1187     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
   1188     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
   1189     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
   1190     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
   1191     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
   1192 
   1193     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
   1194     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
   1195     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
   1196     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
   1197     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
    1198     if (Subtarget.hasVLX()) {
   1199       setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
   1200       setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
   1201       setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
   1202       setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
   1203       setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
   1204 
   1205       setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
   1206       setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
   1207       setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
   1208       setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
   1209       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
   1210     } else {
   1211       setOperationAction(ISD::MLOAD,    MVT::v8i32, Custom);
   1212       setOperationAction(ISD::MLOAD,    MVT::v8f32, Custom);
   1213       setOperationAction(ISD::MSTORE,   MVT::v8i32, Custom);
   1214       setOperationAction(ISD::MSTORE,   MVT::v8f32, Custom);
   1215     }
   1216     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
   1217     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
   1218     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
   1219     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1,  Custom);
   1220     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v16i1, Custom);
   1221     setOperationAction(ISD::VSELECT,            MVT::v8i1,  Expand);
   1222     setOperationAction(ISD::VSELECT,            MVT::v16i1, Expand);
   1223     if (Subtarget.hasDQI()) {
   1224       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i64, Legal);
   1225       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i64, Legal);
   1226       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i64, Legal);
   1227       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i64, Legal);
   1228       if (Subtarget.hasVLX()) {
   1229         setOperationAction(ISD::SINT_TO_FP,    MVT::v4i64, Legal);
   1230         setOperationAction(ISD::SINT_TO_FP,    MVT::v2i64, Legal);
   1231         setOperationAction(ISD::UINT_TO_FP,    MVT::v4i64, Legal);
   1232         setOperationAction(ISD::UINT_TO_FP,    MVT::v2i64, Legal);
   1233         setOperationAction(ISD::FP_TO_SINT,    MVT::v4i64, Legal);
   1234         setOperationAction(ISD::FP_TO_SINT,    MVT::v2i64, Legal);
   1235         setOperationAction(ISD::FP_TO_UINT,    MVT::v4i64, Legal);
   1236         setOperationAction(ISD::FP_TO_UINT,    MVT::v2i64, Legal);
   1237       }
   1238     }
   1239     if (Subtarget.hasVLX()) {
   1240       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
   1241       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
   1242       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
   1243       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
   1244       setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
   1245       setOperationAction(ISD::UINT_TO_FP,       MVT::v4i32, Legal);
   1246       setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
   1247       setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
   1248       setOperationAction(ISD::ZERO_EXTEND,      MVT::v4i32, Custom);
   1249       setOperationAction(ISD::ZERO_EXTEND,      MVT::v2i64, Custom);
   1250 
    1251       // FIXME: These instructions are available with SSE/AVX2; add the relevant patterns.
   1252       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
   1253       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
   1254       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
   1255       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
   1256       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
   1257       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
   1258       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
   1259       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
   1260       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
   1261       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
   1262     }
   1263 
   1264     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
   1265     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
   1266     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
   1267     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
   1268     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
   1269     setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
   1270     setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
   1271     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
   1272     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
   1273     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
   1274     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
   1275     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
   1276     if (Subtarget.hasDQI()) {
   1277       setOperationAction(ISD::SIGN_EXTEND,        MVT::v4i32, Custom);
   1278       setOperationAction(ISD::SIGN_EXTEND,        MVT::v2i64, Custom);
   1279     }
   1280     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
   1281       setOperationAction(ISD::FFLOOR,     VT, Legal);
   1282       setOperationAction(ISD::FCEIL,      VT, Legal);
   1283       setOperationAction(ISD::FTRUNC,     VT, Legal);
   1284       setOperationAction(ISD::FRINT,      VT, Legal);
   1285       setOperationAction(ISD::FNEARBYINT, VT, Legal);
   1286     }
   1287 
   1288     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
   1289     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
   1290     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
   1291     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
   1292     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
   1293 
   1294     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
   1295     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
   1296 
   1297     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
   1298 
   1299     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
   1300     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
   1301     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
   1302     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
   1303     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
   1304     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
   1305     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
   1306     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
   1307     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
   1308     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
   1309     setOperationAction(ISD::SELECT,             MVT::v16i1, Custom);
   1310     setOperationAction(ISD::SELECT,             MVT::v8i1,  Custom);
   1311 
   1312     setOperationAction(ISD::SMAX,               MVT::v16i32, Legal);
   1313     setOperationAction(ISD::SMAX,               MVT::v8i64, Legal);
   1314     setOperationAction(ISD::UMAX,               MVT::v16i32, Legal);
   1315     setOperationAction(ISD::UMAX,               MVT::v8i64, Legal);
   1316     setOperationAction(ISD::SMIN,               MVT::v16i32, Legal);
   1317     setOperationAction(ISD::SMIN,               MVT::v8i64, Legal);
   1318     setOperationAction(ISD::UMIN,               MVT::v16i32, Legal);
   1319     setOperationAction(ISD::UMIN,               MVT::v8i64, Legal);
   1320 
   1321     setOperationAction(ISD::ADD,                MVT::v8i1,  Expand);
   1322     setOperationAction(ISD::ADD,                MVT::v16i1, Expand);
   1323     setOperationAction(ISD::SUB,                MVT::v8i1,  Expand);
   1324     setOperationAction(ISD::SUB,                MVT::v16i1, Expand);
   1325     setOperationAction(ISD::MUL,                MVT::v8i1,  Expand);
   1326     setOperationAction(ISD::MUL,                MVT::v16i1, Expand);
   1327 
   1328     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
   1329 
   1330     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
   1331       setOperationAction(ISD::SRL, VT, Custom);
   1332       setOperationAction(ISD::SHL, VT, Custom);
   1333       setOperationAction(ISD::SRA, VT, Custom);
   1334       setOperationAction(ISD::AND, VT, Legal);
   1335       setOperationAction(ISD::OR,  VT, Legal);
   1336       setOperationAction(ISD::XOR, VT, Legal);
   1337       setOperationAction(ISD::CTPOP, VT, Custom);
   1338       setOperationAction(ISD::CTTZ, VT, Custom);
   1339     }
   1340 
   1341     if (Subtarget.hasCDI()) {
   1342       setOperationAction(ISD::CTLZ,             MVT::v8i64,  Legal);
   1343       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
   1344 
   1345       setOperationAction(ISD::CTLZ,             MVT::v8i16,  Custom);
   1346       setOperationAction(ISD::CTLZ,             MVT::v16i8,  Custom);
   1347       setOperationAction(ISD::CTLZ,             MVT::v16i16, Custom);
   1348       setOperationAction(ISD::CTLZ,             MVT::v32i8,  Custom);
   1349 
   1350       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i64,  Custom);
   1351       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v16i32, Custom);
   1352 
   1353       if (Subtarget.hasVLX()) {
   1354         setOperationAction(ISD::CTLZ,             MVT::v4i64, Legal);
   1355         setOperationAction(ISD::CTLZ,             MVT::v8i32, Legal);
   1356         setOperationAction(ISD::CTLZ,             MVT::v2i64, Legal);
   1357         setOperationAction(ISD::CTLZ,             MVT::v4i32, Legal);
   1358       } else {
   1359         setOperationAction(ISD::CTLZ,             MVT::v4i64, Custom);
   1360         setOperationAction(ISD::CTLZ,             MVT::v8i32, Custom);
   1361         setOperationAction(ISD::CTLZ,             MVT::v2i64, Custom);
   1362         setOperationAction(ISD::CTLZ,             MVT::v4i32, Custom);
   1363       }
   1364 
   1365       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i64, Custom);
   1366       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i32, Custom);
   1367       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v2i64, Custom);
   1368       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i32, Custom);
   1369     } // Subtarget.hasCDI()
   1370 
   1371     if (Subtarget.hasDQI()) {
   1372       if (Subtarget.hasVLX()) {
   1373         setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
   1374         setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
   1375       }
   1376       setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
   1377     }
   1378     // Custom lower several nodes.
   1379     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
   1380                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
   1381       setOperationAction(ISD::MGATHER,  VT, Custom);
   1382       setOperationAction(ISD::MSCATTER, VT, Custom);
   1383     }
   1384     // Extract subvector is special because the value type
   1385     // (result) is 256-bit but the source is 512-bit wide.
   1386     // 128-bit was made Custom under AVX1.
   1387     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
   1388                      MVT::v8f32, MVT::v4f64 })
   1389       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1390     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
   1391                      MVT::v16i1, MVT::v32i1, MVT::v64i1 })
   1392       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
   1393 
   1394     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
   1395       setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
   1396       setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
   1397       setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1398       setOperationAction(ISD::VSELECT,             VT, Legal);
   1399       setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
   1400       setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
   1401       setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
   1402       setOperationAction(ISD::MLOAD,               VT, Legal);
   1403       setOperationAction(ISD::MSTORE,              VT, Legal);
   1404       setOperationAction(ISD::MGATHER,             VT, Legal);
   1405       setOperationAction(ISD::MSCATTER,            VT, Custom);
   1406     }
   1407     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
   1408       setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
   1409     }
    1410   } // has AVX-512
   1411 
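           // AVX-512 BW adds byte/word element support: v64i8/v32i16 vectors and the
           // v32i1/v64i1 mask types.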
   1412   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
   1413     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
   1414     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
   1415 
   1416     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
   1417     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
   1418 
   1419     setOperationAction(ISD::ADD,                MVT::v32i1, Expand);
   1420     setOperationAction(ISD::ADD,                MVT::v64i1, Expand);
   1421     setOperationAction(ISD::SUB,                MVT::v32i1, Expand);
   1422     setOperationAction(ISD::SUB,                MVT::v64i1, Expand);
   1423     setOperationAction(ISD::MUL,                MVT::v32i1, Expand);
   1424     setOperationAction(ISD::MUL,                MVT::v64i1, Expand);
   1425 
   1426     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
   1427     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
   1428     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
   1429     setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
   1430     setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
   1431     setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
   1432     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
   1433     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
   1434     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
   1435     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
   1436     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
   1437     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
   1438     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Custom);
   1439     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Custom);
   1440     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
   1441     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
   1442     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
   1443     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
   1444     setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
   1445     setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
   1446     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
   1447     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
   1448     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
   1449     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
   1450     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
   1451     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
   1452     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
   1453     setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
   1454     setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
   1455     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
   1456     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
   1457     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
   1458     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
   1459     setOperationAction(ISD::VSELECT,            MVT::v32i16, Legal);
   1460     setOperationAction(ISD::VSELECT,            MVT::v64i8, Legal);
   1461     setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
   1462     setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
   1463     setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
   1464     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
   1465     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
   1466     setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
   1467     setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
   1468     setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
   1469     setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
   1470     setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
   1471 
   1472     setOperationAction(ISD::SMAX,               MVT::v64i8, Legal);
   1473     setOperationAction(ISD::SMAX,               MVT::v32i16, Legal);
   1474     setOperationAction(ISD::UMAX,               MVT::v64i8, Legal);
   1475     setOperationAction(ISD::UMAX,               MVT::v32i16, Legal);
   1476     setOperationAction(ISD::SMIN,               MVT::v64i8, Legal);
   1477     setOperationAction(ISD::SMIN,               MVT::v32i16, Legal);
   1478     setOperationAction(ISD::UMIN,               MVT::v64i8, Legal);
   1479     setOperationAction(ISD::UMIN,               MVT::v32i16, Legal);
   1480 
   1481     setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
   1482     setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
   1483     if (Subtarget.hasVLX())
   1484       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
   1485 
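             // 128/256-bit masked byte/word loads and stores are only legal with VLX;
             // without it they are custom lowered.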
   1486     LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
   1487     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
   1488       setOperationAction(ISD::MLOAD,               VT, Action);
   1489       setOperationAction(ISD::MSTORE,              VT, Action);
   1490     }
   1491 
   1492     if (Subtarget.hasCDI()) {
   1493       setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
   1494       setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
   1495     }
   1496 
   1497     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
   1498       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
   1499       setOperationAction(ISD::VSELECT,      VT, Legal);
   1500       setOperationAction(ISD::SRL,          VT, Custom);
   1501       setOperationAction(ISD::SHL,          VT, Custom);
   1502       setOperationAction(ISD::SRA,          VT, Custom);
   1503       setOperationAction(ISD::MLOAD,        VT, Legal);
   1504       setOperationAction(ISD::MSTORE,       VT, Legal);
   1505       setOperationAction(ISD::CTPOP,        VT, Custom);
   1506       setOperationAction(ISD::CTTZ,         VT, Custom);
   1507 
   1508       setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
   1509       setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
   1510       setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
   1511     }
   1512 
   1513     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
   1514       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
   1515       if (Subtarget.hasVLX()) {
    1516         // FIXME: These instructions are available with SSE/AVX2; add the relevant patterns.
   1517         setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
   1518         setLoadExtAction(ExtType, MVT::v8i16,  MVT::v8i8,  Legal);
   1519       }
   1520     }
   1521   }
   1522 
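           // AVX-512 VL adds EVEX-encoded 128/256-bit operations and the small
           // v2i1/v4i1 mask types.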
   1523   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
   1524     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
   1525     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
   1526 
   1527     setOperationAction(ISD::ADD,                MVT::v2i1, Expand);
   1528     setOperationAction(ISD::ADD,                MVT::v4i1, Expand);
   1529     setOperationAction(ISD::SUB,                MVT::v2i1, Expand);
   1530     setOperationAction(ISD::SUB,                MVT::v4i1, Expand);
   1531     setOperationAction(ISD::MUL,                MVT::v2i1, Expand);
   1532     setOperationAction(ISD::MUL,                MVT::v4i1, Expand);
   1533 
   1534     setOperationAction(ISD::TRUNCATE,           MVT::v2i1, Custom);
   1535     setOperationAction(ISD::TRUNCATE,           MVT::v4i1, Custom);
   1536     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
   1537     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
   1538     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
   1539     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
   1540     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
   1541     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
   1542     setOperationAction(ISD::SELECT,             MVT::v4i1, Custom);
   1543     setOperationAction(ISD::SELECT,             MVT::v2i1, Custom);
   1544     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i1, Custom);
   1545     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i1, Custom);
   1546     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i1, Custom);
   1547     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i1, Custom);
   1548     setOperationAction(ISD::VSELECT,            MVT::v2i1, Expand);
   1549     setOperationAction(ISD::VSELECT,            MVT::v4i1, Expand);
   1550 
   1551     for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
   1552       setOperationAction(ISD::AND, VT, Legal);
   1553       setOperationAction(ISD::OR,  VT, Legal);
   1554       setOperationAction(ISD::XOR, VT, Legal);
   1555     }
   1556 
   1557     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
   1558       setOperationAction(ISD::SMAX, VT, Legal);
   1559       setOperationAction(ISD::UMAX, VT, Legal);
   1560       setOperationAction(ISD::SMIN, VT, Legal);
   1561       setOperationAction(ISD::UMIN, VT, Legal);
   1562     }
   1563   }
   1564 
   1565   // We want to custom lower some of our intrinsics.
   1566   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1567   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   1568   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   1569   if (!Subtarget.is64Bit()) {
   1570     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
   1571     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   1572   }
   1573 
   1574   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1575   // handle type legalization for these operations here.
   1576   //
   1577   // FIXME: We really should do custom legalization for addition and
   1578   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1579   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1580   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
   1581     if (VT == MVT::i64 && !Subtarget.is64Bit())
   1582       continue;
   1583     // Add/Sub/Mul with overflow operations are custom lowered.
   1584     setOperationAction(ISD::SADDO, VT, Custom);
   1585     setOperationAction(ISD::UADDO, VT, Custom);
   1586     setOperationAction(ISD::SSUBO, VT, Custom);
   1587     setOperationAction(ISD::USUBO, VT, Custom);
   1588     setOperationAction(ISD::SMULO, VT, Custom);
   1589     setOperationAction(ISD::UMULO, VT, Custom);
   1590   }
   1591 
   1592   if (!Subtarget.is64Bit()) {
    1593     // These libcalls are not available on 32-bit targets.
   1594     setLibcallName(RTLIB::SHL_I128, nullptr);
   1595     setLibcallName(RTLIB::SRL_I128, nullptr);
   1596     setLibcallName(RTLIB::SRA_I128, nullptr);
   1597   }
   1598 
   1599   // Combine sin / cos into one node or libcall if possible.
   1600   if (Subtarget.hasSinCos()) {
   1601     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
   1602     setLibcallName(RTLIB::SINCOS_F64, "sincos");
   1603     if (Subtarget.isTargetDarwin()) {
   1604       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
   1605       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
   1606       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
   1607       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   1608     }
   1609   }
   1610 
   1611   if (Subtarget.isTargetWin64()) {
   1612     setOperationAction(ISD::SDIV, MVT::i128, Custom);
   1613     setOperationAction(ISD::UDIV, MVT::i128, Custom);
   1614     setOperationAction(ISD::SREM, MVT::i128, Custom);
   1615     setOperationAction(ISD::UREM, MVT::i128, Custom);
   1616     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
   1617     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
   1618   }
   1619 
    1620   // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
   1621   // is. We should promote the value to 64-bits to solve this.
   1622   // This is what the CRT headers do - `fmodf` is an inline header
   1623   // function casting to f64 and calling `fmod`.
   1624   if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
   1625     for (ISD::NodeType Op :
   1626          {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
   1627           ISD::FLOG10, ISD::FPOW, ISD::FSIN})
   1628       if (isOperationExpand(Op, MVT::f32))
   1629         setOperationAction(Op, MVT::f32, Promote);
   1630 
   1631   // We have target-specific dag combine patterns for the following nodes:
   1632   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1633   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1634   setTargetDAGCombine(ISD::BITCAST);
   1635   setTargetDAGCombine(ISD::VSELECT);
   1636   setTargetDAGCombine(ISD::SELECT);
   1637   setTargetDAGCombine(ISD::SHL);
   1638   setTargetDAGCombine(ISD::SRA);
   1639   setTargetDAGCombine(ISD::SRL);
   1640   setTargetDAGCombine(ISD::OR);
   1641   setTargetDAGCombine(ISD::AND);
   1642   setTargetDAGCombine(ISD::ADD);
   1643   setTargetDAGCombine(ISD::FADD);
   1644   setTargetDAGCombine(ISD::FSUB);
   1645   setTargetDAGCombine(ISD::FNEG);
   1646   setTargetDAGCombine(ISD::FMA);
   1647   setTargetDAGCombine(ISD::FMINNUM);
   1648   setTargetDAGCombine(ISD::FMAXNUM);
   1649   setTargetDAGCombine(ISD::SUB);
   1650   setTargetDAGCombine(ISD::LOAD);
   1651   setTargetDAGCombine(ISD::MLOAD);
   1652   setTargetDAGCombine(ISD::STORE);
   1653   setTargetDAGCombine(ISD::MSTORE);
   1654   setTargetDAGCombine(ISD::TRUNCATE);
   1655   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1656   setTargetDAGCombine(ISD::ANY_EXTEND);
   1657   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1658   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   1659   setTargetDAGCombine(ISD::SINT_TO_FP);
   1660   setTargetDAGCombine(ISD::UINT_TO_FP);
   1661   setTargetDAGCombine(ISD::SETCC);
   1662   setTargetDAGCombine(ISD::MUL);
   1663   setTargetDAGCombine(ISD::XOR);
   1664   setTargetDAGCombine(ISD::MSCATTER);
   1665   setTargetDAGCombine(ISD::MGATHER);
   1666 
   1667   computeRegisterProperties(Subtarget.getRegisterInfo());
   1668 
   1669   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1670   MaxStoresPerMemsetOptSize = 8;
   1671   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1672   MaxStoresPerMemcpyOptSize = 4;
   1673   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1674   MaxStoresPerMemmoveOptSize = 4;
   1675   setPrefLoopAlignment(4); // 2^4 bytes.
   1676 
   1677   // An out-of-order CPU can speculatively execute past a predictable branch,
   1678   // but a conditional move could be stalled by an expensive earlier operation.
   1679   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
   1680   EnableExtLdPromotion = true;
   1681   setPrefFunctionAlignment(4); // 2^4 bytes.
   1682 
   1683   verifyIntrinsicTables();
   1684 }
   1685 
   1686 // This has so far only been implemented for 64-bit MachO.
   1687 bool X86TargetLowering::useLoadStackGuardNode() const {
   1688   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
   1689 }
   1690 
   1691 TargetLoweringBase::LegalizeTypeAction
   1692 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
   1693   if (ExperimentalVectorWideningLegalization &&
   1694       VT.getVectorNumElements() != 1 &&
   1695       VT.getVectorElementType().getSimpleVT() != MVT::i1)
   1696     return TypeWidenVector;
   1697 
   1698   return TargetLoweringBase::getPreferredVectorAction(VT);
   1699 }
   1700 
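         // Scalar compares produce i1 with AVX-512 (mask registers) and i8 otherwise.
         // Vector compares prefer an i1 mask vector when the subtarget can hold one in
         // a k-register, and otherwise fall back to an integer vector matching the
         // operand type.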
   1701 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
   1702                                           LLVMContext& Context,
   1703                                           EVT VT) const {
   1704   if (!VT.isVector())
   1705     return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
   1706 
   1707   if (VT.isSimple()) {
   1708     MVT VVT = VT.getSimpleVT();
   1709     const unsigned NumElts = VVT.getVectorNumElements();
   1710     MVT EltVT = VVT.getVectorElementType();
   1711     if (VVT.is512BitVector()) {
   1712       if (Subtarget.hasAVX512())
   1713         if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
   1714             EltVT == MVT::f32 || EltVT == MVT::f64)
   1715           switch(NumElts) {
   1716           case  8: return MVT::v8i1;
   1717           case 16: return MVT::v16i1;
   1718         }
   1719       if (Subtarget.hasBWI())
   1720         if (EltVT == MVT::i8 || EltVT == MVT::i16)
   1721           switch(NumElts) {
   1722           case 32: return MVT::v32i1;
   1723           case 64: return MVT::v64i1;
   1724         }
   1725     }
   1726 
   1727     if (Subtarget.hasBWI() && Subtarget.hasVLX())
   1728       return MVT::getVectorVT(MVT::i1, NumElts);
   1729 
   1730     if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
   1731       EVT LegalVT = getTypeToTransformTo(Context, VT);
   1732       EltVT = LegalVT.getVectorElementType().getSimpleVT();
   1733     }
   1734 
   1735     if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
   1736       switch(NumElts) {
   1737       case 2: return MVT::v2i1;
   1738       case 4: return MVT::v4i1;
   1739       case 8: return MVT::v8i1;
   1740       }
   1741   }
   1742 
   1743   return VT.changeVectorElementTypeToInteger();
   1744 }
   1745 
   1746 /// Helper for getByValTypeAlignment to determine
   1747 /// the desired ByVal argument alignment.
   1748 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1749   if (MaxAlign == 16)
   1750     return;
   1751   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1752     if (VTy->getBitWidth() == 128)
   1753       MaxAlign = 16;
   1754   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1755     unsigned EltAlign = 0;
   1756     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1757     if (EltAlign > MaxAlign)
   1758       MaxAlign = EltAlign;
   1759   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1760     for (auto *EltTy : STy->elements()) {
   1761       unsigned EltAlign = 0;
   1762       getMaxByValAlign(EltTy, EltAlign);
   1763       if (EltAlign > MaxAlign)
   1764         MaxAlign = EltAlign;
   1765       if (MaxAlign == 16)
   1766         break;
   1767     }
   1768   }
   1769 }
   1770 
   1771 /// Return the desired alignment for ByVal aggregate
   1772 /// function arguments in the caller parameter area. For X86, aggregates
   1773 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1774 /// are at 4-byte boundaries.
   1775 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
   1776                                                   const DataLayout &DL) const {
   1777   if (Subtarget.is64Bit()) {
   1778     // Max of 8 and alignment of type.
   1779     unsigned TyAlign = DL.getABITypeAlignment(Ty);
   1780     if (TyAlign > 8)
   1781       return TyAlign;
   1782     return 8;
   1783   }
   1784 
   1785   unsigned Align = 4;
   1786   if (Subtarget.hasSSE1())
   1787     getMaxByValAlign(Ty, Align);
   1788   return Align;
   1789 }
   1790 
    1791 /// Returns the target-specific optimal type for load
    1792 /// and store operations as a result of memset, memcpy, and memmove
    1793 /// lowering. If DstAlign is zero, the destination can satisfy any alignment
    1794 /// constraint; similarly, if SrcAlign is zero there is no need to check it
    1795 /// against the alignment requirement,
   1796 /// probably because the source does not need to be loaded. If 'IsMemset' is
   1797 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
   1798 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
   1799 /// source is constant so it does not need to be loaded.
   1800 /// It returns EVT::Other if the type should be determined using generic
   1801 /// target-independent logic.
   1802 EVT
   1803 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1804                                        unsigned DstAlign, unsigned SrcAlign,
   1805                                        bool IsMemset, bool ZeroMemset,
   1806                                        bool MemcpyStrSrc,
   1807                                        MachineFunction &MF) const {
   1808   const Function *F = MF.getFunction();
   1809   if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
   1810     if (Size >= 16 &&
   1811         (!Subtarget.isUnalignedMem16Slow() ||
   1812          ((DstAlign == 0 || DstAlign >= 16) &&
   1813           (SrcAlign == 0 || SrcAlign >= 16)))) {
   1814       // FIXME: Check if unaligned 32-byte accesses are slow.
   1815       if (Size >= 32 && Subtarget.hasAVX()) {
   1816         // Although this isn't a well-supported type for AVX1, we'll let
   1817         // legalization and shuffle lowering produce the optimal codegen. If we
   1818         // choose an optimal type with a vector element larger than a byte,
   1819         // getMemsetStores() may create an intermediate splat (using an integer
   1820         // multiply) before we splat as a vector.
   1821         return MVT::v32i8;
   1822       }
   1823       if (Subtarget.hasSSE2())
   1824         return MVT::v16i8;
   1825       // TODO: Can SSE1 handle a byte vector?
   1826       if (Subtarget.hasSSE1())
   1827         return MVT::v4f32;
   1828     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
   1829                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
   1830       // Do not use f64 to lower memcpy if source is string constant. It's
   1831       // better to use i32 to avoid the loads.
   1832       // Also, do not use f64 to lower memset unless this is a memset of zeros.
   1833       // The gymnastics of splatting a byte value into an XMM register and then
   1834       // only using 8-byte stores (because this is a CPU with slow unaligned
   1835       // 16-byte accesses) makes that a loser.
   1836       return MVT::f64;
   1837     }
   1838   }
   1839   // This is a compromise. If we reach here, unaligned accesses may be slow on
   1840   // this target. However, creating smaller, aligned accesses could be even
   1841   // slower and would certainly be a lot more code.
   1842   if (Subtarget.is64Bit() && Size >= 8)
   1843     return MVT::i64;
   1844   return MVT::i32;
   1845 }
   1846 
   1847 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   1848   if (VT == MVT::f32)
   1849     return X86ScalarSSEf32;
   1850   else if (VT == MVT::f64)
   1851     return X86ScalarSSEf64;
   1852   return true;
   1853 }
   1854 
   1855 bool
   1856 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   1857                                                   unsigned,
   1858                                                   unsigned,
   1859                                                   bool *Fast) const {
   1860   if (Fast) {
   1861     switch (VT.getSizeInBits()) {
   1862     default:
   1863       // 8-byte and under are always assumed to be fast.
   1864       *Fast = true;
   1865       break;
   1866     case 128:
   1867       *Fast = !Subtarget.isUnalignedMem16Slow();
   1868       break;
   1869     case 256:
   1870       *Fast = !Subtarget.isUnalignedMem32Slow();
   1871       break;
   1872     // TODO: What about AVX-512 (512-bit) accesses?
   1873     }
   1874   }
   1875   // Misaligned accesses of any size are always allowed.
   1876   return true;
   1877 }
   1878 
   1879 /// Return the entry encoding for a jump table in the
   1880 /// current function.  The returned value is a member of the
   1881 /// MachineJumpTableInfo::JTEntryKind enum.
   1882 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1883   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1884   // symbol.
   1885   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
   1886     return MachineJumpTableInfo::EK_Custom32;
   1887 
   1888   // Otherwise, use the normal jump table encoding heuristics.
   1889   return TargetLowering::getJumpTableEncoding();
   1890 }
   1891 
   1892 bool X86TargetLowering::useSoftFloat() const {
   1893   return Subtarget.useSoftFloat();
   1894 }
   1895 
   1896 const MCExpr *
   1897 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   1898                                              const MachineBasicBlock *MBB,
    1899                                              unsigned uid, MCContext &Ctx) const {
   1900   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
   1901   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   1902   // entries.
   1903   return MCSymbolRefExpr::create(MBB->getSymbol(),
   1904                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   1905 }
   1906 
   1907 /// Returns relocation base for the given PIC jumptable.
   1908 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1909                                                     SelectionDAG &DAG) const {
   1910   if (!Subtarget.is64Bit())
   1911     // This doesn't have SDLoc associated with it, but is not really the
   1912     // same as a Register.
   1913     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
   1914                        getPointerTy(DAG.getDataLayout()));
   1915   return Table;
   1916 }
   1917 
   1918 /// This returns the relocation base for the given PIC jumptable,
   1919 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
   1920 const MCExpr *X86TargetLowering::
   1921 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   1922                              MCContext &Ctx) const {
   1923   // X86-64 uses RIP relative addressing based on the jump table label.
   1924   if (Subtarget.isPICStyleRIPRel())
   1925     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   1926 
   1927   // Otherwise, the reference is relative to the PIC base.
   1928   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
   1929 }
   1930 
   1931 std::pair<const TargetRegisterClass *, uint8_t>
   1932 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
   1933                                            MVT VT) const {
   1934   const TargetRegisterClass *RRC = nullptr;
   1935   uint8_t Cost = 1;
   1936   switch (VT.SimpleTy) {
   1937   default:
   1938     return TargetLowering::findRepresentativeClass(TRI, VT);
   1939   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   1940     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
   1941     break;
   1942   case MVT::x86mmx:
   1943     RRC = &X86::VR64RegClass;
   1944     break;
   1945   case MVT::f32: case MVT::f64:
   1946   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   1947   case MVT::v4f32: case MVT::v2f64:
   1948   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   1949   case MVT::v4f64:
   1950     RRC = &X86::VR128RegClass;
   1951     break;
   1952   }
   1953   return std::make_pair(RRC, Cost);
   1954 }
   1955 
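         // In the X86 backend, address space 256 selects %gs and 257 selects %fs.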
   1956 unsigned X86TargetLowering::getAddressSpace() const {
   1957   if (Subtarget.is64Bit())
   1958     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
   1959   return 256;
   1960 }
   1961 
   1962 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
   1963   // glibc has a special slot for the stack guard in tcbhead_t, use it instead
   1964   // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
   1965   if (!Subtarget.isTargetGlibc())
   1966     return TargetLowering::getIRStackGuard(IRB);
   1967 
    1968   // The slot is %fs:0x28 on x86-64 (%gs:0x28 under the Kernel code model) and
    1969   // %gs:0x14 on i386.
   1970   unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
   1971   unsigned AddressSpace = getAddressSpace();
   1972   return ConstantExpr::getIntToPtr(
   1973       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
   1974       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
   1975 }
   1976 
   1977 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
    1978   // The MSVC CRT provides functionality for stack protection.
   1979   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
   1980     // MSVC CRT has a global variable holding security cookie.
   1981     M.getOrInsertGlobal("__security_cookie",
   1982                         Type::getInt8PtrTy(M.getContext()));
   1983 
   1984     // MSVC CRT has a function to validate security cookie.
   1985     auto *SecurityCheckCookie = cast<Function>(
   1986         M.getOrInsertFunction("__security_check_cookie",
   1987                               Type::getVoidTy(M.getContext()),
   1988                               Type::getInt8PtrTy(M.getContext()), nullptr));
   1989     SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
   1990     SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
   1991     return;
   1992   }
   1993   // glibc has a special slot for the stack guard.
   1994   if (Subtarget.isTargetGlibc())
   1995     return;
   1996   TargetLowering::insertSSPDeclarations(M);
   1997 }
   1998 
   1999 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
   2000   // MSVC CRT has a global variable holding security cookie.
   2001   if (Subtarget.getTargetTriple().isOSMSVCRT())
   2002     return M.getGlobalVariable("__security_cookie");
   2003   return TargetLowering::getSDagStackGuard(M);
   2004 }
   2005 
   2006 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
   2007   // MSVC CRT has a function to validate security cookie.
   2008   if (Subtarget.getTargetTriple().isOSMSVCRT())
   2009     return M.getFunction("__security_check_cookie");
   2010   return TargetLowering::getSSPStackGuardCheck(M);
   2011 }
   2012 
   2013 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   2014   if (!Subtarget.isTargetAndroid())
   2015     return TargetLowering::getSafeStackPointerLocation(IRB);
   2016 
   2017   // Android provides a fixed TLS slot for the SafeStack pointer. See the
   2018   // definition of TLS_SLOT_SAFESTACK in
   2019   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
   2020   unsigned AddressSpace, Offset;
   2021 
    2022   // The slot is %fs:0x48 on x86-64 (%gs:0x48 under the Kernel code model) and
    2023   // %gs:0x24 on i386.
   2024   Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
   2025   AddressSpace = getAddressSpace();
   2026   return ConstantExpr::getIntToPtr(
   2027       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
   2028       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
   2029 }
   2030 
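         // Address spaces 256 and above select a segment register; only casts between
         // the ordinary flat address spaces below that are no-ops.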
   2031 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
   2032                                             unsigned DestAS) const {
   2033   assert(SrcAS != DestAS && "Expected different address spaces!");
   2034 
   2035   return SrcAS < 256 && DestAS < 256;
   2036 }
   2037 
   2038 //===----------------------------------------------------------------------===//
   2039 //               Return Value Calling Convention Implementation
   2040 //===----------------------------------------------------------------------===//
   2041 
   2042 #include "X86GenCallingConv.inc"
   2043 
   2044 bool X86TargetLowering::CanLowerReturn(
   2045     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
   2046     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
   2047   SmallVector<CCValAssign, 16> RVLocs;
   2048   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   2049   return CCInfo.CheckReturn(Outs, RetCC_X86);
   2050 }
   2051 
   2052 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
   2053   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   2054   return ScratchRegs;
   2055 }
   2056 
   2057 SDValue
   2058 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   2059                                bool isVarArg,
   2060                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   2061                                const SmallVectorImpl<SDValue> &OutVals,
   2062                                const SDLoc &dl, SelectionDAG &DAG) const {
   2063   MachineFunction &MF = DAG.getMachineFunction();
   2064   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2065 
   2066   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
   2067     report_fatal_error("X86 interrupts may not return any value");
   2068 
   2069   SmallVector<CCValAssign, 16> RVLocs;
   2070   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
   2071   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   2072 
   2073   SDValue Flag;
   2074   SmallVector<SDValue, 6> RetOps;
   2075   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   2076   // Operand #1 = Bytes To Pop
   2077   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
   2078                    MVT::i32));
   2079 
   2080   // Copy the result values into the output registers.
   2081   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2082     CCValAssign &VA = RVLocs[i];
   2083     assert(VA.isRegLoc() && "Can only return in registers!");
   2084     SDValue ValToCopy = OutVals[i];
   2085     EVT ValVT = ValToCopy.getValueType();
   2086 
   2087     // Promote values to the appropriate types.
   2088     if (VA.getLocInfo() == CCValAssign::SExt)
   2089       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2090     else if (VA.getLocInfo() == CCValAssign::ZExt)
   2091       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2092     else if (VA.getLocInfo() == CCValAssign::AExt) {
   2093       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
   2094         ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2095       else
   2096         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2097     }
   2098     else if (VA.getLocInfo() == CCValAssign::BCvt)
   2099       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
   2100 
   2101     assert(VA.getLocInfo() != CCValAssign::FPExt &&
   2102            "Unexpected FP-extend for return value.");
   2103 
   2104     // If this is x86-64, and we disabled SSE, we can't return FP values,
   2105     // or SSE or MMX vectors.
   2106     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   2107          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   2108           (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
   2109       report_fatal_error("SSE register return with SSE disabled");
   2110     }
   2111     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   2112     // llvm-gcc has never done it right and no one has noticed, so this
   2113     // should be OK for now.
   2114     if (ValVT == MVT::f64 &&
   2115         (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
   2116       report_fatal_error("SSE2 register return with SSE2 disabled");
   2117 
   2118     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   2119     // the RET instruction and handled by the FP Stackifier.
   2120     if (VA.getLocReg() == X86::FP0 ||
   2121         VA.getLocReg() == X86::FP1) {
   2122       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   2123       // change the value to the FP stack register class.
   2124       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   2125         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   2126       RetOps.push_back(ValToCopy);
   2127       // Don't emit a copytoreg.
   2128       continue;
   2129     }
   2130 
   2131     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   2132     // which is returned in RAX / RDX.
   2133     if (Subtarget.is64Bit()) {
   2134       if (ValVT == MVT::x86mmx) {
   2135         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   2136           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
   2137           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   2138                                   ValToCopy);
   2139           // If we don't have SSE2 available, convert to v4f32 so the generated
   2140           // register is legal.
   2141           if (!Subtarget.hasSSE2())
   2142             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
   2143         }
   2144       }
   2145     }
   2146 
   2147     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   2148     Flag = Chain.getValue(1);
   2149     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   2150   }
   2151 
    2152   // The Swift calling convention does not require that we copy the sret
    2153   // argument into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
   2154 
   2155   // All x86 ABIs require that for returning structs by value we copy
   2156   // the sret argument into %rax/%eax (depending on ABI) for the return.
   2157   // We saved the argument into a virtual register in the entry block,
   2158   // so now we copy the value out and into %rax/%eax.
   2159   //
   2160   // Checking Function.hasStructRetAttr() here is insufficient because the IR
   2161   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
   2162   // false, then an sret argument may be implicitly inserted in the SelDAG. In
   2163   // either case FuncInfo->setSRetReturnReg() will have been called.
   2164   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
   2165     // When we have both sret and another return value, we should use the
   2166     // original Chain stored in RetOps[0], instead of the current Chain updated
    2167     // in the above loop. If we only have sret, RetOps[0] equals Chain.
   2168 
   2169     // For the case of sret and another return value, we have
   2170     //   Chain_0 at the function entry
   2171     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
   2172     // If we use Chain_1 in getCopyFromReg, we will have
   2173     //   Val = getCopyFromReg(Chain_1)
   2174     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
   2175 
   2176     // getCopyToReg(Chain_0) will be glued together with
   2177     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
   2178     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
   2179     //   Data dependency from Unit B to Unit A due to usage of Val in
   2180     //     getCopyToReg(Chain_1, Val)
   2181     //   Chain dependency from Unit A to Unit B
   2182 
    2183     // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
   2184     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
   2185                                      getPointerTy(MF.getDataLayout()));
   2186 
   2187     unsigned RetValReg
   2188         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
   2189           X86::RAX : X86::EAX;
   2190     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
   2191     Flag = Chain.getValue(1);
   2192 
   2193     // RAX/EAX now acts like a return value.
   2194     RetOps.push_back(
   2195         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
   2196   }
   2197 
   2198   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
   2199   const MCPhysReg *I =
   2200       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
   2201   if (I) {
   2202     for (; *I; ++I) {
   2203       if (X86::GR64RegClass.contains(*I))
   2204         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
   2205       else
   2206         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
   2207     }
   2208   }
   2209 
   2210   RetOps[0] = Chain;  // Update chain.
   2211 
   2212   // Add the flag if we have it.
   2213   if (Flag.getNode())
   2214     RetOps.push_back(Flag);
   2215 
   2216   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
   2217   if (CallConv == CallingConv::X86_INTR)
   2218     opcode = X86ISD::IRET;
   2219   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
   2220 }
   2221 
   2222 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   2223   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
   2224     return false;
   2225 
   2226   SDValue TCChain = Chain;
   2227   SDNode *Copy = *N->use_begin();
   2228   if (Copy->getOpcode() == ISD::CopyToReg) {
   2229     // If the copy has a glue operand, we conservatively assume it isn't safe to
   2230     // perform a tail call.
   2231     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   2232       return false;
   2233     TCChain = Copy->getOperand(0);
   2234   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   2235     return false;
   2236 
   2237   bool HasRet = false;
   2238   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   2239        UI != UE; ++UI) {
   2240     if (UI->getOpcode() != X86ISD::RET_FLAG)
   2241       return false;
    2242     // If we are returning more than one value, we can definitely
    2243     // not make a tail call; see PR19530.
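             // (A RET_FLAG produced by LowerReturn has the shape: chain,
             // bytes-to-pop, one register per returned value, optional glue;
             // more than 4 operands therefore implies more than one result
             // register.)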
   2244     if (UI->getNumOperands() > 4)
   2245       return false;
   2246     if (UI->getNumOperands() == 4 &&
   2247         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
   2248       return false;
   2249     HasRet = true;
   2250   }
   2251 
   2252   if (!HasRet)
   2253     return false;
   2254 
   2255   Chain = TCChain;
   2256   return true;
   2257 }
   2258 
   2259 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
   2260                                            ISD::NodeType ExtendKind) const {
   2261   MVT ReturnMVT = MVT::i32;
   2262 
   2263   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
   2264   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
   2265     // The ABI does not require i1, i8 or i16 to be extended.
   2266     //
   2267     // On Darwin, there is code in the wild relying on Clang's old behaviour of
   2268     // always extending i8/i16 return values, so keep doing that for now.
   2269     // (PR26665).
   2270     ReturnMVT = MVT::i8;
   2271   }
   2272 
   2273   EVT MinVT = getRegisterType(Context, ReturnMVT);
   2274   return VT.bitsLT(MinVT) ? MinVT : VT;
   2275 }
   2276 
    2277 /// Lower the result values of a call into the
    2278 /// appropriate copies out of the physical registers.
   2279 ///
   2280 SDValue X86TargetLowering::LowerCallResult(
   2281     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
   2282     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
   2283     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   2284 
   2285   // Assign locations to each value returned by this call.
   2286   SmallVector<CCValAssign, 16> RVLocs;
   2287   bool Is64Bit = Subtarget.is64Bit();
   2288   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
   2289                  *DAG.getContext());
   2290   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2291 
   2292   // Copy all of the result registers out of their specified physreg.
   2293   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2294     CCValAssign &VA = RVLocs[i];
   2295     EVT CopyVT = VA.getLocVT();
   2296 
   2297     // If this is x86-64, and we disabled SSE, we can't return FP values
   2298     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
   2299         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
   2300       report_fatal_error("SSE register return with SSE disabled");
   2301     }
   2302 
   2303     // If we prefer to use the value in xmm registers, copy it out as f80 and
   2304     // use a truncate to move it from fp stack reg to xmm reg.
   2305     bool RoundAfterCopy = false;
   2306     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
   2307         isScalarFPTypeInSSEReg(VA.getValVT())) {
   2308       if (!Subtarget.hasX87())
   2309         report_fatal_error("X87 register return with X87 disabled");
   2310       CopyVT = MVT::f80;
   2311       RoundAfterCopy = (CopyVT != VA.getLocVT());
   2312     }
   2313 
   2314     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   2315                                CopyVT, InFlag).getValue(1);
   2316     SDValue Val = Chain.getValue(0);
   2317 
   2318     if (RoundAfterCopy)
   2319       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   2320                         // This truncation won't change the value.
   2321                         DAG.getIntPtrConstant(1, dl));
   2322 
   2323     if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
   2324       Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
   2325 
   2326     InFlag = Chain.getValue(2);
   2327     InVals.push_back(Val);
   2328   }
   2329 
   2330   return Chain;
   2331 }
   2332 
   2333 //===----------------------------------------------------------------------===//
   2334 //                C & StdCall & Fast Calling Convention implementation
   2335 //===----------------------------------------------------------------------===//
    2336 //  The StdCall calling convention is the standard for many Windows API
    2337 //  routines. It differs from the C calling convention only slightly: the
    2338 //  callee cleans up the stack rather than the caller, and symbols are
    2339 //  decorated in a distinctive way. It does not support vector arguments.
    2340 //  For info on the fast calling convention, see the Fast Calling Convention
    2341 //  (tail call) implementation in LowerX86_32FastCCCallTo.
   2342 
    2343 /// The kinds of struct-return (sret) semantics a call or function can use;
    2344 /// see callIsStructReturn and argsAreStructReturn below.
   2345 enum StructReturnType {
   2346   NotStructReturn,
   2347   RegStructReturn,
   2348   StackStructReturn
   2349 };
   2350 static StructReturnType
   2351 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
   2352   if (Outs.empty())
   2353     return NotStructReturn;
   2354 
   2355   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   2356   if (!Flags.isSRet())
   2357     return NotStructReturn;
   2358   if (Flags.isInReg() || IsMCU)
   2359     return RegStructReturn;
   2360   return StackStructReturn;
   2361 }
   2362 
   2363 /// Determines whether a function uses struct return semantics.
   2364 static StructReturnType
   2365 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
   2366   if (Ins.empty())
   2367     return NotStructReturn;
   2368 
   2369   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   2370   if (!Flags.isSRet())
   2371     return NotStructReturn;
   2372   if (Flags.isInReg() || IsMCU)
   2373     return RegStructReturn;
   2374   return StackStructReturn;
   2375 }
   2376 
   2377 /// Make a copy of an aggregate at address specified by "Src" to address
   2378 /// "Dst" with size and alignment information specified by the specific
   2379 /// parameter attribute. The copy will be passed as a byval function parameter.
   2380 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
   2381                                          SDValue Chain, ISD::ArgFlagsTy Flags,
   2382                                          SelectionDAG &DAG, const SDLoc &dl) {
   2383   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
   2384 
   2385   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   2386                        /*isVolatile*/false, /*AlwaysInline=*/true,
   2387                        /*isTailCall*/false,
   2388                        MachinePointerInfo(), MachinePointerInfo());
   2389 }
   2390 
   2391 /// Return true if the calling convention is one that we can guarantee TCO for.
   2392 static bool canGuaranteeTCO(CallingConv::ID CC) {
   2393   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
   2394           CC == CallingConv::HiPE || CC == CallingConv::HHVM);
   2395 }
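         // Conventions in this list may have the function's ABI changed (see
         // shouldGuaranteeTCO and GetAlignedArgumentStackSize) so that tail calls
         // can always be emitted under -tailcallopt; other conventions rely on the
         // opportunistic sibcall path in LowerCall, which never changes the ABI.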
   2396 
   2397 /// Return true if we might ever do TCO for calls with this calling convention.
   2398 static bool mayTailCallThisCC(CallingConv::ID CC) {
   2399   switch (CC) {
   2400   // C calling conventions:
   2401   case CallingConv::C:
   2402   case CallingConv::X86_64_Win64:
   2403   case CallingConv::X86_64_SysV:
   2404   // Callee pop conventions:
   2405   case CallingConv::X86_ThisCall:
   2406   case CallingConv::X86_StdCall:
   2407   case CallingConv::X86_VectorCall:
   2408   case CallingConv::X86_FastCall:
   2409     return true;
   2410   default:
   2411     return canGuaranteeTCO(CC);
   2412   }
   2413 }
   2414 
   2415 /// Return true if the function is being made into a tailcall target by
   2416 /// changing its ABI.
   2417 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
   2418   return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
   2419 }
   2420 
   2421 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   2422   auto Attr =
   2423       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
   2424   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
   2425     return false;
   2426 
   2427   CallSite CS(CI);
   2428   CallingConv::ID CalleeCC = CS.getCallingConv();
   2429   if (!mayTailCallThisCC(CalleeCC))
   2430     return false;
   2431 
   2432   return true;
   2433 }
   2434 
   2435 SDValue
   2436 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
   2437                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2438                                     const SDLoc &dl, SelectionDAG &DAG,
   2439                                     const CCValAssign &VA,
   2440                                     MachineFrameInfo *MFI, unsigned i) const {
   2441   // Create the nodes corresponding to a load from this parameter slot.
   2442   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   2443   bool AlwaysUseMutable = shouldGuaranteeTCO(
   2444       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   2445   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   2446   EVT ValVT;
   2447 
    2448   // If the value is passed by pointer, we receive the address of the value
    2449   // instead of the value itself.
   2450   bool ExtendedInMem = VA.isExtInLoc() &&
   2451     VA.getValVT().getScalarType() == MVT::i1;
   2452 
   2453   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
   2454     ValVT = VA.getLocVT();
   2455   else
   2456     ValVT = VA.getValVT();
   2457 
    2458   // Calculate the SP offset for an interrupt parameter; it re-uses the slot
    2459   // normally taken by the return address.
   2460   int Offset = 0;
   2461   if (CallConv == CallingConv::X86_INTR) {
   2462     const X86Subtarget& Subtarget =
   2463         static_cast<const X86Subtarget&>(DAG.getSubtarget());
   2464     // X86 interrupts may take one or two arguments.
    2465     // Unlike a regular call, there is no return address on the stack.
    2466     // The offset of the last argument needs to be set to -4/-8 bytes, while
    2467     // the offset of the first of two arguments should be set to 0 bytes.
   2468     Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
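             // For example, a 64-bit handler with two arguments gets
             // Offset = 8 * ((0 + 1) % 2 - 1) = 0 for the first argument and
             // Offset = 8 * ((1 + 1) % 2 - 1) = -8 for the second (the error
             // code); a single-argument handler gets -8 (or -4 in 32-bit mode).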
   2469   }
   2470 
   2471   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   2472   // changed with more analysis.
    2473   // In case of tail call optimization, mark all arguments mutable, since they
    2474   // could be overwritten by the lowering of arguments in case of a tail call.
   2475   if (Flags.isByVal()) {
   2476     unsigned Bytes = Flags.getByValSize();
   2477     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   2478     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   2479     // Adjust SP offset of interrupt parameter.
   2480     if (CallConv == CallingConv::X86_INTR) {
   2481       MFI->setObjectOffset(FI, Offset);
   2482     }
   2483     return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
   2484   } else {
   2485     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   2486                                     VA.getLocMemOffset(), isImmutable);
   2487 
   2488     // Set SExt or ZExt flag.
   2489     if (VA.getLocInfo() == CCValAssign::ZExt) {
   2490       MFI->setObjectZExt(FI, true);
   2491     } else if (VA.getLocInfo() == CCValAssign::SExt) {
   2492       MFI->setObjectSExt(FI, true);
   2493     }
   2494 
   2495     // Adjust SP offset of interrupt parameter.
   2496     if (CallConv == CallingConv::X86_INTR) {
   2497       MFI->setObjectOffset(FI, Offset);
   2498     }
   2499 
   2500     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
   2501     SDValue Val = DAG.getLoad(
   2502         ValVT, dl, Chain, FIN,
   2503         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
   2504         false, false, 0);
   2505     return ExtendedInMem ?
   2506       DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
   2507   }
   2508 }
   2509 
   2510 // FIXME: Get this from tablegen.
   2511 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
   2512                                                 const X86Subtarget &Subtarget) {
   2513   assert(Subtarget.is64Bit());
   2514 
   2515   if (Subtarget.isCallingConvWin64(CallConv)) {
   2516     static const MCPhysReg GPR64ArgRegsWin64[] = {
   2517       X86::RCX, X86::RDX, X86::R8,  X86::R9
   2518     };
   2519     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
   2520   }
   2521 
   2522   static const MCPhysReg GPR64ArgRegs64Bit[] = {
   2523     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   2524   };
   2525   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
   2526 }
   2527 
   2528 // FIXME: Get this from tablegen.
   2529 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
   2530                                                 CallingConv::ID CallConv,
   2531                                                 const X86Subtarget &Subtarget) {
   2532   assert(Subtarget.is64Bit());
   2533   if (Subtarget.isCallingConvWin64(CallConv)) {
    2534     // The XMM registers which might contain vararg parameters are shadowed
    2535     // by their paired GPRs, so we only need to save the GPRs to their home
    2536     // slots.
   2537     // TODO: __vectorcall will change this.
   2538     return None;
   2539   }
   2540 
   2541   const Function *Fn = MF.getFunction();
   2542   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
   2543   bool isSoftFloat = Subtarget.useSoftFloat();
   2544   assert(!(isSoftFloat && NoImplicitFloatOps) &&
   2545          "SSE register cannot be used when SSE is disabled!");
   2546   if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
   2547     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
   2548     // registers.
   2549     return None;
   2550 
   2551   static const MCPhysReg XMMArgRegs64Bit[] = {
   2552     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2553     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2554   };
   2555   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
   2556 }
   2557 
   2558 SDValue X86TargetLowering::LowerFormalArguments(
   2559     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
   2560     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
   2561     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   2562   MachineFunction &MF = DAG.getMachineFunction();
   2563   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2564   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   2565 
   2566   const Function *Fn = MF.getFunction();
   2567   if (Fn->hasExternalLinkage() &&
   2568       Subtarget.isTargetCygMing() &&
   2569       Fn->getName() == "main")
   2570     FuncInfo->setForceFramePointer(true);
   2571 
   2572   MachineFrameInfo *MFI = MF.getFrameInfo();
   2573   bool Is64Bit = Subtarget.is64Bit();
   2574   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
   2575 
   2576   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
   2577          "Var args not supported with calling convention fastcc, ghc or hipe");
   2578 
   2579   if (CallConv == CallingConv::X86_INTR) {
   2580     bool isLegal = Ins.size() == 1 ||
   2581                    (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
   2582                                         (!Is64Bit && Ins[1].VT == MVT::i32)));
   2583     if (!isLegal)
   2584       report_fatal_error("X86 interrupts may take one or two arguments");
   2585   }
   2586 
   2587   // Assign locations to all of the incoming arguments.
   2588   SmallVector<CCValAssign, 16> ArgLocs;
   2589   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   2590 
   2591   // Allocate shadow area for Win64
   2592   if (IsWin64)
   2593     CCInfo.AllocateStack(32, 8);
   2594 
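           // The 32 bytes are Win64's register parameter "home area": four 8-byte
           // slots the caller always reserves for RCX, RDX, R8 and R9, regardless
           // of how many register arguments are actually used.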
   2595   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   2596 
   2597   unsigned LastVal = ~0U;
   2598   SDValue ArgValue;
   2599   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2600     CCValAssign &VA = ArgLocs[i];
   2601     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   2602     // places.
   2603     assert(VA.getValNo() != LastVal &&
   2604            "Don't support value assigned to multiple locs yet");
   2605     (void)LastVal;
   2606     LastVal = VA.getValNo();
   2607 
   2608     if (VA.isRegLoc()) {
   2609       EVT RegVT = VA.getLocVT();
   2610       const TargetRegisterClass *RC;
   2611       if (RegVT == MVT::i32)
   2612         RC = &X86::GR32RegClass;
   2613       else if (Is64Bit && RegVT == MVT::i64)
   2614         RC = &X86::GR64RegClass;
   2615       else if (RegVT == MVT::f32)
   2616         RC = &X86::FR32RegClass;
   2617       else if (RegVT == MVT::f64)
   2618         RC = &X86::FR64RegClass;
   2619       else if (RegVT == MVT::f128)
   2620         RC = &X86::FR128RegClass;
   2621       else if (RegVT.is512BitVector())
   2622         RC = &X86::VR512RegClass;
   2623       else if (RegVT.is256BitVector())
   2624         RC = &X86::VR256RegClass;
   2625       else if (RegVT.is128BitVector())
   2626         RC = &X86::VR128RegClass;
   2627       else if (RegVT == MVT::x86mmx)
   2628         RC = &X86::VR64RegClass;
   2629       else if (RegVT == MVT::i1)
   2630         RC = &X86::VK1RegClass;
   2631       else if (RegVT == MVT::v8i1)
   2632         RC = &X86::VK8RegClass;
   2633       else if (RegVT == MVT::v16i1)
   2634         RC = &X86::VK16RegClass;
   2635       else if (RegVT == MVT::v32i1)
   2636         RC = &X86::VK32RegClass;
   2637       else if (RegVT == MVT::v64i1)
   2638         RC = &X86::VK64RegClass;
   2639       else
   2640         llvm_unreachable("Unknown argument type!");
   2641 
   2642       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   2643       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   2644 
   2645       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   2646       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   2647       // right size.
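               // For example, an i8 argument promoted to i32 arrives in the low
               // byte of a 32-bit GPR; the Assert[SZ]ext node records what the
               // caller guaranteed about the upper bits so that the truncate
               // emitted below can be folded away.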
   2648       if (VA.getLocInfo() == CCValAssign::SExt)
   2649         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   2650                                DAG.getValueType(VA.getValVT()));
   2651       else if (VA.getLocInfo() == CCValAssign::ZExt)
   2652         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   2653                                DAG.getValueType(VA.getValVT()));
   2654       else if (VA.getLocInfo() == CCValAssign::BCvt)
   2655         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
   2656 
   2657       if (VA.isExtInLoc()) {
   2658         // Handle MMX values passed in XMM regs.
   2659         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
   2660           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
   2661         else
   2662           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   2663       }
   2664     } else {
   2665       assert(VA.isMemLoc());
   2666       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   2667     }
   2668 
   2669     // If value is passed via pointer - do a load.
   2670     if (VA.getLocInfo() == CCValAssign::Indirect)
   2671       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   2672                              MachinePointerInfo(), false, false, false, 0);
   2673 
   2674     InVals.push_back(ArgValue);
   2675   }
   2676 
   2677   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    2678     // The Swift calling convention does not require that we copy the sret
    2679     // argument into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
   2680     if (CallConv == CallingConv::Swift)
   2681       continue;
   2682 
   2683     // All x86 ABIs require that for returning structs by value we copy the
   2684     // sret argument into %rax/%eax (depending on ABI) for the return. Save
   2685     // the argument into a virtual register so that we can access it from the
   2686     // return points.
   2687     if (Ins[i].Flags.isSRet()) {
   2688       unsigned Reg = FuncInfo->getSRetReturnReg();
   2689       if (!Reg) {
   2690         MVT PtrTy = getPointerTy(DAG.getDataLayout());
   2691         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
   2692         FuncInfo->setSRetReturnReg(Reg);
   2693       }
   2694       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
   2695       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   2696       break;
   2697     }
   2698   }
   2699 
   2700   unsigned StackSize = CCInfo.getNextStackOffset();
   2701   // Align stack specially for tail calls.
   2702   if (shouldGuaranteeTCO(CallConv,
   2703                          MF.getTarget().Options.GuaranteedTailCallOpt))
   2704     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   2705 
    2706   // If the function takes a variable number of arguments, make a frame index for
   2707   // the start of the first vararg value... for expansion of llvm.va_start. We
   2708   // can skip this if there are no va_start calls.
   2709   if (MFI->hasVAStart() &&
   2710       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   2711                    CallConv != CallingConv::X86_ThisCall))) {
   2712     FuncInfo->setVarArgsFrameIndex(
   2713         MFI->CreateFixedObject(1, StackSize, true));
   2714   }
   2715 
   2716   // Figure out if XMM registers are in use.
   2717   assert(!(Subtarget.useSoftFloat() &&
   2718            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
   2719          "SSE register cannot be used when SSE is disabled!");
   2720 
   2721   // 64-bit calling conventions support varargs and register parameters, so we
   2722   // have to do extra work to spill them in the prologue.
   2723   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
    2724     // Find the first unallocated argument GPR and XMM registers.
   2725     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
   2726     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
   2727     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
   2728     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
   2729     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
   2730            "SSE register cannot be used when SSE is disabled!");
   2731 
   2732     // Gather all the live in physical registers.
   2733     SmallVector<SDValue, 6> LiveGPRs;
   2734     SmallVector<SDValue, 8> LiveXMMRegs;
   2735     SDValue ALVal;
   2736     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
   2737       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
   2738       LiveGPRs.push_back(
   2739           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
   2740     }
   2741     if (!ArgXMMs.empty()) {
   2742       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2743       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
   2744       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
   2745         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
   2746         LiveXMMRegs.push_back(
   2747             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
   2748       }
   2749     }
   2750 
   2751     if (IsWin64) {
   2752       // Get to the caller-allocated home save location.  Add 8 to account
   2753       // for the return address.
   2754       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   2755       FuncInfo->setRegSaveFrameIndex(
   2756           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   2757       // Fixup to set vararg frame on shadow area (4 x i64).
   2758       if (NumIntRegs < 4)
   2759         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   2760     } else {
   2761       // For X86-64, if there are vararg parameters that are passed via
   2762       // registers, then we must store them to their spots on the stack so
   2763       // they may be loaded by dereferencing the result of va_next.
   2764       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   2765       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
   2766       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
   2767           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
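               // This is the SysV x86-64 register save area: with the full set of
               // argument registers it is 6 GPRs * 8 bytes followed by 8 XMM
               // registers * 16 bytes, indexed by the gp_offset/fp_offset values
               // recorded just above when va_arg is lowered.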
   2768     }
   2769 
   2770     // Store the integer parameter registers.
   2771     SmallVector<SDValue, 8> MemOps;
   2772     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2773                                       getPointerTy(DAG.getDataLayout()));
   2774     unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2775     for (SDValue Val : LiveGPRs) {
   2776       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   2777                                 RSFIN, DAG.getIntPtrConstant(Offset, dl));
   2778       SDValue Store =
   2779           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2780                        MachinePointerInfo::getFixedStack(
   2781                            DAG.getMachineFunction(),
   2782                            FuncInfo->getRegSaveFrameIndex(), Offset),
   2783                        false, false, 0);
   2784       MemOps.push_back(Store);
   2785       Offset += 8;
   2786     }
   2787 
   2788     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
   2789       // Now store the XMM (fp + vector) parameter registers.
   2790       SmallVector<SDValue, 12> SaveXMMOps;
   2791       SaveXMMOps.push_back(Chain);
   2792       SaveXMMOps.push_back(ALVal);
   2793       SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2794                              FuncInfo->getRegSaveFrameIndex(), dl));
   2795       SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2796                              FuncInfo->getVarArgsFPOffset(), dl));
   2797       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
   2798                         LiveXMMRegs.end());
   2799       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2800                                    MVT::Other, SaveXMMOps));
   2801     }
   2802 
   2803     if (!MemOps.empty())
   2804       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   2805   }
   2806 
   2807   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
   2808     // Find the largest legal vector type.
   2809     MVT VecVT = MVT::Other;
   2810     // FIXME: Only some x86_32 calling conventions support AVX512.
   2811     if (Subtarget.hasAVX512() &&
   2812         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
   2813                      CallConv == CallingConv::Intel_OCL_BI)))
   2814       VecVT = MVT::v16f32;
   2815     else if (Subtarget.hasAVX())
   2816       VecVT = MVT::v8f32;
   2817     else if (Subtarget.hasSSE2())
   2818       VecVT = MVT::v4f32;
   2819 
   2820     // We forward some GPRs and some vector types.
   2821     SmallVector<MVT, 2> RegParmTypes;
   2822     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
   2823     RegParmTypes.push_back(IntVT);
   2824     if (VecVT != MVT::Other)
   2825       RegParmTypes.push_back(VecVT);
   2826 
   2827     // Compute the set of forwarded registers. The rest are scratch.
   2828     SmallVectorImpl<ForwardedRegister> &Forwards =
   2829         FuncInfo->getForwardedMustTailRegParms();
   2830     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
   2831 
   2832     // Conservatively forward AL on x86_64, since it might be used for varargs.
   2833     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
   2834       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2835       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
   2836     }
   2837 
   2838     // Copy all forwards from physical to virtual registers.
   2839     for (ForwardedRegister &F : Forwards) {
   2840       // FIXME: Can we use a less constrained schedule?
   2841       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
   2842       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
   2843       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
   2844     }
   2845   }
   2846 
   2847   // Some CCs need callee pop.
   2848   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2849                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2850     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2851   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    2852     // X86 interrupts must pop the error code, if present.
   2853     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
   2854   } else {
   2855     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2856     // If this is an sret function, the return should pop the hidden pointer.
   2857     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
   2858         !Subtarget.getTargetTriple().isOSMSVCRT() &&
   2859         argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
   2860       FuncInfo->setBytesToPopOnReturn(4);
   2861   }
   2862 
   2863   if (!Is64Bit) {
   2864     // RegSaveFrameIndex is X86-64 only.
   2865     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   2866     if (CallConv == CallingConv::X86_FastCall ||
   2867         CallConv == CallingConv::X86_ThisCall)
    2868       // fastcall and thiscall functions can't have varargs.
   2869       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2870   }
   2871 
   2872   FuncInfo->setArgumentStackSize(StackSize);
   2873 
   2874   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
   2875     EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
   2876     if (Personality == EHPersonality::CoreCLR) {
   2877       assert(Is64Bit);
   2878       // TODO: Add a mechanism to frame lowering that will allow us to indicate
   2879       // that we'd prefer this slot be allocated towards the bottom of the frame
   2880       // (i.e. near the stack pointer after allocating the frame).  Every
   2881       // funclet needs a copy of this slot in its (mostly empty) frame, and the
   2882       // offset from the bottom of this and each funclet's frame must be the
   2883       // same, so the size of funclets' (mostly empty) frames is dictated by
   2884       // how far this slot is from the bottom (since they allocate just enough
   2885       // space to accommodate holding this slot at the correct offset).
   2886       int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
   2887       EHInfo->PSPSymFrameIdx = PSPSymFI;
   2888     }
   2889   }
   2890 
   2891   return Chain;
   2892 }
   2893 
   2894 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
   2895                                             SDValue Arg, const SDLoc &dl,
   2896                                             SelectionDAG &DAG,
   2897                                             const CCValAssign &VA,
   2898                                             ISD::ArgFlagsTy Flags) const {
   2899   unsigned LocMemOffset = VA.getLocMemOffset();
   2900   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
   2901   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   2902                        StackPtr, PtrOff);
   2903   if (Flags.isByVal())
   2904     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2905 
   2906   return DAG.getStore(
   2907       Chain, dl, Arg, PtrOff,
   2908       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
   2909       false, false, 0);
   2910 }
   2911 
    2912 /// Emit a load of the return address if tail call
   2913 /// optimization is performed and it is required.
   2914 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
   2915     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
   2916     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
   2917   // Adjust the Return address stack slot.
   2918   EVT VT = getPointerTy(DAG.getDataLayout());
   2919   OutRetAddr = getReturnAddressFrameIndex(DAG);
   2920 
   2921   // Load the "old" Return address.
   2922   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   2923                            false, false, false, 0);
   2924   return SDValue(OutRetAddr.getNode(), 1);
   2925 }
   2926 
   2927 /// Emit a store of the return address if tail call
   2928 /// optimization is performed and it is required (FPDiff!=0).
   2929 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
   2930                                         SDValue Chain, SDValue RetAddrFrIdx,
   2931                                         EVT PtrVT, unsigned SlotSize,
   2932                                         int FPDiff, const SDLoc &dl) {
   2933   // Store the return address to the appropriate stack slot.
   2934   if (!FPDiff) return Chain;
   2935   // Calculate the new stack slot for the return address.
   2936   int NewReturnAddrFI =
   2937     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
   2938                                          false);
   2939   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   2940   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   2941                        MachinePointerInfo::getFixedStack(
   2942                            DAG.getMachineFunction(), NewReturnAddrFI),
   2943                        false, false, 0);
   2944   return Chain;
   2945 }
   2946 
    2947 /// Returns a vector_shuffle mask for a movs{s|d} or movd
    2948 /// operation of the specified width.
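         /// For example, for v4f32 the mask is <4,1,2,3>: element 0 of the result
         /// is taken from V2 and elements 1-3 from V1, matching the MOVSS/MOVSD
         /// semantics of replacing only the low element of the destination.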
   2949 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
   2950                        SDValue V2) {
   2951   unsigned NumElems = VT.getVectorNumElements();
   2952   SmallVector<int, 8> Mask;
   2953   Mask.push_back(NumElems);
   2954   for (unsigned i = 1; i != NumElems; ++i)
   2955     Mask.push_back(i);
   2956   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
   2957 }
   2958 
   2959 SDValue
   2960 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   2961                              SmallVectorImpl<SDValue> &InVals) const {
   2962   SelectionDAG &DAG                     = CLI.DAG;
   2963   SDLoc &dl                             = CLI.DL;
   2964   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   2965   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
   2966   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   2967   SDValue Chain                         = CLI.Chain;
   2968   SDValue Callee                        = CLI.Callee;
   2969   CallingConv::ID CallConv              = CLI.CallConv;
   2970   bool &isTailCall                      = CLI.IsTailCall;
   2971   bool isVarArg                         = CLI.IsVarArg;
   2972 
   2973   MachineFunction &MF = DAG.getMachineFunction();
   2974   bool Is64Bit        = Subtarget.is64Bit();
   2975   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
   2976   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
   2977   bool IsSibcall      = false;
   2978   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   2979   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
   2980 
   2981   if (CallConv == CallingConv::X86_INTR)
   2982     report_fatal_error("X86 interrupts may not be called directly");
   2983 
   2984   if (Attr.getValueAsString() == "true")
   2985     isTailCall = false;
   2986 
   2987   if (Subtarget.isPICStyleGOT() &&
   2988       !MF.getTarget().Options.GuaranteedTailCallOpt) {
   2989     // If we are using a GOT, disable tail calls to external symbols with
   2990     // default visibility. Tail calling such a symbol requires using a GOT
   2991     // relocation, which forces early binding of the symbol. This breaks code
    2992     // that requires lazy function symbol resolution. Using musttail or
   2993     // GuaranteedTailCallOpt will override this.
   2994     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   2995     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
   2996                G->getGlobal()->hasDefaultVisibility()))
   2997       isTailCall = false;
   2998   }
   2999 
   3000   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
   3001   if (IsMustTail) {
   3002     // Force this to be a tail call.  The verifier rules are enough to ensure
   3003     // that we can lower this successfully without moving the return address
   3004     // around.
   3005     isTailCall = true;
   3006   } else if (isTailCall) {
   3007     // Check if it's really possible to do a tail call.
   3008     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   3009                     isVarArg, SR != NotStructReturn,
   3010                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
   3011                     Outs, OutVals, Ins, DAG);
   3012 
   3013     // Sibcalls are automatically detected tailcalls which do not require
   3014     // ABI changes.
   3015     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   3016       IsSibcall = true;
   3017 
   3018     if (isTailCall)
   3019       ++NumTailCalls;
   3020   }
   3021 
   3022   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
   3023          "Var args not supported with calling convention fastcc, ghc or hipe");
   3024 
   3025   // Analyze operands of the call, assigning locations to each operand.
   3026   SmallVector<CCValAssign, 16> ArgLocs;
   3027   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   3028 
   3029   // Allocate shadow area for Win64
   3030   if (IsWin64)
   3031     CCInfo.AllocateStack(32, 8);
   3032 
   3033   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3034 
   3035   // Get a count of how many bytes are to be pushed on the stack.
   3036   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
   3037   if (IsSibcall)
    3038     // This is a sibcall. The memory operands are already in place in our own
    3039     // caller's stack frame, so no new stack space needs to be reserved.
   3040     NumBytes = 0;
   3041   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
   3042            canGuaranteeTCO(CallConv))
   3043     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   3044 
   3045   int FPDiff = 0;
   3046   if (isTailCall && !IsSibcall && !IsMustTail) {
   3047     // Lower arguments at fp - stackoffset + fpdiff.
   3048     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
   3049 
   3050     FPDiff = NumBytesCallerPushed - NumBytes;
   3051 
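             // FPDiff is the number of bytes this function will pop on return
             // minus the bytes of argument stack the tail callee needs; a negative
             // value means the callee needs more argument space, so the return
             // address slot has to be relocated (see the EmitTailCall load/store
             // of the return address below).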
    3052     // Set the delta of movement of the returnaddr stackslot, but only if the
    3053     // new delta is smaller (i.e. requires more movement) than the previous one.
   3054     if (FPDiff < X86Info->getTCReturnAddrDelta())
   3055       X86Info->setTCReturnAddrDelta(FPDiff);
   3056   }
   3057 
   3058   unsigned NumBytesToPush = NumBytes;
   3059   unsigned NumBytesToPop = NumBytes;
   3060 
   3061   // If we have an inalloca argument, all stack space has already been allocated
    3062   // for us and is right at the top of the stack.  We don't support multiple
   3063   // arguments passed in memory when using inalloca.
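           // (inalloca is how the 32-bit MSVC C++ ABI passes arguments that must
           // be constructed in place, e.g. objects with non-trivial copy
           // constructors: the caller allocates the whole argument block up front
           // and builds the arguments directly in it.)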
   3064   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
   3065     NumBytesToPush = 0;
   3066     if (!ArgLocs.back().isMemLoc())
   3067       report_fatal_error("cannot use inalloca attribute on a register "
   3068                          "parameter");
   3069     if (ArgLocs.back().getLocMemOffset() != 0)
   3070       report_fatal_error("any parameter with the inalloca attribute must be "
   3071                          "the only memory argument");
   3072   }
   3073 
   3074   if (!IsSibcall)
   3075     Chain = DAG.getCALLSEQ_START(
   3076         Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
   3077 
   3078   SDValue RetAddrFrIdx;
   3079   // Load return address for tail calls.
   3080   if (isTailCall && FPDiff)
   3081     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   3082                                     Is64Bit, FPDiff, dl);
   3083 
   3084   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   3085   SmallVector<SDValue, 8> MemOpChains;
   3086   SDValue StackPtr;
   3087 
   3088   // Walk the register/memloc assignments, inserting copies/loads.  In the case
    3089   // of tail call optimization, arguments are handled later.
   3090   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   3091   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    3092     // Skip inalloca arguments; they have already been written.
   3093     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3094     if (Flags.isInAlloca())
   3095       continue;
   3096 
   3097     CCValAssign &VA = ArgLocs[i];
   3098     EVT RegVT = VA.getLocVT();
   3099     SDValue Arg = OutVals[i];
   3100     bool isByVal = Flags.isByVal();
   3101 
   3102     // Promote the value if needed.
   3103     switch (VA.getLocInfo()) {
   3104     default: llvm_unreachable("Unknown loc info!");
   3105     case CCValAssign::Full: break;
   3106     case CCValAssign::SExt:
   3107       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   3108       break;
   3109     case CCValAssign::ZExt:
   3110       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   3111       break;
   3112     case CCValAssign::AExt:
   3113       if (Arg.getValueType().isVector() &&
   3114           Arg.getValueType().getVectorElementType() == MVT::i1)
   3115         Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   3116       else if (RegVT.is128BitVector()) {
   3117         // Special case: passing MMX values in XMM registers.
   3118         Arg = DAG.getBitcast(MVT::i64, Arg);
   3119         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   3120         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   3121       } else
   3122         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   3123       break;
   3124     case CCValAssign::BCvt:
   3125       Arg = DAG.getBitcast(RegVT, Arg);
   3126       break;
   3127     case CCValAssign::Indirect: {
   3128       // Store the argument.
   3129       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   3130       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   3131       Chain = DAG.getStore(
   3132           Chain, dl, Arg, SpillSlot,
   3133           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
   3134           false, false, 0);
   3135       Arg = SpillSlot;
   3136       break;
   3137     }
   3138     }
   3139 
   3140     if (VA.isRegLoc()) {
   3141       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   3142       if (isVarArg && IsWin64) {
    3143         // The Win64 ABI requires an argument XMM register to be copied to the
    3144         // corresponding shadow GPR if the callee is a varargs function.
   3145         unsigned ShadowReg = 0;
   3146         switch (VA.getLocReg()) {
   3147         case X86::XMM0: ShadowReg = X86::RCX; break;
   3148         case X86::XMM1: ShadowReg = X86::RDX; break;
   3149         case X86::XMM2: ShadowReg = X86::R8; break;
   3150         case X86::XMM3: ShadowReg = X86::R9; break;
   3151         }
   3152         if (ShadowReg)
   3153           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   3154       }
   3155     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   3156       assert(VA.isMemLoc());
   3157       if (!StackPtr.getNode())
   3158         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   3159                                       getPointerTy(DAG.getDataLayout()));
   3160       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   3161                                              dl, DAG, VA, Flags));
   3162     }
   3163   }
   3164 
   3165   if (!MemOpChains.empty())
   3166     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
   3167 
   3168   if (Subtarget.isPICStyleGOT()) {
    3169     // ELF / PIC requires the GOT pointer to be in the EBX register before
    3170     // making function calls via the PLT.
   3171     if (!isTailCall) {
   3172       RegsToPass.push_back(std::make_pair(
   3173           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
   3174                                           getPointerTy(DAG.getDataLayout()))));
   3175     } else {
    3176       // If we are tail calling and generating PIC/GOT style code, load the
    3177       // address of the callee into ECX. The value in ECX is used as the target
    3178       // of the tail jump. This is done to circumvent the ebx/callee-saved
    3179       // problem for tail calls on PIC/GOT architectures. Normally we would
    3180       // just put the address of the GOT into EBX and then call target@PLT.
    3181       // But for tail calls, EBX would be restored (since EBX is callee-saved)
    3182       // before jumping to the target@PLT.
   3183 
   3184       // Note: The actual moving to ECX is done further down.
   3185       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   3186       if (G && !G->getGlobal()->hasLocalLinkage() &&
   3187           G->getGlobal()->hasDefaultVisibility())
   3188         Callee = LowerGlobalAddress(Callee, DAG);
   3189       else if (isa<ExternalSymbolSDNode>(Callee))
   3190         Callee = LowerExternalSymbol(Callee, DAG);
   3191     }
   3192   }
   3193 
   3194   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
    3195     // From the AMD64 ABI document:
    3196     // For calls that may call functions that use varargs or stdargs
    3197     // (prototype-less calls or calls to functions containing ellipsis (...) in
    3198     // the declaration), %al is used as a hidden argument to specify the number
    3199     // of SSE registers used. The contents of %al do not need to match exactly
    3200     // the number of registers, but must be an upper bound on the number of SSE
    3201     // registers used and must be in the range 0 - 8 inclusive.
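             // For example, a call such as printf("%f", x), which passes one
             // double in XMM0, has %al set to 1 here; the callee's prologue uses
             // that value (via VASTART_SAVE_XMM_REGS in LowerFormalArguments) to
             // decide whether the XMM argument registers need to be spilled.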
   3202 
   3203     // Count the number of XMM registers allocated.
   3204     static const MCPhysReg XMMArgRegs[] = {
   3205       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   3206       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   3207     };
   3208     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
   3209     assert((Subtarget.hasSSE1() || !NumXMMRegs)
   3210            && "SSE registers cannot be used when SSE is disabled");
   3211 
   3212     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
   3213                                         DAG.getConstant(NumXMMRegs, dl,
   3214                                                         MVT::i8)));
   3215   }
   3216 
   3217   if (isVarArg && IsMustTail) {
   3218     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
   3219     for (const auto &F : Forwards) {
   3220       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
   3221       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
   3222     }
   3223   }
   3224 
   3225   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   3226   // don't need this because the eligibility check rejects calls that require
   3227   // shuffling arguments passed in memory.
   3228   if (!IsSibcall && isTailCall) {
   3229     // Force all the incoming stack arguments to be loaded from the stack
   3230     // before any new outgoing arguments are stored to the stack, because the
   3231     // outgoing stack slots may alias the incoming argument stack slots, and
   3232     // the alias isn't otherwise explicit. This is slightly more conservative
   3233     // than necessary, because it means that each store effectively depends
   3234     // on every argument instead of just those arguments it would clobber.
   3235     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   3236 
   3237     SmallVector<SDValue, 8> MemOpChains2;
   3238     SDValue FIN;
   3239     int FI = 0;
   3240     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3241       CCValAssign &VA = ArgLocs[i];
   3242       if (VA.isRegLoc())
   3243         continue;
   3244       assert(VA.isMemLoc());
   3245       SDValue Arg = OutVals[i];
   3246       ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3247       // Skip inalloca arguments.  They don't require any work.
   3248       if (Flags.isInAlloca())
   3249         continue;
   3250       // Create frame index.
   3251       int32_t Offset = VA.getLocMemOffset()+FPDiff;
   3252       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   3253       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   3254       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
   3255 
   3256       if (Flags.isByVal()) {
   3257         // Copy relative to framepointer.
   3258         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
   3259         if (!StackPtr.getNode())
   3260           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   3261                                         getPointerTy(DAG.getDataLayout()));
   3262         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   3263                              StackPtr, Source);
   3264 
   3265         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   3266                                                          ArgChain,
   3267                                                          Flags, DAG, dl));
   3268       } else {
   3269         // Store relative to framepointer.
   3270         MemOpChains2.push_back(DAG.getStore(
   3271             ArgChain, dl, Arg, FIN,
   3272             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
   3273             false, false, 0));
   3274       }
   3275     }
   3276 
   3277     if (!MemOpChains2.empty())
   3278       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
   3279 
   3280     // Store the return address to the appropriate stack slot.
   3281     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
   3282                                      getPointerTy(DAG.getDataLayout()),
   3283                                      RegInfo->getSlotSize(), FPDiff, dl);
   3284   }
   3285 
   3286   // Build a sequence of copy-to-reg nodes chained together with token chain
   3287   // and flag operands which copy the outgoing args into registers.
   3288   SDValue InFlag;
   3289   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   3290     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   3291                              RegsToPass[i].second, InFlag);
   3292     InFlag = Chain.getValue(1);
   3293   }
   3294 
   3295   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
   3296     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   3297     // In the 64-bit large code model, we have to make all calls
   3298     // through a register, since the call instruction's 32-bit
   3299     // pc-relative offset may not be large enough to hold the whole
   3300     // address.
   3301   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
   3302     // If the callee is a GlobalAddress node (quite common, every direct call
   3303     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   3304     // it.
   3305     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
   3306 
    3307     // We should use an extra load for direct calls to dllimported functions in
   3308     // non-JIT mode.
   3309     const GlobalValue *GV = G->getGlobal();
   3310     if (!GV->hasDLLImportStorageClass()) {
   3311       unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
   3312 
   3313       Callee = DAG.getTargetGlobalAddress(
   3314           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
   3315 
   3316       if (OpFlags == X86II::MO_GOTPCREL) {
   3317         // Add a wrapper.
   3318         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
   3319           getPointerTy(DAG.getDataLayout()), Callee);
   3320         // Add extra indirection
   3321         Callee = DAG.getLoad(
   3322           getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
   3323           MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
   3324           false, 0);
   3325       }
   3326     }
   3327   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   3328     const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
   3329     unsigned char OpFlags =
   3330         Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
   3331 
   3332     Callee = DAG.getTargetExternalSymbol(
   3333         S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
   3334   } else if (Subtarget.isTarget64BitILP32() &&
   3335              Callee->getValueType(0) == MVT::i32) {
    3336     // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI.
   3337     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
   3338   }
   3339 
   3340   // Returns a chain & a flag for retval copy to use.
   3341   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   3342   SmallVector<SDValue, 8> Ops;
   3343 
   3344   if (!IsSibcall && isTailCall) {
   3345     Chain = DAG.getCALLSEQ_END(Chain,
   3346                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
   3347                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
   3348     InFlag = Chain.getValue(1);
   3349   }
   3350 
   3351   Ops.push_back(Chain);
   3352   Ops.push_back(Callee);
   3353 
   3354   if (isTailCall)
   3355     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
   3356 
   3357   // Add argument registers to the end of the list so that they are known live
   3358   // into the call.
   3359   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   3360     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   3361                                   RegsToPass[i].second.getValueType()));
   3362 
   3363   // Add a register mask operand representing the call-preserved registers.
   3364   const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
   3365   assert(Mask && "Missing call preserved mask for calling convention");
   3366 
   3367   // If this is an invoke in a 32-bit function using a funclet-based
   3368   // personality, assume the function clobbers all registers. If an exception
   3369   // is thrown, the runtime will not restore CSRs.
   3370   // FIXME: Model this more precisely so that we can register allocate across
   3371   // the normal edge and spill and fill across the exceptional edge.
   3372   if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
   3373     const Function *CallerFn = MF.getFunction();
   3374     EHPersonality Pers =
   3375         CallerFn->hasPersonalityFn()
   3376             ? classifyEHPersonality(CallerFn->getPersonalityFn())
   3377             : EHPersonality::Unknown;
   3378     if (isFuncletEHPersonality(Pers))
   3379       Mask = RegInfo->getNoPreservedMask();
   3380   }
   3381 
   3382   Ops.push_back(DAG.getRegisterMask(Mask));
   3383 
   3384   if (InFlag.getNode())
   3385     Ops.push_back(InFlag);
   3386 
   3387   if (isTailCall) {
   3388     // We used to do:
   3389     //// If this is the first return lowered for this function, add the regs
   3390     //// to the liveout set for the function.
   3391     // This isn't right, although it's probably harmless on x86; liveouts
   3392     // should be computed from returns not tail calls.  Consider a void
   3393     // function making a tail call to a function returning int.
   3394     MF.getFrameInfo()->setHasTailCall();
   3395     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   3396   }
   3397 
   3398   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   3399   InFlag = Chain.getValue(1);
   3400 
   3401   // Create the CALLSEQ_END node.
   3402   unsigned NumBytesForCalleeToPop;
   3403   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   3404                        DAG.getTarget().Options.GuaranteedTailCallOpt))
   3405     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
   3406   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
   3407            !Subtarget.getTargetTriple().isOSMSVCRT() &&
   3408            SR == StackStructReturn)
   3409     // If this is a call to a struct-return function, the callee
   3410     // pops the hidden struct pointer, so we have to push it back.
   3411     // This is common for Darwin/X86, Linux & Mingw32 targets.
   3412     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   3413     NumBytesForCalleeToPop = 4;
   3414   else
   3415     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
   3416 
   3417   if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
   3418     // No need to reset the stack after the call if the call doesn't return. To
    3419     // make the MachineInstr verifier happy, we'll pretend the callee does it for us.
   3420     NumBytesForCalleeToPop = NumBytes;
   3421   }
   3422 
   3423   // Returns a flag for retval copy to use.
   3424   if (!IsSibcall) {
   3425     Chain = DAG.getCALLSEQ_END(Chain,
   3426                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
   3427                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
   3428                                                      true),
   3429                                InFlag, dl);
   3430     InFlag = Chain.getValue(1);
   3431   }
   3432 
   3433   // Handle result values, copying them out of physregs into vregs that we
   3434   // return.
   3435   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   3436                          Ins, dl, DAG, InVals);
   3437 }
   3438 
   3439 //===----------------------------------------------------------------------===//
   3440 //                Fast Calling Convention (tail call) implementation
   3441 //===----------------------------------------------------------------------===//
   3442 
    3443 //  Like the stdcall convention, the callee cleans up the arguments, except
    3444 //  that ECX is reserved for storing the tail-called function address. Only 2
    3445 //  registers are free for argument passing (inreg). Tail call optimization is
    3446 //  performed provided:
   3447 //                * tailcallopt is enabled
   3448 //                * caller/callee are fastcc
    3449 //  On the X86_64 architecture with GOT-style position-independent code, only
    3450 //  local (within-module) calls are supported at the moment.
    3451 //  To keep the stack aligned according to the platform ABI, the function
    3452 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
    3453 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
    3454 //  for example.) If a tail-called callee has more arguments than the caller, the
   3455 //  caller needs to make sure that there is room to move the RETADDR to. This is
   3456 //  achieved by reserving an area the size of the argument delta right after the
   3457 //  original RETADDR, but before the saved framepointer or the spilled registers
   3458 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
   3459 //  stack layout:
   3460 //    arg1
   3461 //    arg2
   3462 //    RETADDR
   3463 //    [ new RETADDR
   3464 //      move area ]
   3465 //    (possible EBP)
   3466 //    ESI
   3467 //    EDI
   3468 //    local1 ..
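         //
         //  As a purely illustrative sketch (the function and argument names below
         //  are hypothetical), IR of the following shape exercises this path when
         //  built with -tailcallopt on a 32-bit target:
         //
         //    define fastcc i32 @callee(i32 inreg %a, i32 inreg %b, i32 %c, i32 %d) {
         //      ret i32 %c
         //    }
         //
         //    define fastcc i32 @caller(i32 inreg %x, i32 inreg %y) {
         //      %r = tail call fastcc i32 @callee(i32 inreg %x, i32 inreg %y, i32 1, i32 2)
         //      ret i32 %r
         //    }
         //
         //  @callee needs more stack argument space than @caller received, so the
         //  argument delta is reserved right after the original RETADDR, as described
         //  above, before the return address is moved into place.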
   3469 
    3470 /// Adjust the stack size so that, together with the return address slot, it
    3471 /// stays a multiple of the stack alignment (e.g. 16n + 12 for 16-byte alignment).
   3472 unsigned
   3473 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   3474                                                SelectionDAG& DAG) const {
   3475   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   3476   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   3477   unsigned StackAlignment = TFI.getStackAlignment();
   3478   uint64_t AlignMask = StackAlignment - 1;
   3479   int64_t Offset = StackSize;
   3480   unsigned SlotSize = RegInfo->getSlotSize();
   3481   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
   3482     // Number smaller than 12 so just add the difference.
   3483     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   3484   } else {
   3485     // Mask out lower bits, add stackalignment once plus the 12 bytes.
   3486     Offset = ((~AlignMask) & Offset) + StackAlignment +
   3487       (StackAlignment-SlotSize);
   3488   }
   3489   return Offset;
   3490 }
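         // A worked example of the computation above (assuming a 16-byte stack
         // alignment and a 4-byte slot size, i.e. a 32-bit target), so results have
         // the form 16n + 12:
         //
         //   StackSize = 20: 20 & 15 = 4  <= 12, so Offset = 20 + (12 - 4)        = 28
         //   StackSize = 30: 30 & 15 = 14 >  12, so Offset = (30 & ~15) + 16 + 12 = 44
         //
         // In both cases, pushing the 4-byte return address afterwards leaves the
         // stack 16-byte aligned again.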
   3491 
   3492 /// Return true if the given stack call argument is already available in the
    3493 /// same (relative) position of the caller's incoming argument stack.
   3494 static
   3495 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   3496                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   3497                          const X86InstrInfo *TII, const CCValAssign &VA) {
   3498   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   3499 
   3500   for (;;) {
   3501     // Look through nodes that don't alter the bits of the incoming value.
   3502     unsigned Op = Arg.getOpcode();
   3503     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
   3504       Arg = Arg.getOperand(0);
   3505       continue;
   3506     }
   3507     if (Op == ISD::TRUNCATE) {
   3508       const SDValue &TruncInput = Arg.getOperand(0);
   3509       if (TruncInput.getOpcode() == ISD::AssertZext &&
   3510           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
   3511               Arg.getValueType()) {
   3512         Arg = TruncInput.getOperand(0);
   3513         continue;
   3514       }
   3515     }
   3516     break;
   3517   }
   3518 
   3519   int FI = INT_MAX;
   3520   if (Arg.getOpcode() == ISD::CopyFromReg) {
   3521     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   3522     if (!TargetRegisterInfo::isVirtualRegister(VR))
   3523       return false;
   3524     MachineInstr *Def = MRI->getVRegDef(VR);
   3525     if (!Def)
   3526       return false;
   3527     if (!Flags.isByVal()) {
   3528       if (!TII->isLoadFromStackSlot(*Def, FI))
   3529         return false;
   3530     } else {
   3531       unsigned Opcode = Def->getOpcode();
   3532       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
   3533            Opcode == X86::LEA64_32r) &&
   3534           Def->getOperand(1).isFI()) {
   3535         FI = Def->getOperand(1).getIndex();
   3536         Bytes = Flags.getByValSize();
   3537       } else
   3538         return false;
   3539     }
   3540   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   3541     if (Flags.isByVal())
   3542       // ByVal argument is passed in as a pointer but it's now being
   3543       // dereferenced. e.g.
   3544       // define @foo(%struct.X* %A) {
   3545       //   tail call @bar(%struct.X* byval %A)
   3546       // }
   3547       return false;
   3548     SDValue Ptr = Ld->getBasePtr();
   3549     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   3550     if (!FINode)
   3551       return false;
   3552     FI = FINode->getIndex();
   3553   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   3554     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   3555     FI = FINode->getIndex();
   3556     Bytes = Flags.getByValSize();
   3557   } else
   3558     return false;
   3559 
   3560   assert(FI != INT_MAX);
   3561   if (!MFI->isFixedObjectIndex(FI))
   3562     return false;
   3563 
   3564   if (Offset != MFI->getObjectOffset(FI))
   3565     return false;
   3566 
   3567   if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
   3568     // If the argument location is wider than the argument type, check that any
   3569     // extension flags match.
   3570     if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
   3571         Flags.isSExt() != MFI->isObjectSExt(FI)) {
   3572       return false;
   3573     }
   3574   }
   3575 
   3576   return Bytes == MFI->getObjectSize(FI);
   3577 }
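         // For illustration (hypothetical IR), in a 32-bit sibcall such as
         //
         //   define i32 @caller(i32 %a, i32 %b) {
         //     %r = tail call i32 @callee(i32 %a, i32 %b)
         //     ret i32 %r
         //   }
         //
         // both outgoing arguments already live in the caller's fixed incoming stack
         // slots at the same offsets, so MatchingStackOffset returns true for them
         // and no argument stores are needed before the jump.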
   3578 
   3579 /// Check whether the call is eligible for tail call optimization. Targets
   3580 /// that want to do tail call optimization should implement this function.
   3581 bool X86TargetLowering::IsEligibleForTailCallOptimization(
   3582     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
   3583     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
   3584     const SmallVectorImpl<ISD::OutputArg> &Outs,
   3585     const SmallVectorImpl<SDValue> &OutVals,
   3586     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
   3587   if (!mayTailCallThisCC(CalleeCC))
   3588     return false;
   3589 
   3590   // If -tailcallopt is specified, make fastcc functions tail-callable.
   3591   MachineFunction &MF = DAG.getMachineFunction();
   3592   const Function *CallerF = MF.getFunction();
   3593 
   3594   // If the function return type is x86_fp80 and the callee return type is not,
   3595   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   3596   // perform a tailcall optimization here.
   3597   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
   3598     return false;
   3599 
   3600   CallingConv::ID CallerCC = CallerF->getCallingConv();
   3601   bool CCMatch = CallerCC == CalleeCC;
   3602   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
   3603   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
   3604 
   3605   // Win64 functions have extra shadow space for argument homing. Don't do the
   3606   // sibcall if the caller and callee have mismatched expectations for this
   3607   // space.
   3608   if (IsCalleeWin64 != IsCallerWin64)
   3609     return false;
   3610 
   3611   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
   3612     if (canGuaranteeTCO(CalleeCC) && CCMatch)
   3613       return true;
   3614     return false;
   3615   }
   3616 
   3617   // Look for obvious safe cases to perform tail call optimization that do not
   3618   // require ABI changes. This is what gcc calls sibcall.
   3619 
   3620   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   3621   // emit a special epilogue.
   3622   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   3623   if (RegInfo->needsStackRealignment(MF))
   3624     return false;
   3625 
   3626   // Also avoid sibcall optimization if either caller or callee uses struct
   3627   // return semantics.
   3628   if (isCalleeStructRet || isCallerStructRet)
   3629     return false;
   3630 
   3631   // Do not sibcall optimize vararg calls unless all arguments are passed via
   3632   // registers.
   3633   LLVMContext &C = *DAG.getContext();
   3634   if (isVarArg && !Outs.empty()) {
   3635     // Optimizing for varargs on Win64 is unlikely to be safe without
   3636     // additional testing.
   3637     if (IsCalleeWin64 || IsCallerWin64)
   3638       return false;
   3639 
   3640     SmallVector<CCValAssign, 16> ArgLocs;
   3641     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
   3642 
   3643     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3644     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   3645       if (!ArgLocs[i].isRegLoc())
   3646         return false;
   3647   }
   3648 
   3649   // If the call result is in ST0 / ST1, it needs to be popped off the x87
   3650   // stack.  Therefore, if it's not used by the call it is not safe to optimize
   3651   // this into a sibcall.
   3652   bool Unused = false;
   3653   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   3654     if (!Ins[i].Used) {
   3655       Unused = true;
   3656       break;
   3657     }
   3658   }
   3659   if (Unused) {
   3660     SmallVector<CCValAssign, 16> RVLocs;
   3661     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
   3662     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   3663     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   3664       CCValAssign &VA = RVLocs[i];
   3665       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
   3666         return false;
   3667     }
   3668   }
   3669 
   3670   // Check that the call results are passed in the same way.
   3671   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
   3672                                   RetCC_X86, RetCC_X86))
   3673     return false;
   3674   // The callee has to preserve all registers the caller needs to preserve.
   3675   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
   3676   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
   3677   if (!CCMatch) {
   3678     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
   3679     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
   3680       return false;
   3681   }
   3682 
   3683   unsigned StackArgsSize = 0;
   3684 
   3685   // If the callee takes no arguments then go on to check the results of the
   3686   // call.
   3687   if (!Outs.empty()) {
   3688     // Check if stack adjustment is needed. For now, do not do this if any
   3689     // argument is passed on the stack.
   3690     SmallVector<CCValAssign, 16> ArgLocs;
   3691     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
   3692 
   3693     // Allocate shadow area for Win64
   3694     if (IsCalleeWin64)
   3695       CCInfo.AllocateStack(32, 8);
   3696 
   3697     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3698     StackArgsSize = CCInfo.getNextStackOffset();
   3699 
   3700     if (CCInfo.getNextStackOffset()) {
    3701       // Check if the arguments are already laid out in the same way as
   3702       // the caller's fixed stack objects.
   3703       MachineFrameInfo *MFI = MF.getFrameInfo();
   3704       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   3705       const X86InstrInfo *TII = Subtarget.getInstrInfo();
   3706       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3707         CCValAssign &VA = ArgLocs[i];
   3708         SDValue Arg = OutVals[i];
   3709         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3710         if (VA.getLocInfo() == CCValAssign::Indirect)
   3711           return false;
   3712         if (!VA.isRegLoc()) {
   3713           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   3714                                    MFI, MRI, TII, VA))
   3715             return false;
   3716         }
   3717       }
   3718     }
   3719 
   3720     bool PositionIndependent = isPositionIndependent();
   3721     // If the tailcall address may be in a register, then make sure it's
   3722     // possible to register allocate for it. In 32-bit, the call address can
   3723     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   3724     // callee-saved registers are restored. These happen to be the same
   3725     // registers used to pass 'inreg' arguments so watch out for those.
   3726     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
   3727                                   !isa<ExternalSymbolSDNode>(Callee)) ||
   3728                                  PositionIndependent)) {
   3729       unsigned NumInRegs = 0;
   3730       // In PIC we need an extra register to formulate the address computation
   3731       // for the callee.
   3732       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
   3733 
   3734       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3735         CCValAssign &VA = ArgLocs[i];
   3736         if (!VA.isRegLoc())
   3737           continue;
   3738         unsigned Reg = VA.getLocReg();
   3739         switch (Reg) {
   3740         default: break;
   3741         case X86::EAX: case X86::EDX: case X86::ECX:
   3742           if (++NumInRegs == MaxInRegs)
   3743             return false;
   3744           break;
   3745         }
   3746       }
   3747     }
   3748 
   3749     const MachineRegisterInfo &MRI = MF.getRegInfo();
   3750     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
   3751       return false;
   3752   }
   3753 
   3754   bool CalleeWillPop =
   3755       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
   3756                        MF.getTarget().Options.GuaranteedTailCallOpt);
   3757 
   3758   if (unsigned BytesToPop =
   3759           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
   3760     // If we have bytes to pop, the callee must pop them.
   3761     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
   3762     if (!CalleePopMatches)
   3763       return false;
   3764   } else if (CalleeWillPop && StackArgsSize > 0) {
   3765     // If we don't have bytes to pop, make sure the callee doesn't pop any.
   3766     return false;
   3767   }
   3768 
   3769   return true;
   3770 }
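         // As a hedged illustration of the struct-return restriction above, a 32-bit
         // call of the following shape (hypothetical IR) is not considered for
         // sibcall optimization, since struct-return semantics affect who pops the
         // hidden sret pointer:
         //
         //   tail call void @callee(%struct.S* sret %tmp)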
   3771 
   3772 FastISel *
   3773 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   3774                                   const TargetLibraryInfo *libInfo) const {
   3775   return X86::createFastISel(funcInfo, libInfo);
   3776 }
   3777 
   3778 //===----------------------------------------------------------------------===//
   3779 //                           Other Lowering Hooks
   3780 //===----------------------------------------------------------------------===//
   3781 
   3782 static bool MayFoldLoad(SDValue Op) {
   3783   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   3784 }
   3785 
   3786 static bool MayFoldIntoStore(SDValue Op) {
   3787   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   3788 }
   3789 
   3790 static bool isTargetShuffle(unsigned Opcode) {
   3791   switch(Opcode) {
   3792   default: return false;
   3793   case X86ISD::BLENDI:
   3794   case X86ISD::PSHUFB:
   3795   case X86ISD::PSHUFD:
   3796   case X86ISD::PSHUFHW:
   3797   case X86ISD::PSHUFLW:
   3798   case X86ISD::SHUFP:
   3799   case X86ISD::INSERTPS:
   3800   case X86ISD::PALIGNR:
   3801   case X86ISD::VSHLDQ:
   3802   case X86ISD::VSRLDQ:
   3803   case X86ISD::MOVLHPS:
   3804   case X86ISD::MOVLHPD:
   3805   case X86ISD::MOVHLPS:
   3806   case X86ISD::MOVLPS:
   3807   case X86ISD::MOVLPD:
   3808   case X86ISD::MOVSHDUP:
   3809   case X86ISD::MOVSLDUP:
   3810   case X86ISD::MOVDDUP:
   3811   case X86ISD::MOVSS:
   3812   case X86ISD::MOVSD:
   3813   case X86ISD::UNPCKL:
   3814   case X86ISD::UNPCKH:
   3815   case X86ISD::VPERMILPI:
   3816   case X86ISD::VPERMILPV:
   3817   case X86ISD::VPERM2X128:
   3818   case X86ISD::VPERMIL2:
   3819   case X86ISD::VPERMI:
   3820   case X86ISD::VPPERM:
   3821   case X86ISD::VPERMV:
   3822   case X86ISD::VPERMV3:
   3823   case X86ISD::VZEXT_MOVL:
   3824     return true;
   3825   }
   3826 }
   3827 
   3828 static bool isTargetShuffleVariableMask(unsigned Opcode) {
   3829   switch (Opcode) {
   3830   default: return false;
   3831   case X86ISD::PSHUFB:
   3832   case X86ISD::VPERMILPV:
   3833     return true;
   3834   }
   3835 }
   3836 
   3837 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
   3838                                     SDValue V1, unsigned TargetMask,
   3839                                     SelectionDAG &DAG) {
   3840   switch(Opc) {
   3841   default: llvm_unreachable("Unknown x86 shuffle node");
   3842   case X86ISD::PSHUFD:
   3843   case X86ISD::PSHUFHW:
   3844   case X86ISD::PSHUFLW:
   3845   case X86ISD::VPERMILPI:
   3846   case X86ISD::VPERMI:
   3847     return DAG.getNode(Opc, dl, VT, V1,
   3848                        DAG.getConstant(TargetMask, dl, MVT::i8));
   3849   }
   3850 }
   3851 
   3852 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
   3853                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   3854   switch(Opc) {
   3855   default: llvm_unreachable("Unknown x86 shuffle node");
   3856   case X86ISD::MOVLHPS:
   3857   case X86ISD::MOVLHPD:
   3858   case X86ISD::MOVHLPS:
   3859   case X86ISD::MOVLPS:
   3860   case X86ISD::MOVLPD:
   3861   case X86ISD::MOVSS:
   3862   case X86ISD::MOVSD:
   3863   case X86ISD::UNPCKL:
   3864   case X86ISD::UNPCKH:
   3865     return DAG.getNode(Opc, dl, VT, V1, V2);
   3866   }
   3867 }
   3868 
   3869 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   3870   MachineFunction &MF = DAG.getMachineFunction();
   3871   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   3872   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   3873   int ReturnAddrIndex = FuncInfo->getRAIndex();
   3874 
   3875   if (ReturnAddrIndex == 0) {
   3876     // Set up a frame object for the return address.
   3877     unsigned SlotSize = RegInfo->getSlotSize();
   3878     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
   3879                                                            -(int64_t)SlotSize,
   3880                                                            false);
   3881     FuncInfo->setRAIndex(ReturnAddrIndex);
   3882   }
   3883 
   3884   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
   3885 }
   3886 
   3887 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   3888                                        bool hasSymbolicDisplacement) {
   3889   // Offset should fit into 32 bit immediate field.
   3890   if (!isInt<32>(Offset))
   3891     return false;
   3892 
   3893   // If we don't have a symbolic displacement - we don't have any extra
   3894   // restrictions.
   3895   if (!hasSymbolicDisplacement)
   3896     return true;
   3897 
   3898   // FIXME: Some tweaks might be needed for medium code model.
   3899   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3900     return false;
   3901 
    3902   // For the small code model we assume that the last object ends at least 16MB
    3903   // before the 31-bit boundary. We may also accept fairly large negative
    3904   // constants, knowing that all objects are in the positive half of the address space.
   3905   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3906     return true;
   3907 
    3908   // For the kernel code model we know that all objects reside in the negative
    3909   // half of the 32-bit address space. We must not accept negative offsets, since
    3910   // they could move the address out of range, but fairly large positive ones are fine.
   3911   if (M == CodeModel::Kernel && Offset >= 0)
   3912     return true;
   3913 
   3914   return false;
   3915 }
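         // A few illustrative inputs for the checks above (a symbolic displacement is
         // assumed to be present except in the last case):
         //
         //   Offset = 4096,           M = Small  -> true  (well below the 16MB margin)
         //   Offset = 20*1024*1024,   M = Small  -> false (too close to the 31-bit boundary)
         //   Offset = -8,             M = Kernel -> false (negative offsets are rejected)
         //   Offset = 1LL << 40, no symbolic displacement -> false (does not fit in 32 bits)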
   3916 
   3917 /// Determines whether the callee is required to pop its own arguments.
   3918 /// Callee pop is necessary to support tail calls.
   3919 bool X86::isCalleePop(CallingConv::ID CallingConv,
   3920                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
   3921   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
   3922   // can guarantee TCO.
   3923   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
   3924     return true;
   3925 
   3926   switch (CallingConv) {
   3927   default:
   3928     return false;
   3929   case CallingConv::X86_StdCall:
   3930   case CallingConv::X86_FastCall:
   3931   case CallingConv::X86_ThisCall:
   3932   case CallingConv::X86_VectorCall:
   3933     return !is64Bit;
   3934   }
   3935 }
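         // For example (illustrative only): a 32-bit X86_StdCall callee taking two
         // i32 arguments pops its own 8 bytes with "ret 8", so isCalleePop returns
         // true for it, while the same signature under the default C calling
         // convention returns false and the caller cleans up the stack.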
   3936 
   3937 /// \brief Return true if the condition is an unsigned comparison operation.
   3938 static bool isX86CCUnsigned(unsigned X86CC) {
   3939   switch (X86CC) {
   3940   default:
   3941     llvm_unreachable("Invalid integer condition!");
   3942   case X86::COND_E:
   3943   case X86::COND_NE:
   3944   case X86::COND_B:
   3945   case X86::COND_A:
   3946   case X86::COND_BE:
   3947   case X86::COND_AE:
   3948     return true;
   3949   case X86::COND_G:
   3950   case X86::COND_GE:
   3951   case X86::COND_L:
   3952   case X86::COND_LE:
   3953     return false;
   3954   }
   3955 }
   3956 
   3957 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   3958   switch (SetCCOpcode) {
   3959   default: llvm_unreachable("Invalid integer condition!");
   3960   case ISD::SETEQ:  return X86::COND_E;
   3961   case ISD::SETGT:  return X86::COND_G;
   3962   case ISD::SETGE:  return X86::COND_GE;
   3963   case ISD::SETLT:  return X86::COND_L;
   3964   case ISD::SETLE:  return X86::COND_LE;
   3965   case ISD::SETNE:  return X86::COND_NE;
   3966   case ISD::SETULT: return X86::COND_B;
   3967   case ISD::SETUGT: return X86::COND_A;
   3968   case ISD::SETULE: return X86::COND_BE;
   3969   case ISD::SETUGE: return X86::COND_AE;
   3970   }
   3971 }
   3972 
   3973 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
   3974 /// condition code, returning the condition code and the LHS/RHS of the
   3975 /// comparison to make.
   3976 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
   3977                                bool isFP, SDValue &LHS, SDValue &RHS,
   3978                                SelectionDAG &DAG) {
   3979   if (!isFP) {
   3980     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   3981       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
   3982         // X > -1   -> X == 0, jump !sign.
   3983         RHS = DAG.getConstant(0, DL, RHS.getValueType());
   3984         return X86::COND_NS;
   3985       }
   3986       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
   3987         // X < 0   -> X == 0, jump on sign.
   3988         return X86::COND_S;
   3989       }
   3990       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   3991         // X < 1   -> X <= 0
   3992         RHS = DAG.getConstant(0, DL, RHS.getValueType());
   3993         return X86::COND_LE;
   3994       }
   3995     }
   3996 
   3997     return TranslateIntegerX86CC(SetCCOpcode);
   3998   }
   3999 
   4000   // First determine if it is required or is profitable to flip the operands.
   4001 
   4002   // If LHS is a foldable load, but RHS is not, flip the condition.
   4003   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   4004       !ISD::isNON_EXTLoad(RHS.getNode())) {
   4005     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   4006     std::swap(LHS, RHS);
   4007   }
   4008 
   4009   switch (SetCCOpcode) {
   4010   default: break;
   4011   case ISD::SETOLT:
   4012   case ISD::SETOLE:
   4013   case ISD::SETUGT:
   4014   case ISD::SETUGE:
   4015     std::swap(LHS, RHS);
   4016     break;
   4017   }
   4018 
   4019   // On a floating point condition, the flags are set as follows:
   4020   // ZF  PF  CF   op
   4021   //  0 | 0 | 0 | X > Y
   4022   //  0 | 0 | 1 | X < Y
   4023   //  1 | 0 | 0 | X == Y
   4024   //  1 | 1 | 1 | unordered
   4025   switch (SetCCOpcode) {
   4026   default: llvm_unreachable("Condcode should be pre-legalized away");
   4027   case ISD::SETUEQ:
   4028   case ISD::SETEQ:   return X86::COND_E;
   4029   case ISD::SETOLT:              // flipped
   4030   case ISD::SETOGT:
   4031   case ISD::SETGT:   return X86::COND_A;
   4032   case ISD::SETOLE:              // flipped
   4033   case ISD::SETOGE:
   4034   case ISD::SETGE:   return X86::COND_AE;
   4035   case ISD::SETUGT:              // flipped
   4036   case ISD::SETULT:
   4037   case ISD::SETLT:   return X86::COND_B;
   4038   case ISD::SETUGE:              // flipped
   4039   case ISD::SETULE:
   4040   case ISD::SETLE:   return X86::COND_BE;
   4041   case ISD::SETONE:
   4042   case ISD::SETNE:   return X86::COND_NE;
   4043   case ISD::SETUO:   return X86::COND_P;
   4044   case ISD::SETO:    return X86::COND_NP;
   4045   case ISD::SETOEQ:
   4046   case ISD::SETUNE:  return X86::COND_INVALID;
   4047   }
   4048 }
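         // Two worked instances of the integer special cases above (values are
         // illustrative): "x > -1" has its RHS rewritten to 0 and maps to COND_NS
         // (sign flag clear, i.e. x >= 0), while "x < 1" has its RHS rewritten to 0
         // and maps to COND_LE (x <= 0); both avoid materializing the original
         // immediate.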
   4049 
   4050 /// Is there a floating point cmov for the specific X86 condition code?
    4051 /// The current x86 ISA includes the following FP cmov instructions:
    4052 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   4053 static bool hasFPCMov(unsigned X86CC) {
   4054   switch (X86CC) {
   4055   default:
   4056     return false;
   4057   case X86::COND_B:
   4058   case X86::COND_BE:
   4059   case X86::COND_E:
   4060   case X86::COND_P:
   4061   case X86::COND_A:
   4062   case X86::COND_AE:
   4063   case X86::COND_NE:
   4064   case X86::COND_NP:
   4065     return true;
   4066   }
   4067 }
   4068 
   4069 
   4070 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   4071                                            const CallInst &I,
   4072                                            unsigned Intrinsic) const {
   4073 
   4074   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
   4075   if (!IntrData)
   4076     return false;
   4077 
   4078   Info.opc = ISD::INTRINSIC_W_CHAIN;
   4079   Info.readMem = false;
   4080   Info.writeMem = false;
   4081   Info.vol = false;
   4082   Info.offset = 0;
   4083 
   4084   switch (IntrData->Type) {
   4085   case EXPAND_FROM_MEM: {
   4086     Info.ptrVal = I.getArgOperand(0);
   4087     Info.memVT = MVT::getVT(I.getType());
   4088     Info.align = 1;
   4089     Info.readMem = true;
   4090     break;
   4091   }
   4092   case COMPRESS_TO_MEM: {
   4093     Info.ptrVal = I.getArgOperand(0);
   4094     Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
   4095     Info.align = 1;
   4096     Info.writeMem = true;
   4097     break;
   4098   }
   4099   case TRUNCATE_TO_MEM_VI8:
   4100   case TRUNCATE_TO_MEM_VI16:
   4101   case TRUNCATE_TO_MEM_VI32: {
   4102     Info.ptrVal = I.getArgOperand(0);
   4103     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
   4104     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
   4105     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
   4106       ScalarVT = MVT::i8;
   4107     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
   4108       ScalarVT = MVT::i16;
   4109     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
   4110       ScalarVT = MVT::i32;
   4111 
   4112     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
   4113     Info.align = 1;
   4114     Info.writeMem = true;
   4115     break;
   4116   }
   4117   default:
   4118     return false;
   4119   }
   4120 
   4121   return true;
   4122 }
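         // For example, for a truncating-store intrinsic classified as
         // TRUNCATE_TO_MEM_VI8 whose value operand has type <8 x i32>, the code
         // above reports Info.memVT = v8i8, i.e. only the 8 truncated bytes are
         // considered written to memory.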
   4123 
   4124 /// Returns true if the target can instruction select the
   4125 /// specified FP immediate natively. If false, the legalizer will
   4126 /// materialize the FP immediate as a load from a constant pool.
   4127 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   4128   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   4129     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   4130       return true;
   4131   }
   4132   return false;
   4133 }
   4134 
   4135 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
   4136                                               ISD::LoadExtType ExtTy,
   4137                                               EVT NewVT) const {
   4138   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
    4139   // relocations must target a movq or addq instruction: don't let the load shrink.
   4140   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
   4141   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
   4142     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
   4143       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
   4144   return true;
   4145 }
   4146 
   4147 /// \brief Returns true if it is beneficial to convert a load of a constant
   4148 /// to just the constant itself.
   4149 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   4150                                                           Type *Ty) const {
   4151   assert(Ty->isIntegerTy());
   4152 
   4153   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   4154   if (BitSize == 0 || BitSize > 64)
   4155     return false;
   4156   return true;
   4157 }
   4158 
   4159 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
   4160                                                 unsigned Index) const {
   4161   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
   4162     return false;
   4163 
   4164   return (Index == 0 || Index == ResVT.getVectorNumElements());
   4165 }
   4166 
   4167 bool X86TargetLowering::isCheapToSpeculateCttz() const {
   4168   // Speculate cttz only if we can directly use TZCNT.
   4169   return Subtarget.hasBMI();
   4170 }
   4171 
   4172 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
   4173   // Speculate ctlz only if we can directly use LZCNT.
   4174   return Subtarget.hasLZCNT();
   4175 }
   4176 
   4177 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   4178   if (!Subtarget.hasBMI())
   4179     return false;
   4180 
   4181   // There are only 32-bit and 64-bit forms for 'andn'.
   4182   EVT VT = Y.getValueType();
   4183   if (VT != MVT::i32 && VT != MVT::i64)
   4184     return false;
   4185 
   4186   return true;
   4187 }
   4188 
   4189 /// Return true if every element in Mask, beginning
   4190 /// from position Pos and ending in Pos+Size is undef.
   4191 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
   4192   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
   4193     if (0 <= Mask[i])
   4194       return false;
   4195   return true;
   4196 }
   4197 
   4198 /// Return true if Val is undef or if its value falls within the
    4199 /// specified range [Low, Hi).
   4200 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   4201   return (Val < 0) || (Val >= Low && Val < Hi);
   4202 }
   4203 
   4204 /// Return true if every element in Mask is undef or if its value
    4205 /// falls within the specified range [Low, Hi).
   4206 static bool isUndefOrInRange(ArrayRef<int> Mask,
   4207                              int Low, int Hi) {
   4208   for (int M : Mask)
   4209     if (!isUndefOrInRange(M, Low, Hi))
   4210       return false;
   4211   return true;
   4212 }
   4213 
   4214 /// Val is either less than zero (undef) or equal to the specified value.
   4215 static bool isUndefOrEqual(int Val, int CmpVal) {
   4216   return (Val < 0 || Val == CmpVal);
   4217 }
   4218 
   4219 /// Val is either the undef or zero sentinel value.
   4220 static bool isUndefOrZero(int Val) {
   4221   return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
   4222 }
   4223 
   4224 /// Return true if every element in Mask, beginning
   4225 /// from position Pos and ending in Pos+Size, falls within the specified
    4226 /// sequential range [Low, Low+Size), or is undef.
   4227 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   4228                                        unsigned Pos, unsigned Size, int Low) {
   4229   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   4230     if (!isUndefOrEqual(Mask[i], Low))
   4231       return false;
   4232   return true;
   4233 }
   4234 
   4235 /// Return true if every element in Mask, beginning
   4236 /// from position Pos and ending in Pos+Size, falls within the specified
    4237 /// sequential range [Low, Low+Size), or is undef or is zero.
   4238 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
   4239                                              unsigned Size, int Low) {
   4240   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
   4241     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
   4242       return false;
   4243   return true;
   4244 }
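         // For example, the shuffle mask {4, -1, 6, 7} satisfies
         // isSequentialOrUndefInRange(Mask, 0, 4, 4): every element is either undef
         // (-1) or equal to the expected sequential value starting at 4.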
   4245 
   4246 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
    4247 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
   4248 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
   4249   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4250   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4251     return false;
   4252 
   4253   // The index should be aligned on a vecWidth-bit boundary.
   4254   uint64_t Index =
   4255     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4256 
   4257   MVT VT = N->getSimpleValueType(0);
   4258   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4259   bool Result = (Index * ElSize) % vecWidth == 0;
   4260 
   4261   return Result;
   4262 }
   4263 
   4264 /// Return true if the specified INSERT_SUBVECTOR
    4265 /// operand specifies a subvector insert that is suitable for instructions that
    4266 /// insert 128- or 256-bit subvectors.
   4267 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
   4268   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4269   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4270     return false;
   4271   // The index should be aligned on a vecWidth-bit boundary.
   4272   uint64_t Index =
   4273     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4274 
   4275   MVT VT = N->getSimpleValueType(0);
   4276   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4277   bool Result = (Index * ElSize) % vecWidth == 0;
   4278 
   4279   return Result;
   4280 }
   4281 
   4282 bool X86::isVINSERT128Index(SDNode *N) {
   4283   return isVINSERTIndex(N, 128);
   4284 }
   4285 
   4286 bool X86::isVINSERT256Index(SDNode *N) {
   4287   return isVINSERTIndex(N, 256);
   4288 }
   4289 
   4290 bool X86::isVEXTRACT128Index(SDNode *N) {
   4291   return isVEXTRACTIndex(N, 128);
   4292 }
   4293 
   4294 bool X86::isVEXTRACT256Index(SDNode *N) {
   4295   return isVEXTRACTIndex(N, 256);
   4296 }
   4297 
   4298 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   4299   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4300   assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
   4301          "Illegal extract subvector for VEXTRACT");
   4302 
   4303   uint64_t Index =
   4304     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4305 
   4306   MVT VecVT = N->getOperand(0).getSimpleValueType();
   4307   MVT ElVT = VecVT.getVectorElementType();
   4308 
   4309   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4310   return Index / NumElemsPerChunk;
   4311 }
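         // A worked example: extracting the upper half of a <8 x float> vector with
         // vecWidth = 128 gives ElVT = f32 and NumElemsPerChunk = 128 / 32 = 4, so an
         // EXTRACT_SUBVECTOR index of 4 maps to immediate 1 (the high 128-bit lane),
         // which is what VEXTRACTF128 expects.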
   4312 
   4313 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
   4314   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4315   assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
   4316          "Illegal insert subvector for VINSERT");
   4317 
   4318   uint64_t Index =
   4319     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4320 
   4321   MVT VecVT = N->getSimpleValueType(0);
   4322   MVT ElVT = VecVT.getVectorElementType();
   4323 
   4324   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4325   return Index / NumElemsPerChunk;
   4326 }
   4327 
   4328 /// Return the appropriate immediate to extract the specified
    4329 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
   4330 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
   4331   return getExtractVEXTRACTImmediate(N, 128);
   4332 }
   4333 
   4334 /// Return the appropriate immediate to extract the specified
   4335 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions.
   4336 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
   4337   return getExtractVEXTRACTImmediate(N, 256);
   4338 }
   4339 
   4340 /// Return the appropriate immediate to insert at the specified
   4341 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
   4342 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
   4343   return getInsertVINSERTImmediate(N, 128);
   4344 }
   4345 
   4346 /// Return the appropriate immediate to insert at the specified
    4347 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
   4348 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   4349   return getInsertVINSERTImmediate(N, 256);
   4350 }
   4351 
   4352 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
   4353 bool X86::isZeroNode(SDValue Elt) {
   4354   return isNullConstant(Elt) || isNullFPConstant(Elt);
   4355 }
   4356 
   4357 // Build a vector of constants
   4358 // Use an UNDEF node if MaskElt == -1.
    4359 // Split 64-bit constants in 32-bit mode.
   4360 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   4361                               const SDLoc &dl, bool IsMask = false) {
   4362 
   4363   SmallVector<SDValue, 32>  Ops;
   4364   bool Split = false;
   4365 
   4366   MVT ConstVecVT = VT;
   4367   unsigned NumElts = VT.getVectorNumElements();
   4368   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
   4369   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
   4370     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
   4371     Split = true;
   4372   }
   4373 
   4374   MVT EltVT = ConstVecVT.getVectorElementType();
   4375   for (unsigned i = 0; i < NumElts; ++i) {
   4376     bool IsUndef = Values[i] < 0 && IsMask;
   4377     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
   4378       DAG.getConstant(Values[i], dl, EltVT);
   4379     Ops.push_back(OpNode);
   4380     if (Split)
   4381       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
   4382                     DAG.getConstant(0, dl, EltVT));
   4383   }
   4384   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
   4385   if (Split)
   4386     ConstsNode = DAG.getBitcast(VT, ConstsNode);
   4387   return ConstsNode;
   4388 }
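         // For example (illustrative call), getConstVector({1, 2}, MVT::v2i64, DAG, dl)
         // on a 32-bit target, where i64 is not legal, builds a v4i32 build_vector
         // <1, 0, 2, 0> and bitcasts it back to v2i64; on a 64-bit target the v2i64
         // constants are emitted directly.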
   4389 
   4390 /// Returns a vector of specified type with all zero elements.
   4391 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
   4392                              SelectionDAG &DAG, const SDLoc &dl) {
   4393   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
   4394           VT.getVectorElementType() == MVT::i1) &&
   4395          "Unexpected vector type");
   4396 
   4397   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
   4398   // type. This ensures they get CSE'd. But if the integer type is not
   4399   // available, use a floating-point +0.0 instead.
   4400   SDValue Vec;
   4401   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
   4402     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
   4403   } else if (VT.getVectorElementType() == MVT::i1) {
   4404     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
   4405            "Unexpected vector type");
   4406     assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
   4407            "Unexpected vector type");
   4408     Vec = DAG.getConstant(0, dl, VT);
   4409   } else {
   4410     unsigned Num32BitElts = VT.getSizeInBits() / 32;
   4411     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
   4412   }
   4413   return DAG.getBitcast(VT, Vec);
   4414 }
   4415 
   4416 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
   4417                                 const SDLoc &dl, unsigned vectorWidth) {
   4418   assert((vectorWidth == 128 || vectorWidth == 256) &&
   4419          "Unsupported vector width");
   4420   EVT VT = Vec.getValueType();
   4421   EVT ElVT = VT.getVectorElementType();
   4422   unsigned Factor = VT.getSizeInBits()/vectorWidth;
   4423   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
   4424                                   VT.getVectorNumElements()/Factor);
   4425 
   4426   // Extract from UNDEF is UNDEF.
   4427   if (Vec.isUndef())
   4428     return DAG.getUNDEF(ResultVT);
   4429 
   4430   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
   4431   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
   4432   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   4433 
   4434   // This is the index of the first element of the vectorWidth-bit chunk
   4435   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
   4436   IdxVal &= ~(ElemsPerChunk - 1);
   4437 
   4438   // If the input is a buildvector just emit a smaller one.
   4439   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
   4440     return DAG.getNode(ISD::BUILD_VECTOR,
   4441          dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
   4442 
   4443   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
   4444   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
   4445 }
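         // For example, extracting from a v8i32 with IdxVal = 5 and vectorWidth = 128
         // first rounds the index down to 4 (ElemsPerChunk = 4) and then emits
         // EXTRACT_SUBVECTOR(Vec, 4), yielding the upper v4i32 half.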
   4446 
   4447 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
   4448 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
   4449 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
   4450 /// instructions or a simple subregister reference. Idx is an index in the
   4451 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
   4452 /// lowering EXTRACT_VECTOR_ELT operations easier.
   4453 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
   4454                                    SelectionDAG &DAG, const SDLoc &dl) {
   4455   assert((Vec.getValueType().is256BitVector() ||
   4456           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
   4457   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
   4458 }
   4459 
   4460 /// Generate a DAG to grab 256-bits from a 512-bit vector.
   4461 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
   4462                                    SelectionDAG &DAG, const SDLoc &dl) {
   4463   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
   4464   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
   4465 }
   4466 
   4467 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   4468                                SelectionDAG &DAG, const SDLoc &dl,
   4469                                unsigned vectorWidth) {
   4470   assert((vectorWidth == 128 || vectorWidth == 256) &&
   4471          "Unsupported vector width");
    4472   // Inserting an UNDEF subvector leaves Result unchanged.
   4473   if (Vec.isUndef())
   4474     return Result;
   4475   EVT VT = Vec.getValueType();
   4476   EVT ElVT = VT.getVectorElementType();
   4477   EVT ResultVT = Result.getValueType();
   4478 
   4479   // Insert the relevant vectorWidth bits.
   4480   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
   4481   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   4482 
   4483   // This is the index of the first element of the vectorWidth-bit chunk
   4484   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
   4485   IdxVal &= ~(ElemsPerChunk - 1);
   4486 
   4487   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
   4488   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
   4489 }
   4490 
   4491 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
   4492 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
   4493 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
   4494 /// simple superregister reference.  Idx is an index in the 128 bits
   4495 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
   4496 /// lowering INSERT_VECTOR_ELT operations easier.
   4497 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   4498                                   SelectionDAG &DAG, const SDLoc &dl) {
   4499   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
   4500 
   4501   // For insertion into the zero index (low half) of a 256-bit vector, it is
   4502   // more efficient to generate a blend with immediate instead of an insert*128.
   4503   // We are still creating an INSERT_SUBVECTOR below with an undef node to
   4504   // extend the subvector to the size of the result vector. Make sure that
   4505   // we are not recursing on that node by checking for undef here.
   4506   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
   4507       !Result.isUndef()) {
   4508     EVT ResultVT = Result.getValueType();
   4509     SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
   4510     SDValue Undef = DAG.getUNDEF(ResultVT);
   4511     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
   4512                                  Vec, ZeroIndex);
   4513 
   4514     // The blend instruction, and therefore its mask, depend on the data type.
   4515     MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
   4516     if (ScalarType.isFloatingPoint()) {
   4517       // Choose either vblendps (float) or vblendpd (double).
   4518       unsigned ScalarSize = ScalarType.getSizeInBits();
   4519       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
   4520       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
   4521       SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
   4522       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
   4523     }
   4524 
   4525     const X86Subtarget &Subtarget =
   4526     static_cast<const X86Subtarget &>(DAG.getSubtarget());
   4527 
   4528     // AVX2 is needed for 256-bit integer blend support.
   4529     // Integers must be cast to 32-bit because there is only vpblendd;
   4530     // vpblendw can't be used for this because it has a handicapped mask.
   4531 
   4532     // If we don't have AVX2, then cast to float. Using a wrong domain blend
   4533     // is still more efficient than using the wrong domain vinsertf128 that
   4534     // will be created by InsertSubVector().
   4535     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
   4536 
   4537     SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
   4538     Result = DAG.getBitcast(CastVT, Result);
   4539     Vec256 = DAG.getBitcast(CastVT, Vec256);
   4540     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
   4541     return DAG.getBitcast(ResultVT, Vec256);
   4542   }
   4543 
   4544   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
   4545 }
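         // For example, inserting a v4f32 subvector at IdxVal = 0 into a v8f32 result
         // is emitted as an X86ISD::BLENDI (vblendps) with mask 0x0f, taking the low
         // four lanes from the widened subvector and the high four from Result - the
         // cheaper-than-vinsertf128 case handled in insert128BitVector above.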
   4546 
   4547 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   4548                                   SelectionDAG &DAG, const SDLoc &dl) {
   4549   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
   4550   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
   4551 }
   4552 
   4553 /// Insert an i1 subvector into an i1 vector.
   4554 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   4555                                 const X86Subtarget &Subtarget) {
   4556 
   4557   SDLoc dl(Op);
   4558   SDValue Vec = Op.getOperand(0);
   4559   SDValue SubVec = Op.getOperand(1);
   4560   SDValue Idx = Op.getOperand(2);
   4561 
   4562   if (!isa<ConstantSDNode>(Idx))
   4563     return SDValue();
   4564 
   4565   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   4566   if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
   4567     return Op;
   4568 
   4569   MVT OpVT = Op.getSimpleValueType();
   4570   MVT SubVecVT = SubVec.getSimpleValueType();
   4571   unsigned NumElems = OpVT.getVectorNumElements();
   4572   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
   4573 
   4574   assert(IdxVal + SubVecNumElems <= NumElems &&
   4575          IdxVal % SubVecVT.getSizeInBits() == 0 &&
   4576          "Unexpected index value in INSERT_SUBVECTOR");
   4577 
   4578   // There are 3 possible cases:
   4579   // 1. Subvector should be inserted in the lower part (IdxVal == 0)
   4580   // 2. Subvector should be inserted in the upper part
   4581   //    (IdxVal + SubVecNumElems == NumElems)
   4582   // 3. Subvector should be inserted in the middle (for example v2i1
   4583   //    to v16i1, index 2)
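          // Worked example (illustrative): inserting a v2i1 subvector into a
          // v16i1 vector at IdxVal == 4 is case 3. If the destination is known
          // to be all zeros, the code below shifts the widened subvector left
          // by 14 (NumElems - SubVecNumElems) and then right by 10, parking the
          // two subvector bits at positions 4 and 5 with zeros elsewhere.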
   4584 
   4585   // Extend to a type whose KSHIFT is natively supported.
   4586   MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
   4587   MVT WideOpVT = OpVT;
   4588   if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
   4589     WideOpVT = MinVT;
   4590 
   4591   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
   4592   SDValue Undef = DAG.getUNDEF(WideOpVT);
   4593   SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
   4594                                    Undef, SubVec, ZeroIdx);
   4595 
   4596   // Extract the sub-vector if required.
   4597   auto ExtractSubVec = [&](SDValue V) {
   4598     return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
   4599                                                 OpVT, V, ZeroIdx);
   4600   };
   4601 
   4602   if (Vec.isUndef()) {
   4603     if (IdxVal != 0) {
   4604       SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
   4605       WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
   4606     }
   4607     return ExtractSubVec(WideSubVec);
   4608   }
   4609 
   4610   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
   4611     NumElems = WideOpVT.getVectorNumElements();
   4612     unsigned ShiftLeft = NumElems - SubVecNumElems;
   4613     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
   4614     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
   4615                              DAG.getConstant(ShiftLeft, dl, MVT::i8));
   4616     Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
   4617       DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
   4618     return ExtractSubVec(Vec);
   4619   }
   4620 
   4621   if (IdxVal == 0) {
   4622     // Zero the lower bits of Vec.
   4623     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
   4624     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
   4625     Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
   4626     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
   4627     // Merge them together; SubVec should be zero-extended.
   4628     WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
   4629                              getZeroVector(WideOpVT, Subtarget, DAG, dl),
   4630                              SubVec, ZeroIdx);
   4631     Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
   4632     return ExtractSubVec(Vec);
   4633   }
   4634 
   4635   // Simple case: the subvector goes in the upper part.
   4636   if (IdxVal + SubVecNumElems == NumElems) {
   4637     // Zero the upper bits of Vec.
   4638     WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
   4639                              DAG.getConstant(IdxVal, dl, MVT::i8));
   4640     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
   4641     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
   4642     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
   4643     Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
   4644     Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
   4645     return ExtractSubVec(Vec);
   4646   }
   4647   // The subvector should be inserted in the middle - use a shuffle.
   4648   WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
   4649                            SubVec, ZeroIdx);
   4650   SmallVector<int, 64> Mask;
   4651   for (unsigned i = 0; i < NumElems; ++i)
   4652     Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
   4653                     i : i + NumElems);
   4654   return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
   4655 }
   4656 
   4657 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
   4658 /// instructions. This is used because creating CONCAT_VECTOR nodes of
   4659 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
   4660 /// large BUILD_VECTORS.
   4661 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
   4662                                    unsigned NumElems, SelectionDAG &DAG,
   4663                                    const SDLoc &dl) {
   4664   SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
   4665   return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
   4666 }
   4667 
   4668 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
   4669                                    unsigned NumElems, SelectionDAG &DAG,
   4670                                    const SDLoc &dl) {
   4671   SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
   4672   return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
   4673 }
   4674 
   4675 /// Returns a vector of specified type with all bits set.
   4676 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
   4677 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
   4678 /// Then bitcast to their original type, ensuring they get CSE'd.
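        /// For example (illustrative): an all-ones v4i64 on a target without
        /// AVX2 is built by concatenating a <4 x i32> all-ones constant with
        /// itself into a <8 x i32>, then bitcasting back to v4i64 so the
        /// constant can be CSE'd.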
   4679 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
   4680                              SelectionDAG &DAG, const SDLoc &dl) {
   4681   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
   4682          "Expected a 128/256/512-bit vector type");
   4683 
   4684   APInt Ones = APInt::getAllOnesValue(32);
   4685   unsigned NumElts = VT.getSizeInBits() / 32;
   4686   SDValue Vec;
   4687   if (!Subtarget.hasInt256() && NumElts == 8) {
   4688     Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
   4689     Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
   4690   } else {
   4691     Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
   4692   }
   4693   return DAG.getBitcast(VT, Vec);
   4694 }
   4695 
   4696 /// Returns a vector_shuffle node for an unpackl operation.
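        /// For example (illustrative), for v4i32 this builds the shuffle mask
        /// <0, 4, 1, 5>, interleaving the low halves of V1 and V2.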
   4697 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   4698                           SDValue V1, SDValue V2) {
   4699   assert(VT.is128BitVector() && "Expected a 128-bit vector type");
   4700   unsigned NumElems = VT.getVectorNumElements();
   4701   SmallVector<int, 8> Mask(NumElems);
   4702   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
   4703     Mask[i * 2]     = i;
   4704     Mask[i * 2 + 1] = i + NumElems;
   4705   }
   4706   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
   4707 }
   4708 
   4709 /// Returns a vector_shuffle node for an unpackh operation.
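        /// For example (illustrative), for v4i32 this builds the shuffle mask
        /// <2, 6, 3, 7>, interleaving the high halves of V1 and V2.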
   4710 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   4711                           SDValue V1, SDValue V2) {
   4712   assert(VT.is128BitVector() && "Expected a 128-bit vector type");
   4713   unsigned NumElems = VT.getVectorNumElements();
   4714   SmallVector<int, 8> Mask(NumElems);
   4715   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
   4716     Mask[i * 2]     = i + Half;
   4717     Mask[i * 2 + 1] = i + NumElems + Half;
   4718   }
   4719   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
   4720 }
   4721 
   4722 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
   4723 /// This produces a shuffle where the low element of V2 is swizzled into the
   4724 /// zero/undef vector, landing at element Idx.
   4725 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
   4726 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
   4727                                            bool IsZero,
   4728                                            const X86Subtarget &Subtarget,
   4729                                            SelectionDAG &DAG) {
   4730   MVT VT = V2.getSimpleValueType();
   4731   SDValue V1 = IsZero
   4732     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   4733   int NumElems = VT.getVectorNumElements();
   4734   SmallVector<int, 16> MaskVec(NumElems);
   4735   for (int i = 0; i != NumElems; ++i)
   4736     // If this is the insertion idx, put the low elt of V2 here.
   4737     MaskVec[i] = (i == Idx) ? NumElems : i;
   4738   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
   4739 }
   4740 
   4741 static SDValue peekThroughBitcasts(SDValue V) {
   4742   while (V.getNode() && V.getOpcode() == ISD::BITCAST)
   4743     V = V.getOperand(0);
   4744   return V;
   4745 }
   4746 
   4747 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
   4748                                         unsigned MaskEltSizeInBits,
   4749                                         SmallVectorImpl<uint64_t> &RawMask) {
   4750   MaskNode = peekThroughBitcasts(MaskNode);
   4751 
   4752   MVT VT = MaskNode.getSimpleValueType();
   4753   assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
   4754 
   4755   // Split an APInt element into MaskEltSizeInBits sized pieces and
   4756   // insert into the shuffle mask.
   4757   auto SplitElementToMask = [&](APInt Element) {
   4758     // Note that this is x86 and so always little endian: the low byte is
   4759     // the first byte of the mask.
   4760     int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
   4761     for (int i = 0; i < Split; ++i) {
   4762       APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
   4763       Element = Element.lshr(MaskEltSizeInBits);
   4764       RawMask.push_back(RawElt.getZExtValue());
   4765     }
   4766   };
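          // Worked example (illustrative): splitting the 64-bit element
          // 0x0706050403020100 into 8-bit mask entries appends 0x00, 0x01, ...,
          // 0x07 in that order, since the low byte is emitted first.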
   4767 
   4768   if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
   4769     // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
   4770     // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
   4771     if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
   4772       return false;
   4773     if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
   4774       const APInt &MaskElement = CN->getAPIntValue();
   4775       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
   4776         APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
   4777         RawMask.push_back(RawElt.getZExtValue());
   4778       }
   4779     }
   4780     return false;
   4781   }
   4782 
   4783   if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
   4784       MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
   4785 
   4786     // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
   4787     if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
   4788       return false;
   4789     unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
   4790 
   4791     SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
   4792     if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
   4793       SplitElementToMask(CN->getAPIntValue());
   4794       RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
   4795       return true;
   4796     }
   4797     return false;
   4798   }
   4799 
   4800   if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
   4801     return false;
   4802 
   4803   // We can always decode if the buildvector is all zero constants,
   4804   // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
   4805   if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
   4806     RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
   4807     return true;
   4808   }
   4809 
   4810   // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
   4811   if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
   4812     return false;
   4813 
   4814   for (SDValue Op : MaskNode->ops()) {
   4815     if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
   4816       SplitElementToMask(CN->getAPIntValue());
   4817     else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
   4818       SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
   4819     else
   4820       return false;
   4821   }
   4822 
   4823   return true;
   4824 }
   4825 
   4826 static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
   4827   MaskNode = peekThroughBitcasts(MaskNode);
   4828 
   4829   auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
   4830   if (!MaskLoad)
   4831     return nullptr;
   4832 
   4833   SDValue Ptr = MaskLoad->getBasePtr();
   4834   if (Ptr->getOpcode() == X86ISD::Wrapper ||
   4835       Ptr->getOpcode() == X86ISD::WrapperRIP)
   4836     Ptr = Ptr->getOperand(0);
   4837 
   4838   auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
   4839   if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
   4840     return nullptr;
   4841 
   4842   return dyn_cast<Constant>(MaskCP->getConstVal());
   4843 }
   4844 
   4845 /// Calculates the shuffle mask corresponding to the target-specific opcode.
   4846 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
   4847 /// operands in \p Ops, and returns true.
   4848 /// Sets \p IsUnary to true if only one source is used. Note that this will set
   4849 /// IsUnary for shuffles which use a single input multiple times, and in those
   4850 /// cases it will adjust the mask to only have indices within that single input.
   4851 /// It is an error to call this with non-empty Mask/Ops vectors.
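        /// For example (illustrative): an X86ISD::UNPCKL of v4i32 whose two
        /// operands are the same node decodes to the mask <0, 4, 1, 5>; since
        /// the inputs are identical it is treated as unary and the mask is
        /// remapped to <0, 0, 1, 1>.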
   4852 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   4853                                  SmallVectorImpl<SDValue> &Ops,
   4854                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   4855   unsigned NumElems = VT.getVectorNumElements();
   4856   SDValue ImmN;
   4857 
   4858   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
   4859   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
   4860 
   4861   IsUnary = false;
   4862   bool IsFakeUnary = false;
   4863   switch(N->getOpcode()) {
   4864   case X86ISD::BLENDI:
   4865     ImmN = N->getOperand(N->getNumOperands()-1);
   4866     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4867     break;
   4868   case X86ISD::SHUFP:
   4869     ImmN = N->getOperand(N->getNumOperands()-1);
   4870     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4871     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4872     break;
   4873   case X86ISD::INSERTPS:
   4874     ImmN = N->getOperand(N->getNumOperands()-1);
   4875     DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4876     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4877     break;
   4878   case X86ISD::UNPCKH:
   4879     DecodeUNPCKHMask(VT, Mask);
   4880     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4881     break;
   4882   case X86ISD::UNPCKL:
   4883     DecodeUNPCKLMask(VT, Mask);
   4884     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4885     break;
   4886   case X86ISD::MOVHLPS:
   4887     DecodeMOVHLPSMask(NumElems, Mask);
   4888     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4889     break;
   4890   case X86ISD::MOVLHPS:
   4891     DecodeMOVLHPSMask(NumElems, Mask);
   4892     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4893     break;
   4894   case X86ISD::PALIGNR:
   4895     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
   4896     ImmN = N->getOperand(N->getNumOperands()-1);
   4897     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4898     break;
   4899   case X86ISD::VSHLDQ:
   4900     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
   4901     ImmN = N->getOperand(N->getNumOperands() - 1);
   4902     DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4903     IsUnary = true;
   4904     break;
   4905   case X86ISD::VSRLDQ:
   4906     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
   4907     ImmN = N->getOperand(N->getNumOperands() - 1);
   4908     DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4909     IsUnary = true;
   4910     break;
   4911   case X86ISD::PSHUFD:
   4912   case X86ISD::VPERMILPI:
   4913     ImmN = N->getOperand(N->getNumOperands()-1);
   4914     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4915     IsUnary = true;
   4916     break;
   4917   case X86ISD::PSHUFHW:
   4918     ImmN = N->getOperand(N->getNumOperands()-1);
   4919     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4920     IsUnary = true;
   4921     break;
   4922   case X86ISD::PSHUFLW:
   4923     ImmN = N->getOperand(N->getNumOperands()-1);
   4924     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4925     IsUnary = true;
   4926     break;
   4927   case X86ISD::VZEXT_MOVL:
   4928     DecodeZeroMoveLowMask(VT, Mask);
   4929     IsUnary = true;
   4930     break;
   4931   case X86ISD::VPERMILPV: {
   4932     IsUnary = true;
   4933     SDValue MaskNode = N->getOperand(1);
   4934     unsigned MaskEltSize = VT.getScalarSizeInBits();
   4935     SmallVector<uint64_t, 32> RawMask;
   4936     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
   4937       DecodeVPERMILPMask(VT, RawMask, Mask);
   4938       break;
   4939     }
   4940     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
   4941       DecodeVPERMILPMask(C, MaskEltSize, Mask);
   4942       break;
   4943     }
   4944     return false;
   4945   }
   4946   case X86ISD::PSHUFB: {
   4947     IsUnary = true;
   4948     SDValue MaskNode = N->getOperand(1);
   4949     SmallVector<uint64_t, 32> RawMask;
   4950     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
   4951       DecodePSHUFBMask(RawMask, Mask);
   4952       break;
   4953     }
   4954     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
   4955       DecodePSHUFBMask(C, Mask);
   4956       break;
   4957     }
   4958     return false;
   4959   }
   4960   case X86ISD::VPERMI:
   4961     ImmN = N->getOperand(N->getNumOperands()-1);
   4962     DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4963     IsUnary = true;
   4964     break;
   4965   case X86ISD::MOVSS:
   4966   case X86ISD::MOVSD:
   4967     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
   4968     break;
   4969   case X86ISD::VPERM2X128:
   4970     ImmN = N->getOperand(N->getNumOperands()-1);
   4971     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4972     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4973     break;
   4974   case X86ISD::MOVSLDUP:
   4975     DecodeMOVSLDUPMask(VT, Mask);
   4976     IsUnary = true;
   4977     break;
   4978   case X86ISD::MOVSHDUP:
   4979     DecodeMOVSHDUPMask(VT, Mask);
   4980     IsUnary = true;
   4981     break;
   4982   case X86ISD::MOVDDUP:
   4983     DecodeMOVDDUPMask(VT, Mask);
   4984     IsUnary = true;
   4985     break;
   4986   case X86ISD::MOVLHPD:
   4987   case X86ISD::MOVLPD:
   4988   case X86ISD::MOVLPS:
   4989     // Not yet implemented
   4990     return false;
   4991   case X86ISD::VPERMIL2: {
   4992     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4993     unsigned MaskEltSize = VT.getScalarSizeInBits();
   4994     SDValue MaskNode = N->getOperand(2);
   4995     SDValue CtrlNode = N->getOperand(3);
   4996     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
   4997       unsigned CtrlImm = CtrlOp->getZExtValue();
   4998       SmallVector<uint64_t, 32> RawMask;
   4999       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
   5000         DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
   5001         break;
   5002       }
   5003       if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
   5004         DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
   5005         break;
   5006       }
   5007     }
   5008     return false;
   5009   }
   5010   case X86ISD::VPPERM: {
   5011     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5012     SDValue MaskNode = N->getOperand(2);
   5013     SmallVector<uint64_t, 32> RawMask;
   5014     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
   5015       DecodeVPPERMMask(RawMask, Mask);
   5016       break;
   5017     }
   5018     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
   5019       DecodeVPPERMMask(C, Mask);
   5020       break;
   5021     }
   5022     return false;
   5023   }
   5024   case X86ISD::VPERMV: {
   5025     IsUnary = true;
   5026     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
   5027     Ops.push_back(N->getOperand(1));
   5028     SDValue MaskNode = N->getOperand(0);
   5029     SmallVector<uint64_t, 32> RawMask;
   5030     unsigned MaskEltSize = VT.getScalarSizeInBits();
   5031     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
   5032       DecodeVPERMVMask(RawMask, Mask);
   5033       break;
   5034     }
   5035     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
   5036       DecodeVPERMVMask(C, VT, Mask);
   5037       break;
   5038     }
   5039     return false;
   5040   }
   5041   case X86ISD::VPERMV3: {
   5042     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
   5043     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
   5044     Ops.push_back(N->getOperand(0));
   5045     Ops.push_back(N->getOperand(2));
   5046     SDValue MaskNode = N->getOperand(1);
   5047     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
   5048       DecodeVPERMV3Mask(C, VT, Mask);
   5049       break;
   5050     }
   5051     return false;
   5052   }
   5053   default: llvm_unreachable("unknown target shuffle node");
   5054   }
   5055 
   5056   // Empty mask indicates the decode failed.
   5057   if (Mask.empty())
   5058     return false;
   5059 
   5060   // Check if we're getting a shuffle mask with zero'd elements.
   5061   if (!AllowSentinelZero)
   5062     if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
   5063       return false;
   5064 
   5065   // If we have a fake unary shuffle, the shuffle mask is spread across two
   5066   // inputs that are actually the same node. Re-map the mask to always point
   5067   // into the first input.
   5068   if (IsFakeUnary)
   5069     for (int &M : Mask)
   5070       if (M >= (int)Mask.size())
   5071         M -= Mask.size();
   5072 
   5073   // If we didn't already add operands in the opcode-specific code, default to
   5074   // adding 1 or 2 operands starting at 0.
   5075   if (Ops.empty()) {
   5076     Ops.push_back(N->getOperand(0));
   5077     if (!IsUnary || IsFakeUnary)
   5078       Ops.push_back(N->getOperand(1));
   5079   }
   5080 
   5081   return true;
   5082 }
   5083 
   5084 /// Check a target shuffle mask's inputs to see if we can set any values to
   5085 /// SM_SentinelZero - this is for elements that are known to be zero
   5086 /// (not just zeroable) from their inputs.
   5087 /// Returns true if the target shuffle mask was decoded.
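        /// For example (illustrative): if one shuffle input is
        /// (build_vector 0, X, undef, Y) and the decoded mask is <0, 1, 2, 3>,
        /// the mask becomes <SM_SentinelZero, 1, SM_SentinelUndef, 3>.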
   5088 static bool setTargetShuffleZeroElements(SDValue N,
   5089                                          SmallVectorImpl<int> &Mask,
   5090                                          SmallVectorImpl<SDValue> &Ops) {
   5091   bool IsUnary;
   5092   if (!isTargetShuffle(N.getOpcode()))
   5093     return false;
   5094   if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
   5095                             Mask, IsUnary))
   5096     return false;
   5097 
   5098   SDValue V1 = Ops[0];
   5099   SDValue V2 = IsUnary ? V1 : Ops[1];
   5100 
   5101   V1 = peekThroughBitcasts(V1);
   5102   V2 = peekThroughBitcasts(V2);
   5103 
   5104   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   5105     int M = Mask[i];
   5106 
   5107     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
   5108     if (M < 0)
   5109       continue;
   5110 
   5111     // Determine shuffle input and normalize the mask.
   5112     SDValue V = M < Size ? V1 : V2;
   5113     M %= Size;
   5114 
   5115     // We are referencing an UNDEF input.
   5116     if (V.isUndef()) {
   5117       Mask[i] = SM_SentinelUndef;
   5118       continue;
   5119     }
   5120 
   5121     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
   5122     if (V.getOpcode() != ISD::BUILD_VECTOR)
   5123       continue;
   5124 
   5125     // If the BUILD_VECTOR has fewer elements than the mask, then the
   5126     // (larger) source element must be UNDEF/ZERO.
   5127     // TODO: Is it worth testing the individual bits of a constant?
   5128     if ((Size % V.getNumOperands()) == 0) {
   5129       int Scale = Size / V->getNumOperands();
   5130       SDValue Op = V.getOperand(M / Scale);
   5131       if (Op.isUndef())
   5132         Mask[i] = SM_SentinelUndef;
   5133       else if (X86::isZeroNode(Op))
   5134         Mask[i] = SM_SentinelZero;
   5135       continue;
   5136     }
   5137 
   5138     // If the BUILD_VECTOR has more elements than the mask, then all the
   5139     // (smaller) source elements must be all UNDEF or all ZERO.
   5140     if ((V.getNumOperands() % Size) == 0) {
   5141       int Scale = V->getNumOperands() / Size;
   5142       bool AllUndef = true;
   5143       bool AllZero = true;
   5144       for (int j = 0; j < Scale; ++j) {
   5145         SDValue Op = V.getOperand((M * Scale) + j);
   5146         AllUndef &= Op.isUndef();
   5147         AllZero &= X86::isZeroNode(Op);
   5148       }
   5149       if (AllUndef)
   5150         Mask[i] = SM_SentinelUndef;
   5151       else if (AllZero)
   5152         Mask[i] = SM_SentinelZero;
   5153       continue;
   5154     }
   5155   }
   5156 
   5157   return true;
   5158 }
   5159 
   5160 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
   5161 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
   5162 /// remaining input indices in case we now have a unary shuffle, adjusting the
   5163 /// Op0/Op1 inputs accordingly.
   5164 /// Returns true if the target shuffle mask was decoded.
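        /// For example (illustrative): if the decoded mask only references the
        /// second input (every index is >= NumElts), the indices are rebased by
        /// subtracting NumElts and Op1 is moved into Op0, leaving Op1 empty so
        /// callers see a unary shuffle.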
   5165 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
   5166                                        SmallVectorImpl<int> &Mask) {
   5167   SmallVector<SDValue, 2> Ops;
   5168   if (!setTargetShuffleZeroElements(Op, Mask, Ops))
   5169     return false;
   5170 
   5171   int NumElts = Mask.size();
   5172   bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
   5173     return 0 <= Idx && Idx < NumElts;
   5174   });
   5175   bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
   5176                               [NumElts](int Idx) { return NumElts <= Idx; });
   5177 
   5178   Op0 = Op0InUse ? Ops[0] : SDValue();
   5179   Op1 = Op1InUse ? Ops[1] : SDValue();
   5180 
   5181   // We're only using Op1 - commute the mask and inputs.
   5182   if (!Op0InUse && Op1InUse) {
   5183     for (int &M : Mask)
   5184       if (NumElts <= M)
   5185         M -= NumElts;
   5186     Op0 = Op1;
   5187     Op1 = SDValue();
   5188   }
   5189 
   5190   return true;
   5191 }
   5192 
   5193 /// Returns the scalar element that will make up the ith
   5194 /// element of the result of the vector shuffle.
   5195 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   5196                                    unsigned Depth) {
   5197   if (Depth == 6)
   5198     return SDValue();  // Limit search depth.
   5199 
   5200   SDValue V = SDValue(N, 0);
   5201   EVT VT = V.getValueType();
   5202   unsigned Opcode = V.getOpcode();
   5203 
   5204   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
   5205   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
   5206     int Elt = SV->getMaskElt(Index);
   5207 
   5208     if (Elt < 0)
   5209       return DAG.getUNDEF(VT.getVectorElementType());
   5210 
   5211     unsigned NumElems = VT.getVectorNumElements();
   5212     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
   5213                                          : SV->getOperand(1);
   5214     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   5215   }
   5216 
   5217   // Recurse into target specific vector shuffles to find scalars.
   5218   if (isTargetShuffle(Opcode)) {
   5219     MVT ShufVT = V.getSimpleValueType();
   5220     MVT ShufSVT = ShufVT.getVectorElementType();
   5221     int NumElems = (int)ShufVT.getVectorNumElements();
   5222     SmallVector<int, 16> ShuffleMask;
   5223     SmallVector<SDValue, 16> ShuffleOps;
   5224     bool IsUnary;
   5225 
   5226     if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
   5227       return SDValue();
   5228 
   5229     int Elt = ShuffleMask[Index];
   5230     if (Elt == SM_SentinelZero)
   5231       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
   5232                                  : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
   5233     if (Elt == SM_SentinelUndef)
   5234       return DAG.getUNDEF(ShufSVT);
   5235 
   5236     assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
   5237     SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
   5238     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
   5239                                Depth+1);
   5240   }
   5241 
   5242   // Actual nodes that may contain scalar elements
   5243   if (Opcode == ISD::BITCAST) {
   5244     V = V.getOperand(0);
   5245     EVT SrcVT = V.getValueType();
   5246     unsigned NumElems = VT.getVectorNumElements();
   5247 
   5248     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
   5249       return SDValue();
   5250   }
   5251 
   5252   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   5253     return (Index == 0) ? V.getOperand(0)
   5254                         : DAG.getUNDEF(VT.getVectorElementType());
   5255 
   5256   if (V.getOpcode() == ISD::BUILD_VECTOR)
   5257     return V.getOperand(Index);
   5258 
   5259   return SDValue();
   5260 }
   5261 
   5262 /// Custom lower build_vector of v16i8.
   5263 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   5264                                        unsigned NumNonZero, unsigned NumZero,
   5265                                        SelectionDAG &DAG,
   5266                                        const X86Subtarget &Subtarget,
   5267                                        const TargetLowering &TLI) {
   5268   if (NumNonZero > 8)
   5269     return SDValue();
   5270 
   5271   SDLoc dl(Op);
   5272   SDValue V;
   5273   bool First = true;
   5274 
   5275   // SSE4.1 - use PINSRB to insert each byte directly.
   5276   if (Subtarget.hasSSE41()) {
   5277     for (unsigned i = 0; i < 16; ++i) {
   5278       bool isNonZero = (NonZeros & (1 << i)) != 0;
   5279       if (isNonZero) {
   5280         if (First) {
   5281           if (NumZero)
   5282             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
   5283           else
   5284             V = DAG.getUNDEF(MVT::v16i8);
   5285           First = false;
   5286         }
   5287         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   5288                         MVT::v16i8, V, Op.getOperand(i),
   5289                         DAG.getIntPtrConstant(i, dl));
   5290       }
   5291     }
   5292 
   5293     return V;
   5294   }
   5295 
   5296   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
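          // For example (illustrative): nonzero bytes b0 and b1 destined for
          // lanes 0 and 1 are combined as (zext b1 << 8) | zext b0 and inserted
          // as 16-bit element 0 of a v8i16, which is bitcast back to v16i8 at
          // the end.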
   5297   for (unsigned i = 0; i < 16; ++i) {
   5298     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
   5299     if (ThisIsNonZero && First) {
   5300       if (NumZero)
   5301         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   5302       else
   5303         V = DAG.getUNDEF(MVT::v8i16);
   5304       First = false;
   5305     }
   5306 
   5307     if ((i & 1) != 0) {
   5308       SDValue ThisElt, LastElt;
   5309       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
   5310       if (LastIsNonZero) {
   5311         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
   5312                               MVT::i16, Op.getOperand(i-1));
   5313       }
   5314       if (ThisIsNonZero) {
   5315         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
   5316         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
   5317                               ThisElt, DAG.getConstant(8, dl, MVT::i8));
   5318         if (LastIsNonZero)
   5319           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
   5320       } else
   5321         ThisElt = LastElt;
   5322 
   5323       if (ThisElt.getNode())
   5324         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
   5325                         DAG.getIntPtrConstant(i/2, dl));
   5326     }
   5327   }
   5328 
   5329   return DAG.getBitcast(MVT::v16i8, V);
   5330 }
   5331 
   5332 /// Custom lower build_vector of v8i16.
   5333 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   5334                                      unsigned NumNonZero, unsigned NumZero,
   5335                                      SelectionDAG &DAG,
   5336                                      const X86Subtarget &Subtarget,
   5337                                      const TargetLowering &TLI) {
   5338   if (NumNonZero > 4)
   5339     return SDValue();
   5340 
   5341   SDLoc dl(Op);
   5342   SDValue V;
   5343   bool First = true;
   5344   for (unsigned i = 0; i < 8; ++i) {
   5345     bool isNonZero = (NonZeros & (1 << i)) != 0;
   5346     if (isNonZero) {
   5347       if (First) {
   5348         if (NumZero)
   5349           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   5350         else
   5351           V = DAG.getUNDEF(MVT::v8i16);
   5352         First = false;
   5353       }
   5354       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   5355                       MVT::v8i16, V, Op.getOperand(i),
   5356                       DAG.getIntPtrConstant(i, dl));
   5357     }
   5358   }
   5359 
   5360   return V;
   5361 }
   5362 
   5363 /// Custom lower build_vector of v4i32 or v4f32.
   5364 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
   5365                                      const X86Subtarget &Subtarget,
   5366                                      const TargetLowering &TLI) {
   5367   // Find all zeroable elements.
   5368   std::bitset<4> Zeroable;
   5369   for (int i=0; i < 4; ++i) {
   5370     SDValue Elt = Op->getOperand(i);
   5371     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
   5372   }
   5373   assert(Zeroable.size() - Zeroable.count() > 1 &&
   5374          "We expect at least two non-zero elements!");
   5375 
   5376   // We only know how to deal with build_vector nodes where elements are either
   5377   // zeroable or extract_vector_elt with constant index.
   5378   SDValue FirstNonZero;
   5379   unsigned FirstNonZeroIdx;
   5380   for (unsigned i=0; i < 4; ++i) {
   5381     if (Zeroable[i])
   5382       continue;
   5383     SDValue Elt = Op->getOperand(i);
   5384     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   5385         !isa<ConstantSDNode>(Elt.getOperand(1)))
   5386       return SDValue();
   5387     // Make sure that this node is extracting from a 128-bit vector.
   5388     MVT VT = Elt.getOperand(0).getSimpleValueType();
   5389     if (!VT.is128BitVector())
   5390       return SDValue();
   5391     if (!FirstNonZero.getNode()) {
   5392       FirstNonZero = Elt;
   5393       FirstNonZeroIdx = i;
   5394     }
   5395   }
   5396 
   5397   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
   5398   SDValue V1 = FirstNonZero.getOperand(0);
   5399   MVT VT = V1.getSimpleValueType();
   5400 
   5401   // See if this build_vector can be lowered as a blend with zero.
   5402   SDValue Elt;
   5403   unsigned EltMaskIdx, EltIdx;
   5404   int Mask[4];
   5405   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
   5406     if (Zeroable[EltIdx]) {
   5407       // The zero vector will be on the right hand side.
   5408       Mask[EltIdx] = EltIdx+4;
   5409       continue;
   5410     }
   5411 
   5412     Elt = Op->getOperand(EltIdx);
   5413     // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
   5414     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
   5415     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
   5416       break;
   5417     Mask[EltIdx] = EltIdx;
   5418   }
   5419 
   5420   if (EltIdx == 4) {
   5421     // Let the shuffle legalizer deal with blend operations.
   5422     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
   5423     if (V1.getSimpleValueType() != VT)
   5424       V1 = DAG.getBitcast(VT, V1);
   5425     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
   5426   }
   5427 
   5428   // See if we can lower this build_vector to an INSERTPS.
   5429   if (!Subtarget.hasSSE41())
   5430     return SDValue();
   5431 
   5432   SDValue V2 = Elt.getOperand(0);
   5433   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
   5434     V1 = SDValue();
   5435 
   5436   bool CanFold = true;
   5437   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
   5438     if (Zeroable[i])
   5439       continue;
   5440 
   5441     SDValue Current = Op->getOperand(i);
   5442     SDValue SrcVector = Current->getOperand(0);
   5443     if (!V1.getNode())
   5444       V1 = SrcVector;
   5445     CanFold = SrcVector == V1 &&
   5446       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
   5447   }
   5448 
   5449   if (!CanFold)
   5450     return SDValue();
   5451 
   5452   assert(V1.getNode() && "Expected at least two non-zero elements!");
   5453   if (V1.getSimpleValueType() != MVT::v4f32)
   5454     V1 = DAG.getBitcast(MVT::v4f32, V1);
   5455   if (V2.getSimpleValueType() != MVT::v4f32)
   5456     V2 = DAG.getBitcast(MVT::v4f32, V2);
   5457 
   5458   // Ok, we can emit an INSERTPS instruction.
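          // The 8-bit INSERTPS immediate encodes the source element in bits
          // [7:6], the destination element in bits [5:4] and the zero mask in
          // bits [3:0]. For example (illustrative), copying source element 2
          // into destination element 1 while zeroing element 3 gives the mask
          // 0x98.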
   5459   unsigned ZMask = Zeroable.to_ulong();
   5460 
   5461   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
   5462   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
   5463   SDLoc DL(Op);
   5464   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
   5465                                DAG.getIntPtrConstant(InsertPSMask, DL));
   5466   return DAG.getBitcast(VT, Result);
   5467 }
   5468 
   5469 /// Return a vector logical shift node.
   5470 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
   5471                          SelectionDAG &DAG, const TargetLowering &TLI,
   5472                          const SDLoc &dl) {
   5473   assert(VT.is128BitVector() && "Unknown type for VShift");
   5474   MVT ShVT = MVT::v16i8;
   5475   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   5476   SrcOp = DAG.getBitcast(ShVT, SrcOp);
   5477   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
   5478   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
   5479   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
   5480   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
   5481 }
   5482 
   5483 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
   5484                                       SelectionDAG &DAG) {
   5485 
   5486   // Check if the scalar load can be widened into a vector load. And if
   5487   // the address is "base + cst" see if the cst can be "absorbed" into
   5488   // the shuffle mask.
   5489   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
   5490     SDValue Ptr = LD->getBasePtr();
   5491     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
   5492       return SDValue();
   5493     EVT PVT = LD->getValueType(0);
   5494     if (PVT != MVT::i32 && PVT != MVT::f32)
   5495       return SDValue();
   5496 
   5497     int FI = -1;
   5498     int64_t Offset = 0;
   5499     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
   5500       FI = FINode->getIndex();
   5501       Offset = 0;
   5502     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
   5503                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
   5504       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
   5505       Offset = Ptr.getConstantOperandVal(1);
   5506       Ptr = Ptr.getOperand(0);
   5507     } else {
   5508       return SDValue();
   5509     }
   5510 
   5511     // FIXME: 256-bit vector instructions don't require a strict alignment,
   5512     // improve this code to support it better.
   5513     unsigned RequiredAlign = VT.getSizeInBits()/8;
   5514     SDValue Chain = LD->getChain();
   5515     // Make sure the stack object alignment is at least 16 or 32.
   5516     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   5517     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
   5518       if (MFI->isFixedObjectIndex(FI)) {
   5519         // Can't change the alignment. FIXME: It's possible to compute
   5520         // the exact stack offset and reference FI + adjusted offset instead.
   5521         // If someone *really* cares about this, that's the way to implement it.
   5522         return SDValue();
   5523       } else {
   5524         MFI->setObjectAlignment(FI, RequiredAlign);
   5525       }
   5526     }
   5527 
   5528     // (Offset % 16 or 32) must be a multiple of 4. The address is then
   5529     // Ptr + (Offset & ~(RequiredAlign - 1)).
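            // Worked example (illustrative): splatting a v4i32 from an i32 load
            // at "slot + 8" keeps StartOffset == 0, widens to a 16-byte load of
            // the slot and splats element (8 - 0) >> 2 == 2; an offset of 20
            // advances Ptr by 16 and splats element 1 instead.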
   5530     if (Offset < 0)
   5531       return SDValue();
   5532     if ((Offset % RequiredAlign) & 3)
   5533       return SDValue();
   5534     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
   5535     if (StartOffset) {
   5536       SDLoc DL(Ptr);
   5537       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
   5538                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
   5539     }
   5540 
   5541     int EltNo = (Offset - StartOffset) >> 2;
   5542     unsigned NumElems = VT.getVectorNumElements();
   5543 
   5544     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
   5545     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
   5546                              LD->getPointerInfo().getWithOffset(StartOffset),
   5547                              false, false, false, 0);
   5548 
   5549     SmallVector<int, 8> Mask(NumElems, EltNo);
   5550 
   5551     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
   5552   }
   5553 
   5554   return SDValue();
   5555 }
   5556 
   5557 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
   5558 /// elements can be replaced by a single large load which has the same value as
   5559 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
   5560 ///
   5561 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
   5562 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   5563                                         SDLoc &DL, SelectionDAG &DAG,
   5564                                         bool isAfterLegalize) {
   5565   unsigned NumElems = Elts.size();
   5566 
   5567   int LastLoadedElt = -1;
   5568   SmallBitVector LoadMask(NumElems, false);
   5569   SmallBitVector ZeroMask(NumElems, false);
   5570   SmallBitVector UndefMask(NumElems, false);
   5571 
   5572   // For each element in the initializer, see if we've found a load, zero or an
   5573   // undef.
   5574   for (unsigned i = 0; i < NumElems; ++i) {
   5575     SDValue Elt = peekThroughBitcasts(Elts[i]);
   5576     if (!Elt.getNode())
   5577       return SDValue();
   5578 
   5579     if (Elt.isUndef())
   5580       UndefMask[i] = true;
   5581     else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
   5582       ZeroMask[i] = true;
   5583     else if (ISD::isNON_EXTLoad(Elt.getNode())) {
   5584       LoadMask[i] = true;
   5585       LastLoadedElt = i;
   5586       // Each loaded element must be the correct fractional portion of the
   5587       // requested vector load.
   5588       if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
   5589         return SDValue();
   5590     } else
   5591       return SDValue();
   5592   }
   5593   assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
   5594          "Incomplete element masks");
   5595 
   5596   // Handle Special Cases - all undef or undef/zero.
   5597   if (UndefMask.count() == NumElems)
   5598     return DAG.getUNDEF(VT);
   5599 
   5600   // FIXME: Should we return this as a BUILD_VECTOR instead?
   5601   if ((ZeroMask | UndefMask).count() == NumElems)
   5602     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
   5603                           : DAG.getConstantFP(0.0, DL, VT);
   5604 
   5605   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   5606   int FirstLoadedElt = LoadMask.find_first();
   5607   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
   5608   LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
   5609   EVT LDBaseVT = EltBase.getValueType();
   5610 
   5611   // Consecutive loads can contain UNDEFs but not ZERO elements.
   5612   // Consecutive loads with UNDEF and ZERO elements require an
   5613   // additional shuffle stage to clear the ZERO elements.
   5614   bool IsConsecutiveLoad = true;
   5615   bool IsConsecutiveLoadWithZeros = true;
   5616   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
   5617     if (LoadMask[i]) {
   5618       SDValue Elt = peekThroughBitcasts(Elts[i]);
   5619       LoadSDNode *LD = cast<LoadSDNode>(Elt);
   5620       if (!DAG.areNonVolatileConsecutiveLoads(
   5621               LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
   5622               i - FirstLoadedElt)) {
   5623         IsConsecutiveLoad = false;
   5624         IsConsecutiveLoadWithZeros = false;
   5625         break;
   5626       }
   5627     } else if (ZeroMask[i]) {
   5628       IsConsecutiveLoad = false;
   5629     }
   5630   }
   5631 
   5632   auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
   5633     SDValue NewLd = DAG.getLoad(
   5634         VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   5635         LDBase->getPointerInfo(), false /*LDBase->isVolatile()*/,
   5636         LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment());
   5637 
   5638     if (LDBase->hasAnyUseOfValue(1)) {
   5639       SDValue NewChain =
   5640           DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
   5641                       SDValue(NewLd.getNode(), 1));
   5642       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   5643       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   5644                              SDValue(NewLd.getNode(), 1));
   5645     }
   5646 
   5647     return NewLd;
   5648   };
   5649 
   5650   // LOAD - all consecutive load/undefs (must start/end with a load).
   5651   // If we have found an entire vector of loads and undefs, then return a large
   5652   // load of the entire vector width starting at the base pointer.
   5653   // If the vector contains zeros, then attempt to shuffle those elements.
   5654   if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
   5655       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
   5656     assert(LDBase && "Did not find base load for merging consecutive loads");
   5657     EVT EltVT = LDBase->getValueType(0);
   5658     // Ensure that the input vector size for the merged loads matches the
   5659     // cumulative size of the input elements.
   5660     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
   5661       return SDValue();
   5662 
   5663     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
   5664       return SDValue();
   5665 
   5666     if (IsConsecutiveLoad)
   5667       return CreateLoad(VT, LDBase);
   5668 
   5669     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
   5670     // vector and a zero vector to clear out the zero elements.
   5671     if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
   5672       SmallVector<int, 4> ClearMask(NumElems, -1);
   5673       for (unsigned i = 0; i < NumElems; ++i) {
   5674         if (ZeroMask[i])
   5675           ClearMask[i] = i + NumElems;
   5676         else if (LoadMask[i])
   5677           ClearMask[i] = i;
   5678       }
   5679       SDValue V = CreateLoad(VT, LDBase);
   5680       SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
   5681                                  : DAG.getConstantFP(0.0, DL, VT);
   5682       return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
   5683     }
   5684   }
   5685 
   5686   int LoadSize =
   5687       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
   5688 
   5689   // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
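          // For example (illustrative): a v4i32 built as
          // <load i32 *a, load i32 *(a+4), zero, zero> covers 64 consecutive
          // loaded bits, so it can be emitted as an X86ISD::VZEXT_LOAD of i64
          // from 'a' (typically a movq) and bitcast back to v4i32.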
   5690   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
   5691       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
   5692     MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
   5693     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
   5694     if (TLI.isTypeLegal(VecVT)) {
   5695       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
   5696       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
   5697       SDValue ResNode =
   5698           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
   5699                                   LDBase->getPointerInfo(),
   5700                                   LDBase->getAlignment(),
   5701                                   false/*isVolatile*/, true/*ReadMem*/,
   5702                                   false/*WriteMem*/);
   5703 
   5704       // Make sure the newly-created LOAD is in the same position as LDBase in
   5705       // terms of dependency. We create a TokenFactor for LDBase and ResNode,
   5706       // and update uses of LDBase's output chain to use the TokenFactor.
   5707       if (LDBase->hasAnyUseOfValue(1)) {
   5708         SDValue NewChain =
   5709             DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
   5710                         SDValue(ResNode.getNode(), 1));
   5711         DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   5712         DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   5713                                SDValue(ResNode.getNode(), 1));
   5714       }
   5715 
   5716       return DAG.getBitcast(VT, ResNode);
   5717     }
   5718   }
   5719 
   5720   // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
   5721   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
   5722       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
   5723     MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
   5724     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
   5725     if (TLI.isTypeLegal(VecVT)) {
   5726       SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
   5727                                      : DAG.getBitcast(VecSVT, EltBase);
   5728       V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
   5729       V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
   5730       return DAG.getBitcast(VT, V);
   5731     }
   5732   }
   5733 
   5734   return SDValue();
   5735 }
   5736 
   5737 /// Attempt to use the vbroadcast instruction to generate a splat value for the
   5738 /// following cases:
   5739 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
   5740 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
   5741 /// a scalar load, or a constant.
   5742 /// The VBROADCAST node is returned when a pattern is found,
   5743 /// or SDValue() otherwise.
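        /// For example (illustrative): a BUILD_VECTOR splatting a single f32
        /// constant on an AVX2 target is lowered by placing the scalar in the
        /// constant pool, loading it and emitting
        /// (X86ISD::VBROADCAST (load cp)), which typically matches a
        /// vbroadcastss from memory.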
   5744 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
   5745                                     SelectionDAG &DAG) {
   5746   // VBROADCAST requires AVX.
   5747   // TODO: Splats could be generated for non-AVX CPUs using SSE
   5748   // instructions, but there's less potential gain for only 128-bit vectors.
   5749   if (!Subtarget.hasAVX())
   5750     return SDValue();
   5751 
   5752   MVT VT = Op.getSimpleValueType();
   5753   SDLoc dl(Op);
   5754 
   5755   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
   5756          "Unsupported vector type for broadcast.");
   5757 
   5758   SDValue Ld;
   5759   bool ConstSplatVal;
   5760 
   5761   switch (Op.getOpcode()) {
   5762     default:
   5763       // Unknown pattern found.
   5764       return SDValue();
   5765 
   5766     case ISD::BUILD_VECTOR: {
   5767       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
   5768       BitVector UndefElements;
   5769       SDValue Splat = BVOp->getSplatValue(&UndefElements);
   5770 
   5771       // We need a splat of a single value to use broadcast, and it doesn't
   5772       // make any sense if the value is only in one element of the vector.
   5773       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
   5774         return SDValue();
   5775 
   5776       Ld = Splat;
   5777       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   5778                        Ld.getOpcode() == ISD::ConstantFP);
   5779 
   5780       // Make sure that all of the users of a non-constant load are from the
   5781       // BUILD_VECTOR node.
   5782       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
   5783         return SDValue();
   5784       break;
   5785     }
   5786 
   5787     case ISD::VECTOR_SHUFFLE: {
   5788       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5789 
   5790       // Shuffles must have a splat mask where the first element is
   5791       // broadcasted.
   5792       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
   5793         return SDValue();
   5794 
   5795       SDValue Sc = Op.getOperand(0);
   5796       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
   5797           Sc.getOpcode() != ISD::BUILD_VECTOR) {
   5798 
   5799         if (!Subtarget.hasInt256())
   5800           return SDValue();
   5801 
   5802         // Use the register form of the broadcast instruction available on AVX2.
   5803         if (VT.getSizeInBits() >= 256)
   5804           Sc = extract128BitVector(Sc, 0, DAG, dl);
   5805         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
   5806       }
   5807 
   5808       Ld = Sc.getOperand(0);
   5809       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   5810                        Ld.getOpcode() == ISD::ConstantFP);
   5811 
   5812       // The scalar_to_vector node and the suspected
   5813       // load node must have exactly one user.
   5814       // Constants may have multiple users.
   5815 
    5816       // AVX-512 has a register version of the broadcast instruction.
   5817       bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
   5818         Ld.getValueType().getSizeInBits() >= 32;
   5819       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
   5820           !hasRegVer))
   5821         return SDValue();
   5822       break;
   5823     }
   5824   }
   5825 
   5826   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   5827   bool IsGE256 = (VT.getSizeInBits() >= 256);
   5828 
   5829   // When optimizing for size, generate up to 5 extra bytes for a broadcast
   5830   // instruction to save 8 or more bytes of constant pool data.
   5831   // TODO: If multiple splats are generated to load the same constant,
   5832   // it may be detrimental to overall size. There needs to be a way to detect
   5833   // that condition to know if this is truly a size win.
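           // For example, splatting a v4f64 constant this way keeps only a single
           // f64 (8 bytes) in the constant pool instead of the full 32-byte vector,
           // at the cost of the extra vbroadcastsd instruction bytes.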
   5834   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
   5835 
   5836   // Handle broadcasting a single constant scalar from the constant pool
   5837   // into a vector.
   5838   // On Sandybridge (no AVX2), it is still better to load a constant vector
   5839   // from the constant pool and not to broadcast it from a scalar.
   5840   // But override that restriction when optimizing for size.
   5841   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
   5842   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
   5843     EVT CVT = Ld.getValueType();
   5844     assert(!CVT.isVector() && "Must not broadcast a vector type");
   5845 
   5846     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
   5847     // For size optimization, also splat v2f64 and v2i64, and for size opt
   5848     // with AVX2, also splat i8 and i16.
   5849     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
   5850     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
   5851         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
   5852       const Constant *C = nullptr;
   5853       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
   5854         C = CI->getConstantIntValue();
   5855       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
   5856         C = CF->getConstantFPValue();
   5857 
   5858       assert(C && "Invalid constant type");
   5859 
   5860       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   5861       SDValue CP =
   5862           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
   5863       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   5864       Ld = DAG.getLoad(
   5865           CVT, dl, DAG.getEntryNode(), CP,
   5866           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
   5867           false, false, Alignment);
   5868 
   5869       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5870     }
   5871   }
   5872 
   5873   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
   5874 
   5875   // Handle AVX2 in-register broadcasts.
   5876   if (!IsLoad && Subtarget.hasInt256() &&
   5877       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
   5878     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5879 
   5880   // The scalar source must be a normal load.
   5881   if (!IsLoad)
   5882     return SDValue();
   5883 
   5884   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
   5885       (Subtarget.hasVLX() && ScalarSize == 64))
   5886     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5887 
    5888   // The integer check is needed so that a 64-bit element broadcast into a
    5889   // 128-bit vector doesn't match double, since there is no vbroadcastsd xmm.
   5890   if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
   5891     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
   5892       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5893   }
   5894 
   5895   // Unsupported broadcast.
   5896   return SDValue();
   5897 }
   5898 
   5899 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
   5900 /// underlying vector and index.
   5901 ///
   5902 /// Modifies \p ExtractedFromVec to the real vector and returns the real
   5903 /// index.
   5904 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
   5905                                          SDValue ExtIdx) {
   5906   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
   5907   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
   5908     return Idx;
   5909 
   5910   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
   5911   // lowered this:
   5912   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
   5913   // to:
   5914   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
   5915   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
   5916   //                           undef)
   5917   //                       Constant<0>)
   5918   // In this case the vector is the extract_subvector expression and the index
   5919   // is 2, as specified by the shuffle.
   5920   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
   5921   SDValue ShuffleVec = SVOp->getOperand(0);
   5922   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
   5923   assert(ShuffleVecVT.getVectorElementType() ==
   5924          ExtractedFromVec.getSimpleValueType().getVectorElementType());
   5925 
   5926   int ShuffleIdx = SVOp->getMaskElt(Idx);
   5927   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
   5928     ExtractedFromVec = ShuffleVec;
   5929     return ShuffleIdx;
   5930   }
   5931   return Idx;
   5932 }
   5933 
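         /// Attempt to lower a BUILD_VECTOR whose operands are mostly
         /// EXTRACT_VECTOR_ELTs from at most two source vectors of the result type:
         /// build a vector shuffle of those sources and then insert the few
         /// remaining non-extract operands with INSERT_VECTOR_ELT.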
   5934 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
   5935   MVT VT = Op.getSimpleValueType();
   5936 
   5937   // Skip if insert_vec_elt is not supported.
   5938   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   5939   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
   5940     return SDValue();
   5941 
   5942   SDLoc DL(Op);
   5943   unsigned NumElems = Op.getNumOperands();
   5944 
   5945   SDValue VecIn1;
   5946   SDValue VecIn2;
   5947   SmallVector<unsigned, 4> InsertIndices;
   5948   SmallVector<int, 8> Mask(NumElems, -1);
   5949 
   5950   for (unsigned i = 0; i != NumElems; ++i) {
   5951     unsigned Opc = Op.getOperand(i).getOpcode();
   5952 
   5953     if (Opc == ISD::UNDEF)
   5954       continue;
   5955 
   5956     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
    5957       // Quit if more than 1 element needs inserting.
   5958       if (InsertIndices.size() > 1)
   5959         return SDValue();
   5960 
   5961       InsertIndices.push_back(i);
   5962       continue;
   5963     }
   5964 
   5965     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
   5966     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
   5967     // Quit if non-constant index.
   5968     if (!isa<ConstantSDNode>(ExtIdx))
   5969       return SDValue();
   5970     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
   5971 
    5972     // Quit if extracted from a vector of a different type.
   5973     if (ExtractedFromVec.getValueType() != VT)
   5974       return SDValue();
   5975 
   5976     if (!VecIn1.getNode())
   5977       VecIn1 = ExtractedFromVec;
   5978     else if (VecIn1 != ExtractedFromVec) {
   5979       if (!VecIn2.getNode())
   5980         VecIn2 = ExtractedFromVec;
   5981       else if (VecIn2 != ExtractedFromVec)
    5982         // Quit if there are more than 2 vectors to shuffle.
   5983         return SDValue();
   5984     }
   5985 
   5986     if (ExtractedFromVec == VecIn1)
   5987       Mask[i] = Idx;
   5988     else if (ExtractedFromVec == VecIn2)
   5989       Mask[i] = Idx + NumElems;
   5990   }
   5991 
   5992   if (!VecIn1.getNode())
   5993     return SDValue();
   5994 
   5995   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
   5996   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
   5997   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
   5998     unsigned Idx = InsertIndices[i];
   5999     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
   6000                      DAG.getIntPtrConstant(Idx, DL));
   6001   }
   6002 
   6003   return NV;
   6004 }
   6005 
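         /// Pack a constant vXi1 build_vector into a scalar integer constant, one bit
         /// per element (bit i comes from element i; undef elements contribute 0).
         /// For example, a v8i1 <1,0,1,1,0,0,0,0> becomes the i8 constant 0b00001101.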
   6006 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
   6007   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
   6008          Op.getScalarValueSizeInBits() == 1 &&
   6009          "Can not convert non-constant vector");
   6010   uint64_t Immediate = 0;
   6011   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
   6012     SDValue In = Op.getOperand(idx);
   6013     if (!In.isUndef())
   6014       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
   6015   }
   6016   SDLoc dl(Op);
   6017   MVT VT =
   6018    MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
   6019   return DAG.getConstant(Immediate, dl, VT);
   6020 }
   6021 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
   6022 SDValue
   6023 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   6024 
   6025   MVT VT = Op.getSimpleValueType();
   6026   assert((VT.getVectorElementType() == MVT::i1) &&
   6027          "Unexpected type in LowerBUILD_VECTORvXi1!");
   6028 
   6029   SDLoc dl(Op);
   6030   if (ISD::isBuildVectorAllZeros(Op.getNode()))
   6031     return DAG.getTargetConstant(0, dl, VT);
   6032 
   6033   if (ISD::isBuildVectorAllOnes(Op.getNode()))
   6034     return DAG.getTargetConstant(1, dl, VT);
   6035 
   6036   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
   6037     SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
   6038     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
   6039       return DAG.getBitcast(VT, Imm);
   6040     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
   6041     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
   6042                         DAG.getIntPtrConstant(0, dl));
   6043   }
   6044 
    6045   // The vector has one or more non-constant elements.
   6046   uint64_t Immediate = 0;
   6047   SmallVector<unsigned, 16> NonConstIdx;
   6048   bool IsSplat = true;
   6049   bool HasConstElts = false;
   6050   int SplatIdx = -1;
   6051   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
   6052     SDValue In = Op.getOperand(idx);
   6053     if (In.isUndef())
   6054       continue;
   6055     if (!isa<ConstantSDNode>(In))
   6056       NonConstIdx.push_back(idx);
   6057     else {
   6058       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
   6059       HasConstElts = true;
   6060     }
   6061     if (SplatIdx < 0)
   6062       SplatIdx = idx;
   6063     else if (In != Op.getOperand(SplatIdx))
   6064       IsSplat = false;
   6065   }
   6066 
    6067   // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
   6068   if (IsSplat)
   6069     return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
   6070                        DAG.getConstant(1, dl, VT),
   6071                        DAG.getConstant(0, dl, VT));
   6072 
    6073   // Insert the non-constant elements one by one.
   6074   SDValue DstVec;
   6075   SDValue Imm;
   6076   if (Immediate) {
   6077     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
   6078     Imm = DAG.getConstant(Immediate, dl, ImmVT);
   6079   }
   6080   else if (HasConstElts)
   6081     Imm = DAG.getConstant(0, dl, VT);
   6082   else
   6083     Imm = DAG.getUNDEF(VT);
   6084   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
   6085     DstVec = DAG.getBitcast(VT, Imm);
   6086   else {
   6087     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
   6088     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
   6089                          DAG.getIntPtrConstant(0, dl));
   6090   }
   6091 
   6092   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
   6093     unsigned InsertIdx = NonConstIdx[i];
   6094     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
   6095                          Op.getOperand(InsertIdx),
   6096                          DAG.getIntPtrConstant(InsertIdx, dl));
   6097   }
   6098   return DstVec;
   6099 }
   6100 
    6101 /// \brief Return true if \p N implements a horizontal binop, and return the
    6102 /// operands of the horizontal binop in V0 and V1.
   6103 ///
   6104 /// This is a helper function of LowerToHorizontalOp().
   6105 /// This function checks that the build_vector \p N in input implements a
   6106 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
   6107 /// operation to match.
   6108 /// For example, if \p Opcode is equal to ISD::ADD, then this function
   6109 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
   6110 /// is equal to ISD::SUB, then this function checks if this is a horizontal
   6111 /// arithmetic sub.
   6112 ///
   6113 /// This function only analyzes elements of \p N whose indices are
   6114 /// in range [BaseIdx, LastIdx).
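         ///
         /// For example (an illustrative sketch; A and B denote arbitrary v4f32
         /// vectors), with \p Opcode == ISD::FADD the v4f32 build_vector
         ///   (build_vector (fadd (extractelt A, 0), (extractelt A, 1)),
         ///                 (fadd (extractelt A, 2), (extractelt A, 3)),
         ///                 (fadd (extractelt B, 0), (extractelt B, 1)),
         ///                 (fadd (extractelt B, 2), (extractelt B, 3)))
         /// matches with V0 = A and V1 = B.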
   6115 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
   6116                               SelectionDAG &DAG,
   6117                               unsigned BaseIdx, unsigned LastIdx,
   6118                               SDValue &V0, SDValue &V1) {
   6119   EVT VT = N->getValueType(0);
   6120 
   6121   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
   6122   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
   6123          "Invalid Vector in input!");
   6124 
   6125   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
   6126   bool CanFold = true;
   6127   unsigned ExpectedVExtractIdx = BaseIdx;
   6128   unsigned NumElts = LastIdx - BaseIdx;
   6129   V0 = DAG.getUNDEF(VT);
   6130   V1 = DAG.getUNDEF(VT);
   6131 
   6132   // Check if N implements a horizontal binop.
   6133   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
   6134     SDValue Op = N->getOperand(i + BaseIdx);
   6135 
   6136     // Skip UNDEFs.
   6137     if (Op->isUndef()) {
   6138       // Update the expected vector extract index.
   6139       if (i * 2 == NumElts)
   6140         ExpectedVExtractIdx = BaseIdx;
   6141       ExpectedVExtractIdx += 2;
   6142       continue;
   6143     }
   6144 
   6145     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
   6146 
   6147     if (!CanFold)
   6148       break;
   6149 
   6150     SDValue Op0 = Op.getOperand(0);
   6151     SDValue Op1 = Op.getOperand(1);
   6152 
   6153     // Try to match the following pattern:
   6154     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
   6155     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   6156         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   6157         Op0.getOperand(0) == Op1.getOperand(0) &&
   6158         isa<ConstantSDNode>(Op0.getOperand(1)) &&
   6159         isa<ConstantSDNode>(Op1.getOperand(1)));
   6160     if (!CanFold)
   6161       break;
   6162 
   6163     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   6164     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
   6165 
   6166     if (i * 2 < NumElts) {
   6167       if (V0.isUndef()) {
   6168         V0 = Op0.getOperand(0);
   6169         if (V0.getValueType() != VT)
   6170           return false;
   6171       }
   6172     } else {
   6173       if (V1.isUndef()) {
   6174         V1 = Op0.getOperand(0);
   6175         if (V1.getValueType() != VT)
   6176           return false;
   6177       }
   6178       if (i * 2 == NumElts)
   6179         ExpectedVExtractIdx = BaseIdx;
   6180     }
   6181 
   6182     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
   6183     if (I0 == ExpectedVExtractIdx)
   6184       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
   6185     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
   6186       // Try to match the following dag sequence:
   6187       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
   6188       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
   6189     } else
   6190       CanFold = false;
   6191 
   6192     ExpectedVExtractIdx += 2;
   6193   }
   6194 
   6195   return CanFold;
   6196 }
   6197 
    6198 /// \brief Emit a sequence of two 128-bit horizontal add/sub operations followed
    6199 /// by a concat_vector.
   6200 ///
   6201 /// This is a helper function of LowerToHorizontalOp().
   6202 /// This function expects two 256-bit vectors called V0 and V1.
   6203 /// At first, each vector is split into two separate 128-bit vectors.
   6204 /// Then, the resulting 128-bit vectors are used to implement two
   6205 /// horizontal binary operations.
   6206 ///
   6207 /// The kind of horizontal binary operation is defined by \p X86Opcode.
   6208 ///
    6209 /// \p Mode specifies how the 128-bit halves of V0 and V1 are passed as input to
    6210 /// the two new horizontal binops.
    6211 /// When Mode is set, the first horizontal binop dag node takes as input
    6212 /// the lower 128 bits of V0 and the upper 128 bits of V0. The second
    6213 /// horizontal binop dag node takes as input the lower 128 bits of V1
    6214 /// and the upper 128 bits of V1.
   6215 ///   Example:
   6216 ///     HADD V0_LO, V0_HI
   6217 ///     HADD V1_LO, V1_HI
   6218 ///
    6219 /// Otherwise, the first horizontal binop dag node takes as input the lower
    6220 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal binop
    6221 /// dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
   6222 ///   Example:
   6223 ///     HADD V0_LO, V1_LO
   6224 ///     HADD V0_HI, V1_HI
   6225 ///
    6226 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
    6227 /// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
    6228 /// the upper 128 bits of the result.
   6229 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
   6230                                      const SDLoc &DL, SelectionDAG &DAG,
   6231                                      unsigned X86Opcode, bool Mode,
   6232                                      bool isUndefLO, bool isUndefHI) {
   6233   MVT VT = V0.getSimpleValueType();
   6234   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
   6235          "Invalid nodes in input!");
   6236 
   6237   unsigned NumElts = VT.getVectorNumElements();
   6238   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
   6239   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
   6240   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
   6241   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
   6242   MVT NewVT = V0_LO.getSimpleValueType();
   6243 
   6244   SDValue LO = DAG.getUNDEF(NewVT);
   6245   SDValue HI = DAG.getUNDEF(NewVT);
   6246 
   6247   if (Mode) {
   6248     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   6249     if (!isUndefLO && !V0->isUndef())
   6250       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
   6251     if (!isUndefHI && !V1->isUndef())
   6252       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
   6253   } else {
   6254     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   6255     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
   6256       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
   6257 
   6258     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
   6259       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
   6260   }
   6261 
   6262   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
   6263 }
   6264 
   6265 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
   6266 /// node.
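         ///
         /// For example (an illustrative sketch; A and B denote arbitrary v4f32
         /// vectors), the build_vector
         ///   (build_vector (fsub (extractelt A, 0), (extractelt B, 0)),
         ///                 (fadd (extractelt A, 1), (extractelt B, 1)),
         ///                 (fsub (extractelt A, 2), (extractelt B, 2)),
         ///                 (fadd (extractelt A, 3), (extractelt B, 3)))
         /// is folded into (X86ISD::ADDSUB A, B), i.e. a single addsubps.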
   6267 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
   6268                              const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   6269   MVT VT = BV->getSimpleValueType(0);
   6270   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
   6271       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
   6272     return SDValue();
   6273 
   6274   SDLoc DL(BV);
   6275   unsigned NumElts = VT.getVectorNumElements();
   6276   SDValue InVec0 = DAG.getUNDEF(VT);
   6277   SDValue InVec1 = DAG.getUNDEF(VT);
   6278 
   6279   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
   6280           VT == MVT::v2f64) && "build_vector with an invalid type found!");
   6281 
   6282   // Odd-numbered elements in the input build vector are obtained from
   6283   // adding two integer/float elements.
   6284   // Even-numbered elements in the input build vector are obtained from
   6285   // subtracting two integer/float elements.
   6286   unsigned ExpectedOpcode = ISD::FSUB;
   6287   unsigned NextExpectedOpcode = ISD::FADD;
   6288   bool AddFound = false;
   6289   bool SubFound = false;
   6290 
   6291   for (unsigned i = 0, e = NumElts; i != e; ++i) {
   6292     SDValue Op = BV->getOperand(i);
   6293 
   6294     // Skip 'undef' values.
   6295     unsigned Opcode = Op.getOpcode();
   6296     if (Opcode == ISD::UNDEF) {
   6297       std::swap(ExpectedOpcode, NextExpectedOpcode);
   6298       continue;
   6299     }
   6300 
   6301     // Early exit if we found an unexpected opcode.
   6302     if (Opcode != ExpectedOpcode)
   6303       return SDValue();
   6304 
   6305     SDValue Op0 = Op.getOperand(0);
   6306     SDValue Op1 = Op.getOperand(1);
   6307 
   6308     // Try to match the following pattern:
   6309     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
   6310     // Early exit if we cannot match that sequence.
   6311     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   6312         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   6313         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
   6314         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
   6315         Op0.getOperand(1) != Op1.getOperand(1))
   6316       return SDValue();
   6317 
   6318     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   6319     if (I0 != i)
   6320       return SDValue();
   6321 
   6322     // We found a valid add/sub node. Update the information accordingly.
   6323     if (i & 1)
   6324       AddFound = true;
   6325     else
   6326       SubFound = true;
   6327 
   6328     // Update InVec0 and InVec1.
   6329     if (InVec0.isUndef()) {
   6330       InVec0 = Op0.getOperand(0);
   6331       if (InVec0.getSimpleValueType() != VT)
   6332         return SDValue();
   6333     }
   6334     if (InVec1.isUndef()) {
   6335       InVec1 = Op1.getOperand(0);
   6336       if (InVec1.getSimpleValueType() != VT)
   6337         return SDValue();
   6338     }
   6339 
    6340     // Make sure that the input operands of each add/sub node always
    6341     // come from the same pair of vectors.
   6342     if (InVec0 != Op0.getOperand(0)) {
   6343       if (ExpectedOpcode == ISD::FSUB)
   6344         return SDValue();
   6345 
   6346       // FADD is commutable. Try to commute the operands
   6347       // and then test again.
   6348       std::swap(Op0, Op1);
   6349       if (InVec0 != Op0.getOperand(0))
   6350         return SDValue();
   6351     }
   6352 
   6353     if (InVec1 != Op1.getOperand(0))
   6354       return SDValue();
   6355 
   6356     // Update the pair of expected opcodes.
   6357     std::swap(ExpectedOpcode, NextExpectedOpcode);
   6358   }
   6359 
   6360   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
   6361   if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
   6362     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
   6363 
   6364   return SDValue();
   6365 }
   6366 
   6367 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
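         /// (Recall that for v4f32, haddps A, B computes
         ///  <A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3]>.)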
   6368 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
   6369                                    const X86Subtarget &Subtarget,
   6370                                    SelectionDAG &DAG) {
   6371   MVT VT = BV->getSimpleValueType(0);
   6372   unsigned NumElts = VT.getVectorNumElements();
   6373   unsigned NumUndefsLO = 0;
   6374   unsigned NumUndefsHI = 0;
   6375   unsigned Half = NumElts/2;
   6376 
    6377   // Count the number of UNDEF operands in the input build_vector.
   6378   for (unsigned i = 0, e = Half; i != e; ++i)
   6379     if (BV->getOperand(i)->isUndef())
   6380       NumUndefsLO++;
   6381 
   6382   for (unsigned i = Half, e = NumElts; i != e; ++i)
   6383     if (BV->getOperand(i)->isUndef())
   6384       NumUndefsHI++;
   6385 
    6386   // Early exit if this is either a build_vector of all UNDEFs, or if all the
    6387   // operands but one are UNDEF.
   6388   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
   6389     return SDValue();
   6390 
   6391   SDLoc DL(BV);
   6392   SDValue InVec0, InVec1;
   6393   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
   6394     // Try to match an SSE3 float HADD/HSUB.
   6395     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   6396       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   6397 
   6398     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   6399       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   6400   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
   6401     // Try to match an SSSE3 integer HADD/HSUB.
   6402     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   6403       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
   6404 
   6405     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   6406       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
   6407   }
   6408 
   6409   if (!Subtarget.hasAVX())
   6410     return SDValue();
   6411 
   6412   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
   6413     // Try to match an AVX horizontal add/sub of packed single/double
   6414     // precision floating point values from 256-bit vectors.
   6415     SDValue InVec2, InVec3;
   6416     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
   6417         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
   6418         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
   6419         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
   6420       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   6421 
   6422     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
   6423         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
   6424         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
   6425         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
   6426       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   6427   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
   6428     // Try to match an AVX2 horizontal add/sub of signed integers.
   6429     SDValue InVec2, InVec3;
   6430     unsigned X86Opcode;
   6431     bool CanFold = true;
   6432 
   6433     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
   6434         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
   6435         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
   6436         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
   6437       X86Opcode = X86ISD::HADD;
   6438     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
   6439         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
   6440         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
   6441         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
   6442       X86Opcode = X86ISD::HSUB;
   6443     else
   6444       CanFold = false;
   6445 
   6446     if (CanFold) {
   6447       // Fold this build_vector into a single horizontal add/sub.
   6448       // Do this only if the target has AVX2.
   6449       if (Subtarget.hasAVX2())
   6450         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
   6451 
   6452       // Do not try to expand this build_vector into a pair of horizontal
   6453       // add/sub if we can emit a pair of scalar add/sub.
   6454       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   6455         return SDValue();
   6456 
    6457       // Convert this build_vector into a pair of horizontal binops followed by
    6458       // a concat vector.
   6459       bool isUndefLO = NumUndefsLO == Half;
   6460       bool isUndefHI = NumUndefsHI == Half;
   6461       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
   6462                                    isUndefLO, isUndefHI);
   6463     }
   6464   }
   6465 
   6466   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
   6467        VT == MVT::v16i16) && Subtarget.hasAVX()) {
   6468     unsigned X86Opcode;
   6469     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   6470       X86Opcode = X86ISD::HADD;
   6471     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   6472       X86Opcode = X86ISD::HSUB;
   6473     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   6474       X86Opcode = X86ISD::FHADD;
   6475     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   6476       X86Opcode = X86ISD::FHSUB;
   6477     else
   6478       return SDValue();
   6479 
   6480     // Don't try to expand this build_vector into a pair of horizontal add/sub
   6481     // if we can simply emit a pair of scalar add/sub.
   6482     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   6483       return SDValue();
   6484 
    6485     // Convert this build_vector into two horizontal add/sub nodes followed by
    6486     // a concat vector.
   6487     bool isUndefLO = NumUndefsLO == Half;
   6488     bool isUndefHI = NumUndefsHI == Half;
   6489     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
   6490                                  isUndefLO, isUndefHI);
   6491   }
   6492 
   6493   return SDValue();
   6494 }
   6495 
   6496 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
    6497 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
    6498 /// just apply the bit operation to the vectors.
    6499 /// NOTE: It's not in our interest to make a general purpose vectorizer
   6500 /// from this, but enough scalar bit operations are created from the later
   6501 /// legalization + scalarization stages to need basic support.
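         ///
         /// For example (an illustrative sketch; a..d denote arbitrary i32 scalars):
         ///   (v4i32 (build_vector (and a, 1), (and b, 2), (and c, 4), (and d, 8)))
         /// becomes
         ///   (v4i32 (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8)))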
   6502 static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
   6503   SDLoc DL(Op);
   6504   MVT VT = Op.getSimpleValueType();
   6505   unsigned NumElems = VT.getVectorNumElements();
   6506   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   6507 
   6508   // Check that all elements have the same opcode.
   6509   // TODO: Should we allow UNDEFS and if so how many?
   6510   unsigned Opcode = Op.getOperand(0).getOpcode();
   6511   for (unsigned i = 1; i < NumElems; ++i)
   6512     if (Opcode != Op.getOperand(i).getOpcode())
   6513       return SDValue();
   6514 
   6515   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
   6516   switch (Opcode) {
   6517   default:
   6518     return SDValue();
   6519   case ISD::AND:
   6520   case ISD::XOR:
   6521   case ISD::OR:
   6522     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
   6523       return SDValue();
   6524     break;
   6525   }
   6526 
   6527   SmallVector<SDValue, 4> LHSElts, RHSElts;
   6528   for (SDValue Elt : Op->ops()) {
   6529     SDValue LHS = Elt.getOperand(0);
   6530     SDValue RHS = Elt.getOperand(1);
   6531 
   6532     // We expect the canonicalized RHS operand to be the constant.
   6533     if (!isa<ConstantSDNode>(RHS))
   6534       return SDValue();
   6535     LHSElts.push_back(LHS);
   6536     RHSElts.push_back(RHS);
   6537   }
   6538 
   6539   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
   6540   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
   6541   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
   6542 }
   6543 
   6544 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
   6545 /// functionality to do this, so it's all zeros, all ones, or some derivation
   6546 /// that is cheap to calculate.
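         /// For example, an all-ones v4i32 is materialized with a pcmpeqd of a
         /// register against itself rather than with a constant-pool load.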
   6547 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
   6548                                          const X86Subtarget &Subtarget) {
   6549   SDLoc DL(Op);
   6550   MVT VT = Op.getSimpleValueType();
   6551 
   6552   // Vectors containing all zeros can be matched by pxor and xorps.
   6553   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   6554     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
   6555     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
   6556     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
   6557       return Op;
   6558 
   6559     return getZeroVector(VT, Subtarget, DAG, DL);
   6560   }
   6561 
   6562   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   6563   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   6564   // vpcmpeqd on 256-bit vectors.
   6565   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
   6566     if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
   6567         (VT == MVT::v8i32 && Subtarget.hasInt256()))
   6568       return Op;
   6569 
   6570     return getOnesVector(VT, Subtarget, DAG, DL);
   6571   }
   6572 
   6573   return SDValue();
   6574 }
   6575 
   6576 SDValue
   6577 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   6578   SDLoc dl(Op);
   6579 
   6580   MVT VT = Op.getSimpleValueType();
   6581   MVT ExtVT = VT.getVectorElementType();
   6582   unsigned NumElems = Op.getNumOperands();
   6583 
   6584   // Generate vectors for predicate vectors.
   6585   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
   6586     return LowerBUILD_VECTORvXi1(Op, DAG);
   6587 
   6588   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
   6589     return VectorConstant;
   6590 
   6591   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
   6592   if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
   6593     return AddSub;
   6594   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
   6595     return HorizontalOp;
   6596   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
   6597     return Broadcast;
   6598   if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
   6599     return BitOp;
   6600 
   6601   unsigned EVTBits = ExtVT.getSizeInBits();
   6602 
   6603   unsigned NumZero  = 0;
   6604   unsigned NumNonZero = 0;
   6605   uint64_t NonZeros = 0;
   6606   bool IsAllConstants = true;
   6607   SmallSet<SDValue, 8> Values;
   6608   for (unsigned i = 0; i < NumElems; ++i) {
   6609     SDValue Elt = Op.getOperand(i);
   6610     if (Elt.isUndef())
   6611       continue;
   6612     Values.insert(Elt);
   6613     if (Elt.getOpcode() != ISD::Constant &&
   6614         Elt.getOpcode() != ISD::ConstantFP)
   6615       IsAllConstants = false;
   6616     if (X86::isZeroNode(Elt))
   6617       NumZero++;
   6618     else {
   6619       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
   6620       NonZeros |= ((uint64_t)1 << i);
   6621       NumNonZero++;
   6622     }
   6623   }
   6624 
    6625   // All-undef vector. Return an UNDEF. All-zero vectors were handled above.
   6626   if (NumNonZero == 0)
   6627     return DAG.getUNDEF(VT);
   6628 
   6629   // Special case for single non-zero, non-undef, element.
   6630   if (NumNonZero == 1) {
   6631     unsigned Idx = countTrailingZeros(NonZeros);
   6632     SDValue Item = Op.getOperand(Idx);
   6633 
   6634     // If this is an insertion of an i64 value on x86-32, and if the top bits of
   6635     // the value are obviously zero, truncate the value to i32 and do the
   6636     // insertion that way.  Only do this if the value is non-constant or if the
   6637     // value is a constant being inserted into element 0.  It is cheaper to do
   6638     // a constant pool load than it is to do a movd + shuffle.
   6639     if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
   6640         (!IsAllConstants || Idx == 0)) {
   6641       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
   6642         // Handle SSE only.
   6643         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
   6644         MVT VecVT = MVT::v4i32;
   6645 
   6646         // Truncate the value (which may itself be a constant) to i32, and
   6647         // convert it to a vector with movd (S2V+shuffle to zero extend).
   6648         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
   6649         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
   6650         return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
   6651                                       Item, Idx * 2, true, Subtarget, DAG));
   6652       }
   6653     }
   6654 
   6655     // If we have a constant or non-constant insertion into the low element of
   6656     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
   6657     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
   6658     // depending on what the source datatype is.
   6659     if (Idx == 0) {
   6660       if (NumZero == 0)
   6661         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6662 
   6663       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
   6664           (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
   6665         if (VT.is512BitVector()) {
   6666           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
   6667           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
   6668                              Item, DAG.getIntPtrConstant(0, dl));
   6669         }
   6670         assert((VT.is128BitVector() || VT.is256BitVector()) &&
   6671                "Expected an SSE value type!");
   6672         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6673         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
   6674         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6675       }
   6676 
   6677       // We can't directly insert an i8 or i16 into a vector, so zero extend
   6678       // it to i32 first.
   6679       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
   6680         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
   6681         if (VT.getSizeInBits() >= 256) {
   6682           MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
   6683           if (Subtarget.hasAVX()) {
   6684             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
   6685             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6686           } else {
   6687             // Without AVX, we need to extend to a 128-bit vector and then
   6688             // insert into the 256-bit vector.
   6689             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   6690             SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
   6691             Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
   6692           }
   6693         } else {
   6694           assert(VT.is128BitVector() && "Expected an SSE value type!");
   6695           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   6696           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6697         }
   6698         return DAG.getBitcast(VT, Item);
   6699       }
   6700     }
   6701 
   6702     // Is it a vector logical left shift?
   6703     if (NumElems == 2 && Idx == 1 &&
   6704         X86::isZeroNode(Op.getOperand(0)) &&
   6705         !X86::isZeroNode(Op.getOperand(1))) {
   6706       unsigned NumBits = VT.getSizeInBits();
   6707       return getVShift(true, VT,
   6708                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   6709                                    VT, Op.getOperand(1)),
   6710                        NumBits/2, DAG, *this, dl);
   6711     }
   6712 
   6713     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
   6714       return SDValue();
   6715 
   6716     // Otherwise, if this is a vector with i32 or f32 elements, and the element
   6717     // is a non-constant being inserted into an element other than the low one,
   6718     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
   6719     // movd/movss) to move this into the low element, then shuffle it into
   6720     // place.
   6721     if (EVTBits == 32) {
   6722       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6723       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
   6724     }
   6725   }
   6726 
    6727   // A splat is obviously ok. Let the legalizer expand it to a shuffle.
   6728   if (Values.size() == 1) {
   6729     if (EVTBits == 32) {
   6730       // Instead of a shuffle like this:
   6731       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
   6732       // Check if it's possible to issue this instead.
    6733       // shuffle (vload ptr), undef, <1, 1, 1, 1>
   6734       unsigned Idx = countTrailingZeros(NonZeros);
   6735       SDValue Item = Op.getOperand(Idx);
   6736       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
   6737         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
   6738     }
   6739     return SDValue();
   6740   }
   6741 
   6742   // A vector full of immediates; various special cases are already
   6743   // handled, so this is best done with a single constant-pool load.
   6744   if (IsAllConstants)
   6745     return SDValue();
   6746 
   6747   // See if we can use a vector load to get all of the elements.
   6748   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
   6749     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
   6750     if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
   6751       return LD;
   6752   }
   6753 
   6754   // For AVX-length vectors, build the individual 128-bit pieces and use
   6755   // shuffles to put them in place.
   6756   if (VT.is256BitVector() || VT.is512BitVector()) {
   6757     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
   6758 
   6759     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
   6760 
   6761     // Build both the lower and upper subvector.
   6762     SDValue Lower =
   6763         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
   6764     SDValue Upper = DAG.getBuildVector(
   6765         HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
   6766 
   6767     // Recreate the wider vector with the lower and upper part.
   6768     if (VT.is256BitVector())
   6769       return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   6770     return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   6771   }
   6772 
   6773   // Let legalizer expand 2-wide build_vectors.
   6774   if (EVTBits == 64) {
   6775     if (NumNonZero == 1) {
   6776       // One half is zero or undef.
   6777       unsigned Idx = countTrailingZeros(NonZeros);
   6778       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
   6779                                Op.getOperand(Idx));
   6780       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
   6781     }
   6782     return SDValue();
   6783   }
   6784 
   6785   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   6786   if (EVTBits == 8 && NumElems == 16)
   6787     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
   6788                                           DAG, Subtarget, *this))
   6789       return V;
   6790 
   6791   if (EVTBits == 16 && NumElems == 8)
   6792     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
   6793                                           DAG, Subtarget, *this))
   6794       return V;
   6795 
    6796   // If element VT is == 32 bits and the vector has 4 elements, try an INSERTPS.
   6797   if (EVTBits == 32 && NumElems == 4)
   6798     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
   6799       return V;
   6800 
   6801   // If element VT is == 32 bits, turn it into a number of shuffles.
   6802   if (NumElems == 4 && NumZero > 0) {
   6803     SmallVector<SDValue, 8> Ops(NumElems);
   6804     for (unsigned i = 0; i < 4; ++i) {
   6805       bool isZero = !(NonZeros & (1ULL << i));
   6806       if (isZero)
   6807         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
   6808       else
   6809         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   6810     }
   6811 
   6812     for (unsigned i = 0; i < 2; ++i) {
   6813       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
   6814         default: break;
   6815         case 0:
   6816           Ops[i] = Ops[i*2];  // Must be a zero vector.
   6817           break;
   6818         case 1:
   6819           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
   6820           break;
   6821         case 2:
   6822           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
   6823           break;
   6824         case 3:
   6825           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
   6826           break;
   6827       }
   6828     }
   6829 
   6830     bool Reverse1 = (NonZeros & 0x3) == 2;
   6831     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
   6832     int MaskVec[] = {
   6833       Reverse1 ? 1 : 0,
   6834       Reverse1 ? 0 : 1,
   6835       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
   6836       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
   6837     };
   6838     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
   6839   }
   6840 
   6841   if (Values.size() > 1 && VT.is128BitVector()) {
   6842     // Check for a build vector from mostly shuffle plus few inserting.
   6843     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
   6844       return Sh;
   6845 
   6846     // For SSE 4.1, use insertps to put the high elements into the low element.
   6847     if (Subtarget.hasSSE41()) {
   6848       SDValue Result;
   6849       if (!Op.getOperand(0).isUndef())
   6850         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
   6851       else
   6852         Result = DAG.getUNDEF(VT);
   6853 
   6854       for (unsigned i = 1; i < NumElems; ++i) {
   6855         if (Op.getOperand(i).isUndef()) continue;
   6856         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
   6857                              Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
   6858       }
   6859       return Result;
   6860     }
   6861 
    6862     // Otherwise, expand into a number of unpckl*; start by extending each of
   6863     // our (non-undef) elements to the full vector width with the element in the
   6864     // bottom slot of the vector (which generates no code for SSE).
   6865     SmallVector<SDValue, 8> Ops(NumElems);
   6866     for (unsigned i = 0; i < NumElems; ++i) {
   6867       if (!Op.getOperand(i).isUndef())
   6868         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   6869       else
   6870         Ops[i] = DAG.getUNDEF(VT);
   6871     }
   6872 
   6873     // Next, we iteratively mix elements, e.g. for v4f32:
   6874     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
   6875     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
   6876     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
   6877     unsigned EltStride = NumElems >> 1;
   6878     while (EltStride != 0) {
   6879       for (unsigned i = 0; i < EltStride; ++i) {
   6880         // If Ops[i+EltStride] is undef and this is the first round of mixing,
   6881         // then it is safe to just drop this shuffle: V[i] is already in the
   6882         // right place, the one element (since it's the first round) being
   6883         // inserted as undef can be dropped.  This isn't safe for successive
   6884         // rounds because they will permute elements within both vectors.
   6885         if (Ops[i+EltStride].isUndef() &&
   6886             EltStride == NumElems/2)
   6887           continue;
   6888 
   6889         Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
   6890       }
   6891       EltStride >>= 1;
   6892     }
   6893     return Ops[0];
   6894   }
   6895   return SDValue();
   6896 }
   6897 
   6898 // 256-bit AVX can use the vinsertf128 instruction
   6899 // to create 256-bit vectors from two other 128-bit ones.
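         // For example (an illustrative sketch), a v8f32 concat of two v4f32 operands
         // is built by placing the first operand in the lower 128 bits and using
         // vinsertf128 to insert the second operand into the upper 128 bits.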
   6900 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   6901   SDLoc dl(Op);
   6902   MVT ResVT = Op.getSimpleValueType();
   6903 
   6904   assert((ResVT.is256BitVector() ||
   6905           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
   6906 
   6907   SDValue V1 = Op.getOperand(0);
   6908   SDValue V2 = Op.getOperand(1);
   6909   unsigned NumElems = ResVT.getVectorNumElements();
   6910   if (ResVT.is256BitVector())
   6911     return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   6912 
   6913   if (Op.getNumOperands() == 4) {
   6914     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
   6915                                   ResVT.getVectorNumElements()/2);
   6916     SDValue V3 = Op.getOperand(2);
   6917     SDValue V4 = Op.getOperand(3);
   6918     return concat256BitVectors(
   6919         concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
   6920         concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
   6921         NumElems, DAG, dl);
   6922   }
   6923   return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   6924 }
   6925 
   6926 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   6927                                        const X86Subtarget &Subtarget,
   6928                                        SelectionDAG & DAG) {
   6929   SDLoc dl(Op);
   6930   MVT ResVT = Op.getSimpleValueType();
   6931   unsigned NumOfOperands = Op.getNumOperands();
   6932 
   6933   assert(isPowerOf2_32(NumOfOperands) &&
   6934          "Unexpected number of operands in CONCAT_VECTORS");
   6935 
   6936   SDValue Undef = DAG.getUNDEF(ResVT);
   6937   if (NumOfOperands > 2) {
   6938     // Specialize the cases when all, or all but one, of the operands are undef.
   6939     unsigned NumOfDefinedOps = 0;
   6940     unsigned OpIdx = 0;
   6941     for (unsigned i = 0; i < NumOfOperands; i++)
   6942       if (!Op.getOperand(i).isUndef()) {
   6943         NumOfDefinedOps++;
   6944         OpIdx = i;
   6945       }
   6946     if (NumOfDefinedOps == 0)
   6947       return Undef;
   6948     if (NumOfDefinedOps == 1) {
   6949       unsigned SubVecNumElts =
   6950         Op.getOperand(OpIdx).getValueType().getVectorNumElements();
   6951       SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
   6952       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
   6953                          Op.getOperand(OpIdx), IdxVal);
   6954     }
   6955 
   6956     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
   6957                                   ResVT.getVectorNumElements()/2);
   6958     SmallVector<SDValue, 2> Ops;
   6959     for (unsigned i = 0; i < NumOfOperands/2; i++)
   6960       Ops.push_back(Op.getOperand(i));
   6961     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
   6962     Ops.clear();
   6963     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
   6964       Ops.push_back(Op.getOperand(i));
   6965     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
   6966     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
   6967   }
   6968 
   6969   // 2 operands
   6970   SDValue V1 = Op.getOperand(0);
   6971   SDValue V2 = Op.getOperand(1);
   6972   unsigned NumElems = ResVT.getVectorNumElements();
   6973   assert(V1.getValueType() == V2.getValueType() &&
   6974          V1.getValueType().getVectorNumElements() == NumElems/2 &&
   6975          "Unexpected operands in CONCAT_VECTORS");
   6976 
   6977   if (ResVT.getSizeInBits() >= 16)
   6978     return Op; // The operation is legal with KUNPCK
   6979 
   6980   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
   6981   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
   6982   SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
   6983   if (IsZeroV1 && IsZeroV2)
   6984     return ZeroVec;
   6985 
   6986   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
   6987   if (V2.isUndef())
   6988     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
   6989   if (IsZeroV2)
   6990     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
   6991 
   6992   SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
   6993   if (V1.isUndef())
   6994     V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
   6995 
   6996   if (IsZeroV1)
   6997     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
   6998 
   6999   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
   7000   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
   7001 }
   7002 
   7003 static SDValue LowerCONCAT_VECTORS(SDValue Op,
   7004                                    const X86Subtarget &Subtarget,
   7005                                    SelectionDAG &DAG) {
   7006   MVT VT = Op.getSimpleValueType();
   7007   if (VT.getVectorElementType() == MVT::i1)
   7008     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
   7009 
   7010   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
   7011          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
   7012           Op.getNumOperands() == 4)));
   7013 
   7014   // AVX can use the vinsertf128 instruction to create 256-bit vectors
   7015   // from two other 128-bit ones.
   7016 
    7017   // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
   7018   return LowerAVXCONCAT_VECTORS(Op, DAG);
   7019 }
   7020 
   7021 //===----------------------------------------------------------------------===//
   7022 // Vector shuffle lowering
   7023 //
   7024 // This is an experimental code path for lowering vector shuffles on x86. It is
   7025 // designed to handle arbitrary vector shuffles and blends, gracefully
   7026 // degrading performance as necessary. It works hard to recognize idiomatic
   7027 // shuffles and lower them to optimal instruction patterns without leaving
   7028 // a framework that allows reasonably efficient handling of all vector shuffle
   7029 // patterns.
   7030 //===----------------------------------------------------------------------===//
   7031 
   7032 /// \brief Tiny helper function to identify a no-op mask.
   7033 ///
   7034 /// This is a somewhat boring predicate function. It checks whether the mask
   7035 /// array input, which is assumed to be a single-input shuffle mask of the kind
   7036 /// used by the X86 shuffle instructions (not a fully general
    7037 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
   7038 /// in-place shuffle are 'no-op's.
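         ///
         /// For example, the masks <0, 1, 2, 3> and <0, -1, 2, -1> (-1 meaning undef)
         /// are both no-ops, while <1, 0, 2, 3> is not.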
   7039 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
   7040   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7041     assert(Mask[i] >= -1 && "Out of bound mask element!");
   7042     if (Mask[i] >= 0 && Mask[i] != i)
   7043       return false;
   7044   }
   7045   return true;
   7046 }
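         // Worked example (hand-derived sketch, illustrative only): for a
         // 4-element mask, {0, -1, 2, 3} is a no-op (undef entries are ignored
         // and every defined entry stays in place), while {1, 0, 2, 3} is not,
         // since element 0 pulls from index 1.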
   7047 
   7048 /// \brief Test whether there are elements crossing 128-bit lanes in this
   7049 /// shuffle mask.
   7050 ///
   7051 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
   7052 /// and we routinely test for these.
   7053 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
   7054   int LaneSize = 128 / VT.getScalarSizeInBits();
   7055   int Size = Mask.size();
   7056   for (int i = 0; i < Size; ++i)
   7057     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
   7058       return true;
   7059   return false;
   7060 }
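         // Worked example (hand-derived sketch, illustrative only): for v8i32,
         // LaneSize is 4, so {0, 1, 2, 3, 4, 5, 6, 7} never crosses a 128-bit
         // lane, while {4, 1, 2, 3, 0, 5, 6, 7} does, because element 0 reads
         // from the second lane (index 4) and element 4 reads from the first.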
   7061 
   7062 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
   7063 ///
   7064 /// This checks a shuffle mask to see if it is performing the same
   7065 /// lane-relative shuffle in each sub-lane. This trivially implies
   7066 /// that it is also not lane-crossing. It may however involve a blend from the
   7067 /// same lane of a second vector.
   7068 ///
   7069 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
   7070 /// non-trivial to compute in the face of undef lanes. The representation is
   7071 /// suitable for use with existing 128-bit shuffles as entries from the second
   7072 /// vector have been remapped to [LaneSize, 2*LaneSize).
   7073 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
   7074                                   ArrayRef<int> Mask,
   7075                                   SmallVectorImpl<int> &RepeatedMask) {
   7076   int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
   7077   RepeatedMask.assign(LaneSize, -1);
   7078   int Size = Mask.size();
   7079   for (int i = 0; i < Size; ++i) {
   7080     if (Mask[i] < 0)
   7081       continue;
   7082     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
   7083       // This entry crosses lanes, so there is no way to model this shuffle.
   7084       return false;
   7085 
   7086     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
   7087     // Adjust second vector indices to start at LaneSize instead of Size.
   7088     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
   7089                                 : Mask[i] % LaneSize + LaneSize;
   7090     if (RepeatedMask[i % LaneSize] < 0)
   7091       // This is the first non-undef entry in this slot of a 128-bit lane.
   7092       RepeatedMask[i % LaneSize] = LocalM;
   7093     else if (RepeatedMask[i % LaneSize] != LocalM)
   7094       // Found a mismatch with the repeated mask.
   7095       return false;
   7096   }
   7097   return true;
   7098 }
   7099 
   7100 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
   7101 static bool
   7102 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   7103                                 SmallVectorImpl<int> &RepeatedMask) {
   7104   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
   7105 }
   7106 
   7107 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
   7108 static bool
   7109 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   7110                                 SmallVectorImpl<int> &RepeatedMask) {
   7111   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
   7112 }
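         // Worked example (hand-derived sketch, illustrative only): the v8i32
         // mask {0, 9, 2, 11, 4, 13, 6, 15} repeats the same pattern in both
         // 128-bit lanes, so is128BitLaneRepeatedShuffleMask succeeds with
         // RepeatedMask = {0, 5, 2, 7}, the second-vector entries having been
         // remapped into [LaneSize, 2*LaneSize).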
   7113 
   7114 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
   7115                              SmallVectorImpl<int> &ScaledMask) {
   7116   assert(0 < Scale && "Unexpected scaling factor");
   7117   int NumElts = Mask.size();
   7118   ScaledMask.assign(NumElts * Scale, -1);
   7119 
   7120   for (int i = 0; i != NumElts; ++i) {
   7121     int M = Mask[i];
   7122 
   7123     // Repeat sentinel values in every mask element.
   7124     if (M < 0) {
   7125       for (int s = 0; s != Scale; ++s)
   7126         ScaledMask[(Scale * i) + s] = M;
   7127       continue;
   7128     }
   7129 
    7130     // Expand each mask element into Scale consecutive scaled indices.
   7131     for (int s = 0; s != Scale; ++s)
   7132       ScaledMask[(Scale * i) + s] = (Scale * M) + s;
   7133   }
   7134 }
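         // Worked example (hand-derived sketch, illustrative only): scaling the
         // two-element mask {1, -1} by Scale = 2 yields {2, 3, -1, -1}; each
         // defined element M expands to the Scale consecutive indices starting
         // at Scale * M, and sentinels are simply repeated.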
   7135 
   7136 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
   7137 /// arguments.
   7138 ///
   7139 /// This is a fast way to test a shuffle mask against a fixed pattern:
   7140 ///
    7141 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
   7142 ///
    7143 /// It returns true if the mask is exactly as wide as the expected mask, and
    7144 /// each element of the mask is either -1 (signifying undef) or matches the
    7145 /// corresponding element of the expected mask.
   7146 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
   7147                                 ArrayRef<int> ExpectedMask) {
   7148   if (Mask.size() != ExpectedMask.size())
   7149     return false;
   7150 
   7151   int Size = Mask.size();
   7152 
   7153   // If the values are build vectors, we can look through them to find
   7154   // equivalent inputs that make the shuffles equivalent.
   7155   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
   7156   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
   7157 
   7158   for (int i = 0; i < Size; ++i) {
   7159     assert(Mask[i] >= -1 && "Out of bound mask element!");
   7160     if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
   7161       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
   7162       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
   7163       if (!MaskBV || !ExpectedBV ||
   7164           MaskBV->getOperand(Mask[i] % Size) !=
   7165               ExpectedBV->getOperand(ExpectedMask[i] % Size))
   7166         return false;
   7167     }
    7168   }
   7169 
   7170   return true;
   7171 }
   7172 
   7173 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
   7174 ///
   7175 /// The masks must be exactly the same width.
   7176 ///
   7177 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
   7178 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
   7179 ///
    7180 /// SM_SentinelZero is accepted as a valid negative index but must match in both masks.
   7181 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
   7182                                       ArrayRef<int> ExpectedMask) {
   7183   int Size = Mask.size();
   7184   if (Size != (int)ExpectedMask.size())
   7185     return false;
   7186 
   7187   for (int i = 0; i < Size; ++i)
   7188     if (Mask[i] == SM_SentinelUndef)
   7189       continue;
   7190     else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
   7191       return false;
   7192     else if (Mask[i] != ExpectedMask[i])
   7193       return false;
   7194 
   7195   return true;
   7196 }
   7197 
   7198 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
   7199 ///
   7200 /// This helper function produces an 8-bit shuffle immediate corresponding to
   7201 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
   7202 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
   7203 /// example.
   7204 ///
   7205 /// NB: We rely heavily on "undef" masks preserving the input lane.
   7206 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
   7207   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
   7208   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
   7209   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
   7210   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
   7211   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
   7212 
   7213   unsigned Imm = 0;
   7214   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
   7215   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
   7216   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
   7217   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
   7218   return Imm;
   7219 }
   7220 
   7221 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
   7222                                           SelectionDAG &DAG) {
   7223   return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
   7224 }
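         // Worked example (hand-derived sketch, illustrative only): the reversal
         // mask {3, 2, 1, 0} encodes as 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B,
         // the familiar immediate for a pshufd/shufps that reverses all four
         // lanes.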
   7225 
   7226 /// \brief Compute whether each element of a shuffle is zeroable.
   7227 ///
   7228 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
   7229 /// Either it is an undef element in the shuffle mask, the element of the input
   7230 /// referenced is undef, or the element of the input referenced is known to be
   7231 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
   7232 /// as many lanes with this technique as possible to simplify the remaining
   7233 /// shuffle.
   7234 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   7235                                                      SDValue V1, SDValue V2) {
   7236   SmallBitVector Zeroable(Mask.size(), false);
   7237   V1 = peekThroughBitcasts(V1);
   7238   V2 = peekThroughBitcasts(V2);
   7239 
   7240   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
   7241   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
   7242 
   7243   int VectorSizeInBits = V1.getValueType().getSizeInBits();
   7244   int ScalarSizeInBits = VectorSizeInBits / Mask.size();
   7245   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
   7246 
   7247   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7248     int M = Mask[i];
   7249     // Handle the easy cases.
   7250     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
   7251       Zeroable[i] = true;
   7252       continue;
   7253     }
   7254 
   7255     // Determine shuffle input and normalize the mask.
   7256     SDValue V = M < Size ? V1 : V2;
   7257     M %= Size;
   7258 
   7259     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
   7260     if (V.getOpcode() != ISD::BUILD_VECTOR)
   7261       continue;
   7262 
    7263     // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
    7264     // portion of the (larger) source element must be UNDEF/ZERO.
   7265     if ((Size % V.getNumOperands()) == 0) {
   7266       int Scale = Size / V->getNumOperands();
   7267       SDValue Op = V.getOperand(M / Scale);
   7268       if (Op.isUndef() || X86::isZeroNode(Op))
   7269         Zeroable[i] = true;
   7270       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
   7271         APInt Val = Cst->getAPIntValue();
   7272         Val = Val.lshr((M % Scale) * ScalarSizeInBits);
   7273         Val = Val.getLoBits(ScalarSizeInBits);
   7274         Zeroable[i] = (Val == 0);
   7275       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
   7276         APInt Val = Cst->getValueAPF().bitcastToAPInt();
   7277         Val = Val.lshr((M % Scale) * ScalarSizeInBits);
   7278         Val = Val.getLoBits(ScalarSizeInBits);
   7279         Zeroable[i] = (Val == 0);
   7280       }
   7281       continue;
   7282     }
   7283 
    7284     // If the BUILD_VECTOR has more elements than the mask, then all the
    7285     // (smaller) source elements must be UNDEF or ZERO.
   7286     if ((V.getNumOperands() % Size) == 0) {
   7287       int Scale = V->getNumOperands() / Size;
   7288       bool AllZeroable = true;
   7289       for (int j = 0; j < Scale; ++j) {
   7290         SDValue Op = V.getOperand((M * Scale) + j);
   7291         AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
   7292       }
   7293       Zeroable[i] = AllZeroable;
   7294       continue;
   7295     }
   7296   }
   7297 
   7298   return Zeroable;
   7299 }
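         // Worked example (hand-derived sketch, illustrative only): for a v4i32
         // shuffle with Mask = {0, 5, 1, 7} where V2 is a build_vector of zeros,
         // elements 1 and 3 reference V2 and are marked zeroable; undef mask
         // entries (-1) are always marked zeroable as well.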
   7300 
   7301 /// Try to lower a shuffle with a single PSHUFB of V1.
   7302 /// This is only possible if V2 is unused (at all, or only for zero elements).
   7303 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
   7304                                             ArrayRef<int> Mask, SDValue V1,
   7305                                             SDValue V2,
   7306                                             const X86Subtarget &Subtarget,
   7307                                             SelectionDAG &DAG) {
   7308   int Size = Mask.size();
   7309   int LaneSize = 128 / VT.getScalarSizeInBits();
   7310   const int NumBytes = VT.getSizeInBits() / 8;
   7311   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
   7312 
   7313   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
   7314          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
   7315          (Subtarget.hasBWI() && VT.is512BitVector()));
   7316 
   7317   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7318 
   7319   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
   7320   // Sign bit set in i8 mask means zero element.
   7321   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
   7322 
   7323   for (int i = 0; i < NumBytes; ++i) {
   7324     int M = Mask[i / NumEltBytes];
   7325     if (M < 0) {
   7326       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
   7327       continue;
   7328     }
   7329     if (Zeroable[i / NumEltBytes]) {
   7330       PSHUFBMask[i] = ZeroMask;
   7331       continue;
   7332     }
   7333     // Only allow V1.
   7334     if (M >= Size)
   7335       return SDValue();
   7336 
   7337     // PSHUFB can't cross lanes, ensure this doesn't happen.
   7338     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
   7339       return SDValue();
   7340 
   7341     M = M % LaneSize;
   7342     M = M * NumEltBytes + (i % NumEltBytes);
   7343     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
   7344   }
   7345 
   7346   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
   7347   return DAG.getBitcast(
   7348       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
   7349                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
   7350 }
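         // Worked example (hand-derived sketch, illustrative only): the
         // single-input v4i32 shuffle {1, 0, 3, 2} lowers here to a PSHUFB whose
         // byte control vector is {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
         // a zeroable element would instead contribute four 0x80 bytes.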
   7351 
   7352 // X86 has dedicated unpack instructions that can handle specific blend
   7353 // operations: UNPCKH and UNPCKL.
   7354 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
   7355                                            ArrayRef<int> Mask, SDValue V1,
   7356                                            SDValue V2, SelectionDAG &DAG) {
   7357   int NumElts = VT.getVectorNumElements();
   7358   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
   7359   SmallVector<int, 8> Unpckl(NumElts);
   7360   SmallVector<int, 8> Unpckh(NumElts);
   7361 
   7362   for (int i = 0; i < NumElts; ++i) {
   7363     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
   7364     int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
   7365     int HiPos = LoPos + NumEltsInLane / 2;
   7366     Unpckl[i] = LoPos;
   7367     Unpckh[i] = HiPos;
   7368   }
   7369 
   7370   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
   7371     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
   7372   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
   7373     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
   7374 
   7375   // Commute and try again.
   7376   ShuffleVectorSDNode::commuteMask(Unpckl);
   7377   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
   7378     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
   7379 
   7380   ShuffleVectorSDNode::commuteMask(Unpckh);
   7381   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
   7382     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
   7383 
   7384   return SDValue();
   7385 }
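         // Worked example (hand-derived sketch, illustrative only): for v4i32 the
         // loop above builds Unpckl = {0, 4, 1, 5} and Unpckh = {2, 6, 3, 7}, the
         // element interleavings performed by UNPCKLDQ and UNPCKHDQ; the commuted
         // variants catch masks such as {4, 0, 5, 1}.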
   7386 
   7387 /// \brief Try to emit a bitmask instruction for a shuffle.
   7388 ///
   7389 /// This handles cases where we can model a blend exactly as a bitmask due to
   7390 /// one of the inputs being zeroable.
   7391 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
   7392                                            SDValue V2, ArrayRef<int> Mask,
   7393                                            SelectionDAG &DAG) {
   7394   MVT EltVT = VT.getVectorElementType();
   7395   int NumEltBits = EltVT.getSizeInBits();
   7396   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
   7397   SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
   7398   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
   7399                                     IntEltVT);
   7400   if (EltVT.isFloatingPoint()) {
   7401     Zero = DAG.getBitcast(EltVT, Zero);
   7402     AllOnes = DAG.getBitcast(EltVT, AllOnes);
   7403   }
   7404   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
   7405   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7406   SDValue V;
   7407   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7408     if (Zeroable[i])
   7409       continue;
   7410     if (Mask[i] % Size != i)
   7411       return SDValue(); // Not a blend.
   7412     if (!V)
   7413       V = Mask[i] < Size ? V1 : V2;
   7414     else if (V != (Mask[i] < Size ? V1 : V2))
   7415       return SDValue(); // Can only let one input through the mask.
   7416 
   7417     VMaskOps[i] = AllOnes;
   7418   }
   7419   if (!V)
   7420     return SDValue(); // No non-zeroable elements!
   7421 
   7422   SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
   7423   V = DAG.getNode(VT.isFloatingPoint()
   7424                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
   7425                   DL, VT, V, VMask);
   7426   return V;
   7427 }
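         // Worked example (hand-derived sketch, illustrative only): with V2 known
         // to be zero, the v4i32 mask {0, 5, 2, 7} becomes a single AND of V1
         // with the constant vector {-1, 0, -1, 0}; for floating-point element
         // types the same pattern is emitted as X86ISD::FAND.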
   7428 
   7429 /// \brief Try to emit a blend instruction for a shuffle using bit math.
   7430 ///
   7431 /// This is used as a fallback approach when first class blend instructions are
   7432 /// unavailable. Currently it is only suitable for integer vectors, but could
   7433 /// be generalized for floating point vectors if desirable.
   7434 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
   7435                                             SDValue V2, ArrayRef<int> Mask,
   7436                                             SelectionDAG &DAG) {
   7437   assert(VT.isInteger() && "Only supports integer vector types!");
   7438   MVT EltVT = VT.getVectorElementType();
   7439   int NumEltBits = EltVT.getSizeInBits();
   7440   SDValue Zero = DAG.getConstant(0, DL, EltVT);
   7441   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
   7442                                     EltVT);
   7443   SmallVector<SDValue, 16> MaskOps;
   7444   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7445     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
   7446       return SDValue(); // Shuffled input!
   7447     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
   7448   }
   7449 
   7450   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
   7451   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
   7452   // We have to cast V2 around.
   7453   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
   7454   V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
   7455                                       DAG.getBitcast(MaskVT, V1Mask),
   7456                                       DAG.getBitcast(MaskVT, V2)));
   7457   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
   7458 }
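         // Worked example (hand-derived sketch, illustrative only): for the v4i32
         // blend mask {0, 5, 2, 7} this builds V1Mask = {-1, 0, -1, 0} and
         // computes (V1 & V1Mask) | (~V1Mask & V2), with the second half going
         // through X86ISD::ANDNP on a 64-bit element view of the vectors.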
   7459 
   7460 /// \brief Try to emit a blend instruction for a shuffle.
   7461 ///
   7462 /// This doesn't do any checks for the availability of instructions for blending
   7463 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
   7464 /// be matched in the backend with the type given. What it does check for is
   7465 /// that the shuffle mask is a blend, or convertible into a blend with zero.
   7466 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
   7467                                          SDValue V2, ArrayRef<int> Original,
   7468                                          const X86Subtarget &Subtarget,
   7469                                          SelectionDAG &DAG) {
   7470   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
   7471   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
   7472   SmallVector<int, 8> Mask(Original.begin(), Original.end());
   7473   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7474   bool ForceV1Zero = false, ForceV2Zero = false;
   7475 
   7476   // Attempt to generate the binary blend mask. If an input is zero then
   7477   // we can use any lane.
   7478   // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
   7479   unsigned BlendMask = 0;
   7480   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7481     int M = Mask[i];
   7482     if (M < 0)
   7483       continue;
   7484     if (M == i)
   7485       continue;
   7486     if (M == i + Size) {
   7487       BlendMask |= 1u << i;
   7488       continue;
   7489     }
   7490     if (Zeroable[i]) {
   7491       if (V1IsZero) {
   7492         ForceV1Zero = true;
   7493         Mask[i] = i;
   7494         continue;
   7495       }
   7496       if (V2IsZero) {
   7497         ForceV2Zero = true;
   7498         BlendMask |= 1u << i;
   7499         Mask[i] = i + Size;
   7500         continue;
   7501       }
   7502     }
   7503     return SDValue(); // Shuffled input!
   7504   }
   7505 
   7506   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
   7507   if (ForceV1Zero)
   7508     V1 = getZeroVector(VT, Subtarget, DAG, DL);
   7509   if (ForceV2Zero)
   7510     V2 = getZeroVector(VT, Subtarget, DAG, DL);
   7511 
   7512   auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
   7513     unsigned ScaledMask = 0;
   7514     for (int i = 0; i != Size; ++i)
   7515       if (BlendMask & (1u << i))
   7516         for (int j = 0; j != Scale; ++j)
   7517           ScaledMask |= 1u << (i * Scale + j);
   7518     return ScaledMask;
   7519   };
   7520 
   7521   switch (VT.SimpleTy) {
   7522   case MVT::v2f64:
   7523   case MVT::v4f32:
   7524   case MVT::v4f64:
   7525   case MVT::v8f32:
   7526     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
   7527                        DAG.getConstant(BlendMask, DL, MVT::i8));
   7528 
   7529   case MVT::v4i64:
   7530   case MVT::v8i32:
   7531     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
   7532     // FALLTHROUGH
   7533   case MVT::v2i64:
   7534   case MVT::v4i32:
   7535     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
   7536     // that instruction.
   7537     if (Subtarget.hasAVX2()) {
   7538       // Scale the blend by the number of 32-bit dwords per element.
   7539       int Scale =  VT.getScalarSizeInBits() / 32;
   7540       BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
   7541       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
   7542       V1 = DAG.getBitcast(BlendVT, V1);
   7543       V2 = DAG.getBitcast(BlendVT, V2);
   7544       return DAG.getBitcast(
   7545           VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
   7546                           DAG.getConstant(BlendMask, DL, MVT::i8)));
   7547     }
   7548     // FALLTHROUGH
   7549   case MVT::v8i16: {
   7550     // For integer shuffles we need to expand the mask and cast the inputs to
   7551     // v8i16s prior to blending.
   7552     int Scale = 8 / VT.getVectorNumElements();
   7553     BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
   7554     V1 = DAG.getBitcast(MVT::v8i16, V1);
   7555     V2 = DAG.getBitcast(MVT::v8i16, V2);
   7556     return DAG.getBitcast(VT,
   7557                           DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
   7558                                       DAG.getConstant(BlendMask, DL, MVT::i8)));
   7559   }
   7560 
   7561   case MVT::v16i16: {
   7562     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
   7563     SmallVector<int, 8> RepeatedMask;
   7564     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
   7565       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
   7566       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
   7567       BlendMask = 0;
   7568       for (int i = 0; i < 8; ++i)
   7569         if (RepeatedMask[i] >= 8)
   7570           BlendMask |= 1u << i;
   7571       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
   7572                          DAG.getConstant(BlendMask, DL, MVT::i8));
   7573     }
   7574   }
   7575     // FALLTHROUGH
   7576   case MVT::v16i8:
   7577   case MVT::v32i8: {
   7578     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
   7579            "256-bit byte-blends require AVX2 support!");
   7580 
   7581     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
   7582     if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
   7583       return Masked;
   7584 
   7585     // Scale the blend by the number of bytes per element.
   7586     int Scale = VT.getScalarSizeInBits() / 8;
   7587 
   7588     // This form of blend is always done on bytes. Compute the byte vector
   7589     // type.
   7590     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
   7591 
   7592     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
   7593     // mix of LLVM's code generator and the x86 backend. We tell the code
   7594     // generator that boolean values in the elements of an x86 vector register
   7595     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
   7596     // mapping a select to operand #1, and 'false' mapping to operand #2. The
   7597     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
   7598     // of the element (the remaining are ignored) and 0 in that high bit would
   7599     // mean operand #1 while 1 in the high bit would mean operand #2. So while
   7600     // the LLVM model for boolean values in vector elements gets the relevant
   7601     // bit set, it is set backwards and over constrained relative to x86's
   7602     // actual model.
   7603     SmallVector<SDValue, 32> VSELECTMask;
   7604     for (int i = 0, Size = Mask.size(); i < Size; ++i)
   7605       for (int j = 0; j < Scale; ++j)
   7606         VSELECTMask.push_back(
   7607             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
   7608                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
   7609                                           MVT::i8));
   7610 
   7611     V1 = DAG.getBitcast(BlendVT, V1);
   7612     V2 = DAG.getBitcast(BlendVT, V2);
   7613     return DAG.getBitcast(
   7614         VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
   7615                         DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
   7616   }
   7617 
   7618   default:
   7619     llvm_unreachable("Not a supported integer vector type!");
   7620   }
   7621 }
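         // Worked example (hand-derived sketch, illustrative only): the v4f32
         // mask {0, 5, 2, 7} takes elements 1 and 3 from V2, so
         // BlendMask = 0b1010 and the node becomes (X86ISD::BLENDI V1, V2, 0xA),
         // a single BLENDPS; the integer cases in the switch above re-scale the
         // same bit mask to the granularity of the blend instruction actually
         // used.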
   7622 
   7623 /// \brief Try to lower as a blend of elements from two inputs followed by
   7624 /// a single-input permutation.
   7625 ///
   7626 /// This matches the pattern where we can blend elements from two inputs and
   7627 /// then reduce the shuffle to a single-input permutation.
   7628 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
   7629                                                    SDValue V1, SDValue V2,
   7630                                                    ArrayRef<int> Mask,
   7631                                                    SelectionDAG &DAG) {
   7632   // We build up the blend mask while checking whether a blend is a viable way
   7633   // to reduce the shuffle.
   7634   SmallVector<int, 32> BlendMask(Mask.size(), -1);
   7635   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
   7636 
   7637   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7638     if (Mask[i] < 0)
   7639       continue;
   7640 
   7641     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
   7642 
   7643     if (BlendMask[Mask[i] % Size] < 0)
   7644       BlendMask[Mask[i] % Size] = Mask[i];
   7645     else if (BlendMask[Mask[i] % Size] != Mask[i])
   7646       return SDValue(); // Can't blend in the needed input!
   7647 
   7648     PermuteMask[i] = Mask[i] % Size;
   7649   }
   7650 
   7651   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   7652   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
   7653 }
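         // Worked example (hand-derived sketch, illustrative only): the v4i32
         // mask {2, 0, 7, 5} decomposes into BlendMask = {0, 5, 2, 7} (a per-slot
         // blend of V1 and V2) followed by PermuteMask = {2, 0, 3, 1} applied to
         // the blended result.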
   7654 
    7655 /// \brief Generic routine to decompose a shuffle and blend into independent
   7656 /// blends and permutes.
   7657 ///
   7658 /// This matches the extremely common pattern for handling combined
   7659 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
   7660 /// operations. It will try to pick the best arrangement of shuffles and
   7661 /// blends.
   7662 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
   7663                                                           MVT VT, SDValue V1,
   7664                                                           SDValue V2,
   7665                                                           ArrayRef<int> Mask,
   7666                                                           SelectionDAG &DAG) {
   7667   // Shuffle the input elements into the desired positions in V1 and V2 and
   7668   // blend them together.
   7669   SmallVector<int, 32> V1Mask(Mask.size(), -1);
   7670   SmallVector<int, 32> V2Mask(Mask.size(), -1);
   7671   SmallVector<int, 32> BlendMask(Mask.size(), -1);
   7672   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   7673     if (Mask[i] >= 0 && Mask[i] < Size) {
   7674       V1Mask[i] = Mask[i];
   7675       BlendMask[i] = i;
   7676     } else if (Mask[i] >= Size) {
   7677       V2Mask[i] = Mask[i] - Size;
   7678       BlendMask[i] = i + Size;
   7679     }
   7680 
   7681   // Try to lower with the simpler initial blend strategy unless one of the
   7682   // input shuffles would be a no-op. We prefer to shuffle inputs as the
   7683   // shuffle may be able to fold with a load or other benefit. However, when
   7684   // we'll have to do 2x as many shuffles in order to achieve this, blending
   7685   // first is a better strategy.
   7686   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
   7687     if (SDValue BlendPerm =
   7688             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
   7689       return BlendPerm;
   7690 
   7691   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   7692   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   7693   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   7694 }
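         // Worked example (hand-derived sketch, illustrative only): the v4i32
         // mask {2, 6, 1, 7} splits into V1Mask = {2, -1, 1, -1},
         // V2Mask = {-1, 2, -1, 3} and a final BlendMask = {0, 5, 2, 7}; the
         // blend-then-permute attempt above is only taken when neither per-input
         // shuffle would be a no-op.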
   7695 
   7696 /// \brief Try to lower a vector shuffle as a byte rotation.
   7697 ///
   7698 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
   7699 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
   7700 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
    7701 /// try to generically lower a vector shuffle through such a pattern. It
   7702 /// does not check for the profitability of lowering either as PALIGNR or
   7703 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
   7704 /// This matches shuffle vectors that look like:
   7705 ///
   7706 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
   7707 ///
   7708 /// Essentially it concatenates V1 and V2, shifts right by some number of
   7709 /// elements, and takes the low elements as the result. Note that while this is
   7710 /// specified as a *right shift* because x86 is little-endian, it is a *left
   7711 /// rotate* of the vector lanes.
   7712 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
   7713                                               SDValue V1, SDValue V2,
   7714                                               ArrayRef<int> Mask,
   7715                                               const X86Subtarget &Subtarget,
   7716                                               SelectionDAG &DAG) {
   7717   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
   7718 
   7719   int NumElts = Mask.size();
   7720   int NumLanes = VT.getSizeInBits() / 128;
   7721   int NumLaneElts = NumElts / NumLanes;
   7722 
   7723   // We need to detect various ways of spelling a rotation:
   7724   //   [11, 12, 13, 14, 15,  0,  1,  2]
   7725   //   [-1, 12, 13, 14, -1, -1,  1, -1]
   7726   //   [-1, -1, -1, -1, -1, -1,  1,  2]
   7727   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
   7728   //   [-1,  4,  5,  6, -1, -1,  9, -1]
   7729   //   [-1,  4,  5,  6, -1, -1, -1, -1]
   7730   int Rotation = 0;
   7731   SDValue Lo, Hi;
   7732   for (int l = 0; l < NumElts; l += NumLaneElts) {
   7733     for (int i = 0; i < NumLaneElts; ++i) {
   7734       if (Mask[l + i] < 0)
   7735         continue;
   7736 
   7737       // Get the mod-Size index and lane correct it.
   7738       int LaneIdx = (Mask[l + i] % NumElts) - l;
   7739       // Make sure it was in this lane.
   7740       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
   7741         return SDValue();
   7742 
   7743       // Determine where a rotated vector would have started.
   7744       int StartIdx = i - LaneIdx;
   7745       if (StartIdx == 0)
   7746         // The identity rotation isn't interesting, stop.
   7747         return SDValue();
   7748 
   7749       // If we found the tail of a vector the rotation must be the missing
   7750       // front. If we found the head of a vector, it must be how much of the
   7751       // head.
   7752       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
   7753 
   7754       if (Rotation == 0)
   7755         Rotation = CandidateRotation;
   7756       else if (Rotation != CandidateRotation)
   7757         // The rotations don't match, so we can't match this mask.
   7758         return SDValue();
   7759 
   7760       // Compute which value this mask is pointing at.
   7761       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
   7762 
   7763       // Compute which of the two target values this index should be assigned
   7764       // to. This reflects whether the high elements are remaining or the low
   7765       // elements are remaining.
   7766       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
   7767 
   7768       // Either set up this value if we've not encountered it before, or check
   7769       // that it remains consistent.
   7770       if (!TargetV)
   7771         TargetV = MaskV;
   7772       else if (TargetV != MaskV)
   7773         // This may be a rotation, but it pulls from the inputs in some
   7774         // unsupported interleaving.
   7775         return SDValue();
   7776     }
   7777   }
   7778 
   7779   // Check that we successfully analyzed the mask, and normalize the results.
   7780   assert(Rotation != 0 && "Failed to locate a viable rotation!");
   7781   assert((Lo || Hi) && "Failed to find a rotated input vector!");
   7782   if (!Lo)
   7783     Lo = Hi;
   7784   else if (!Hi)
   7785     Hi = Lo;
   7786 
   7787   // Cast the inputs to i8 vector of correct length to match PALIGNR or
   7788   // PSLLDQ/PSRLDQ.
   7789   MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
   7790   Lo = DAG.getBitcast(ByteVT, Lo);
   7791   Hi = DAG.getBitcast(ByteVT, Hi);
   7792 
   7793   // The actual rotate instruction rotates bytes, so we need to scale the
   7794   // rotation based on how many bytes are in the vector lane.
   7795   int Scale = 16 / NumLaneElts;
   7796 
   7797   // SSSE3 targets can use the palignr instruction.
   7798   if (Subtarget.hasSSSE3()) {
   7799     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
   7800            "512-bit PALIGNR requires BWI instructions");
   7801     return DAG.getBitcast(
   7802         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
   7803                         DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
   7804   }
   7805 
   7806   assert(VT.is128BitVector() &&
   7807          "Rotate-based lowering only supports 128-bit lowering!");
   7808   assert(Mask.size() <= 16 &&
   7809          "Can shuffle at most 16 bytes in a 128-bit vector!");
   7810   assert(ByteVT == MVT::v16i8 &&
   7811          "SSE2 rotate lowering only needed for v16i8!");
   7812 
   7813   // Default SSE2 implementation
   7814   int LoByteShift = 16 - Rotation * Scale;
   7815   int HiByteShift = Rotation * Scale;
   7816 
   7817   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
   7818                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
   7819   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
   7820                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
   7821   return DAG.getBitcast(VT,
   7822                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
   7823 }
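         // Worked example (hand-derived sketch, illustrative only): for the v8i16
         // mask {11, 12, 13, 14, 15, 0, 1, 2} the analysis finds Rotation = 3
         // with Lo = V1 and Hi = V2; with 2-byte elements the PALIGNR immediate
         // is 3 * 2 = 6, and the pre-SSSE3 fallback uses byte shifts of 10
         // (left, on Lo) and 6 (right, on Hi) followed by an OR.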
   7824 
   7825 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
   7826 ///
   7827 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
   7828 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
   7829 /// matches elements from one of the input vectors shuffled to the left or
   7830 /// right with zeroable elements 'shifted in'. It handles both the strictly
   7831 /// bit-wise element shifts and the byte shift across an entire 128-bit double
   7832 /// quad word lane.
   7833 ///
    7834 /// PSLL : (little-endian) left bit shift.
   7835 /// [ zz, 0, zz,  2 ]
   7836 /// [ -1, 4, zz, -1 ]
   7837 /// PSRL : (little-endian) right bit shift.
   7838 /// [  1, zz,  3, zz]
   7839 /// [ -1, -1,  7, zz]
   7840 /// PSLLDQ : (little-endian) left byte shift
   7841 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
   7842 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
   7843 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
   7844 /// PSRLDQ : (little-endian) right byte shift
   7845 /// [  5, 6,  7, zz, zz, zz, zz, zz]
   7846 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
   7847 /// [  1, 2, -1, -1, -1, -1, zz, zz]
   7848 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
   7849                                          SDValue V2, ArrayRef<int> Mask,
   7850                                          const X86Subtarget &Subtarget,
   7851                                          SelectionDAG &DAG) {
   7852   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7853 
   7854   int Size = Mask.size();
   7855   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
   7856 
   7857   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
   7858     for (int i = 0; i < Size; i += Scale)
   7859       for (int j = 0; j < Shift; ++j)
   7860         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
   7861           return false;
   7862 
   7863     return true;
   7864   };
   7865 
   7866   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
   7867     for (int i = 0; i != Size; i += Scale) {
   7868       unsigned Pos = Left ? i + Shift : i;
   7869       unsigned Low = Left ? i : i + Shift;
   7870       unsigned Len = Scale - Shift;
   7871       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
   7872                                       Low + (V == V1 ? 0 : Size)))
   7873         return SDValue();
   7874     }
   7875 
   7876     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
   7877     bool ByteShift = ShiftEltBits > 64;
   7878     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
   7879                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
   7880     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
   7881 
   7882     // Normalize the scale for byte shifts to still produce an i64 element
   7883     // type.
   7884     Scale = ByteShift ? Scale / 2 : Scale;
   7885 
   7886     // We need to round trip through the appropriate type for the shift.
   7887     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
   7888     MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
   7889                             : MVT::getVectorVT(ShiftSVT, Size / Scale);
   7890     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
   7891            "Illegal integer vector type");
   7892     V = DAG.getBitcast(ShiftVT, V);
   7893 
   7894     V = DAG.getNode(OpCode, DL, ShiftVT, V,
   7895                     DAG.getConstant(ShiftAmt, DL, MVT::i8));
   7896     return DAG.getBitcast(VT, V);
   7897   };
   7898 
   7899   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
   7900   // keep doubling the size of the integer elements up to that. We can
   7901   // then shift the elements of the integer vector by whole multiples of
   7902   // their width within the elements of the larger integer vector. Test each
   7903   // multiple to see if we can find a match with the moved element indices
   7904   // and that the shifted in elements are all zeroable.
   7905   unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
   7906   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
   7907     for (int Shift = 1; Shift != Scale; ++Shift)
   7908       for (bool Left : {true, false})
   7909         if (CheckZeros(Shift, Scale, Left))
   7910           for (SDValue V : {V1, V2})
   7911             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
   7912               return Match;
   7913 
   7914   // no match
   7915   return SDValue();
   7916 }
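         // Worked example (hand-derived sketch, illustrative only): a v4i32
         // shuffle with Mask = {-1, 0, -1, 2}, where elements 0 and 2 are
         // zeroable (here undef), matches Scale = 2, Shift = 1, Left = true;
         // V1 is bitcast to v2i64 and shifted with VSHLI by 32 bits (a
         // PSLLQ-style shift), which shifts zeros into the low half of each
         // 64-bit element.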
   7917 
   7918 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
   7919 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
   7920                                            SDValue V2, ArrayRef<int> Mask,
   7921                                            SelectionDAG &DAG) {
   7922   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7923   assert(!Zeroable.all() && "Fully zeroable shuffle mask");
   7924 
   7925   int Size = Mask.size();
   7926   int HalfSize = Size / 2;
   7927   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
   7928 
   7929   // Upper half must be undefined.
   7930   if (!isUndefInRange(Mask, HalfSize, HalfSize))
   7931     return SDValue();
   7932 
   7933   // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
   7934   // Remainder of lower half result is zero and upper half is all undef.
   7935   auto LowerAsEXTRQ = [&]() {
   7936     // Determine the extraction length from the part of the
   7937     // lower half that isn't zeroable.
   7938     int Len = HalfSize;
   7939     for (; Len > 0; --Len)
   7940       if (!Zeroable[Len - 1])
   7941         break;
   7942     assert(Len > 0 && "Zeroable shuffle mask");
   7943 
   7944     // Attempt to match first Len sequential elements from the lower half.
   7945     SDValue Src;
   7946     int Idx = -1;
   7947     for (int i = 0; i != Len; ++i) {
   7948       int M = Mask[i];
   7949       if (M < 0)
   7950         continue;
   7951       SDValue &V = (M < Size ? V1 : V2);
   7952       M = M % Size;
   7953 
   7954       // The extracted elements must start at a valid index and all mask
   7955       // elements must be in the lower half.
   7956       if (i > M || M >= HalfSize)
   7957         return SDValue();
   7958 
   7959       if (Idx < 0 || (Src == V && Idx == (M - i))) {
   7960         Src = V;
   7961         Idx = M - i;
   7962         continue;
   7963       }
   7964       return SDValue();
   7965     }
   7966 
   7967     if (Idx < 0)
   7968       return SDValue();
   7969 
   7970     assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
   7971     int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
   7972     int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
   7973     return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
   7974                        DAG.getConstant(BitLen, DL, MVT::i8),
   7975                        DAG.getConstant(BitIdx, DL, MVT::i8));
   7976   };
   7977 
   7978   if (SDValue ExtrQ = LowerAsEXTRQ())
   7979     return ExtrQ;
   7980 
   7981   // INSERTQ: Extract lowest Len elements from lower half of second source and
   7982   // insert over first source, starting at Idx.
   7983   // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
   7984   auto LowerAsInsertQ = [&]() {
   7985     for (int Idx = 0; Idx != HalfSize; ++Idx) {
   7986       SDValue Base;
   7987 
   7988       // Attempt to match first source from mask before insertion point.
   7989       if (isUndefInRange(Mask, 0, Idx)) {
   7990         /* EMPTY */
   7991       } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
   7992         Base = V1;
   7993       } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
   7994         Base = V2;
   7995       } else {
   7996         continue;
   7997       }
   7998 
   7999       // Extend the extraction length looking to match both the insertion of
   8000       // the second source and the remaining elements of the first.
   8001       for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
   8002         SDValue Insert;
   8003         int Len = Hi - Idx;
   8004 
   8005         // Match insertion.
   8006         if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
   8007           Insert = V1;
   8008         } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
   8009           Insert = V2;
   8010         } else {
   8011           continue;
   8012         }
   8013 
   8014         // Match the remaining elements of the lower half.
   8015         if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
   8016           /* EMPTY */
   8017         } else if ((!Base || (Base == V1)) &&
   8018                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
   8019           Base = V1;
   8020         } else if ((!Base || (Base == V2)) &&
   8021                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
   8022                                               Size + Hi)) {
   8023           Base = V2;
   8024         } else {
   8025           continue;
   8026         }
   8027 
   8028         // We may not have a base (first source) - this can safely be undefined.
   8029         if (!Base)
   8030           Base = DAG.getUNDEF(VT);
   8031 
   8032         int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
   8033         int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
   8034         return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
   8035                            DAG.getConstant(BitLen, DL, MVT::i8),
   8036                            DAG.getConstant(BitIdx, DL, MVT::i8));
   8037       }
   8038     }
   8039 
   8040     return SDValue();
   8041   };
   8042 
   8043   if (SDValue InsertQ = LowerAsInsertQ())
   8044     return InsertQ;
   8045 
   8046   return SDValue();
   8047 }
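         // Worked example (hand-derived sketch, illustrative only): a v16i8
         // shuffle whose low half is {4, 5, 6, 7, zero, zero, zero, zero} with an
         // undef upper half is matched by the EXTRQ path with Len = 4 and
         // Idx = 4, producing (X86ISD::EXTRQI V1, BitLen = 32, BitIdx = 32).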
   8048 
   8049 /// \brief Lower a vector shuffle as a zero or any extension.
   8050 ///
   8051 /// Given a specific number of elements, element bit width, and extension
   8052 /// stride, produce either a zero or any extension based on the available
   8053 /// features of the subtarget. The extended elements are consecutive and
    8054 /// begin at a (possibly offset) element index in the input; to avoid
    8055 /// excess shuffling, the offset must either be in the bottom lane or at
    8056 /// the start of a higher lane. All extended elements must be from the
    8057 /// same lane.
   8058 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   8059     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
   8060     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   8061   assert(Scale > 1 && "Need a scale to extend.");
   8062   int EltBits = VT.getScalarSizeInBits();
   8063   int NumElements = VT.getVectorNumElements();
   8064   int NumEltsPerLane = 128 / EltBits;
   8065   int OffsetLane = Offset / NumEltsPerLane;
   8066   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
   8067          "Only 8, 16, and 32 bit elements can be extended.");
   8068   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
    8069   assert(0 <= Offset && "Extension offset must be non-negative.");
   8070   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
   8071          "Extension offset must be in the first lane or start an upper lane.");
   8072 
    8073   // Check that an index is in the same lane as the base offset.
   8074   auto SafeOffset = [&](int Idx) {
   8075     return OffsetLane == (Idx / NumEltsPerLane);
   8076   };
   8077 
   8078   // Shift along an input so that the offset base moves to the first element.
   8079   auto ShuffleOffset = [&](SDValue V) {
   8080     if (!Offset)
   8081       return V;
   8082 
   8083     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
   8084     for (int i = 0; i * Scale < NumElements; ++i) {
   8085       int SrcIdx = i + Offset;
   8086       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
   8087     }
   8088     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
   8089   };
   8090 
   8091   // Found a valid zext mask! Try various lowering strategies based on the
   8092   // input type and available ISA extensions.
   8093   if (Subtarget.hasSSE41()) {
    8094     // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
   8095     // PUNPCK will catch this in a later shuffle match.
   8096     if (Offset && Scale == 2 && VT.is128BitVector())
   8097       return SDValue();
   8098     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
   8099                                  NumElements / Scale);
   8100     InputV = ShuffleOffset(InputV);
   8101 
   8102     // For 256-bit vectors, we only need the lower (128-bit) input half.
   8103     if (VT.is256BitVector())
   8104       InputV = extract128BitVector(InputV, 0, DAG, DL);
   8105 
   8106     InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
   8107     return DAG.getBitcast(VT, InputV);
   8108   }
   8109 
   8110   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
   8111 
   8112   // For any extends we can cheat for larger element sizes and use shuffle
   8113   // instructions that can fold with a load and/or copy.
   8114   if (AnyExt && EltBits == 32) {
   8115     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
   8116                          -1};
   8117     return DAG.getBitcast(
   8118         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   8119                         DAG.getBitcast(MVT::v4i32, InputV),
   8120                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   8121   }
   8122   if (AnyExt && EltBits == 16 && Scale > 2) {
   8123     int PSHUFDMask[4] = {Offset / 2, -1,
   8124                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
   8125     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   8126                          DAG.getBitcast(MVT::v4i32, InputV),
   8127                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
   8128     int PSHUFWMask[4] = {1, -1, -1, -1};
   8129     unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
   8130     return DAG.getBitcast(
   8131         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
   8132                         DAG.getBitcast(MVT::v8i16, InputV),
   8133                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
   8134   }
   8135 
   8136   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
   8137   // to 64-bits.
   8138   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
   8139     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
   8140     assert(VT.is128BitVector() && "Unexpected vector width!");
   8141 
   8142     int LoIdx = Offset * EltBits;
   8143     SDValue Lo = DAG.getBitcast(
   8144         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
   8145                                 DAG.getConstant(EltBits, DL, MVT::i8),
   8146                                 DAG.getConstant(LoIdx, DL, MVT::i8)));
   8147 
   8148     if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
   8149         !SafeOffset(Offset + 1))
   8150       return DAG.getBitcast(VT, Lo);
   8151 
   8152     int HiIdx = (Offset + 1) * EltBits;
   8153     SDValue Hi = DAG.getBitcast(
   8154         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
   8155                                 DAG.getConstant(EltBits, DL, MVT::i8),
   8156                                 DAG.getConstant(HiIdx, DL, MVT::i8)));
   8157     return DAG.getBitcast(VT,
   8158                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
   8159   }
   8160 
   8161   // If this would require more than 2 unpack instructions to expand, use
   8162   // pshufb when available. We can only use more than 2 unpack instructions
   8163   // when zero extending i8 elements which also makes it easier to use pshufb.
   8164   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
   8165     assert(NumElements == 16 && "Unexpected byte vector width!");
   8166     SDValue PSHUFBMask[16];
   8167     for (int i = 0; i < 16; ++i) {
   8168       int Idx = Offset + (i / Scale);
   8169       PSHUFBMask[i] = DAG.getConstant(
   8170           (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
   8171     }
   8172     InputV = DAG.getBitcast(MVT::v16i8, InputV);
   8173     return DAG.getBitcast(
   8174         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
   8175                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
   8176   }
   8177 
   8178   // If we are extending from an offset, ensure we start on a boundary that
   8179   // we can unpack from.
   8180   int AlignToUnpack = Offset % (NumElements / Scale);
   8181   if (AlignToUnpack) {
   8182     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
   8183     for (int i = AlignToUnpack; i < NumElements; ++i)
   8184       ShMask[i - AlignToUnpack] = i;
   8185     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
   8186     Offset -= AlignToUnpack;
   8187   }
   8188 
   8189   // Otherwise emit a sequence of unpacks.
   8190   do {
   8191     unsigned UnpackLoHi = X86ISD::UNPCKL;
   8192     if (Offset >= (NumElements / 2)) {
   8193       UnpackLoHi = X86ISD::UNPCKH;
   8194       Offset -= (NumElements / 2);
   8195     }
   8196 
   8197     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
   8198     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
   8199                          : getZeroVector(InputVT, Subtarget, DAG, DL);
   8200     InputV = DAG.getBitcast(InputVT, InputV);
   8201     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
   8202     Scale /= 2;
   8203     EltBits *= 2;
   8204     NumElements /= 2;
   8205   } while (Scale > 1);
   8206   return DAG.getBitcast(VT, InputV);
   8207 }
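         // Worked example (hand-derived sketch, illustrative only):
         // zero-extending the low four i16 elements of a v8i16 to i32
         // (Scale = 2, Offset = 0, AnyExt = false) takes the SSE4.1 path above
         // and becomes a single (v4i32 (X86ISD::VZEXT InputV)), i.e. PMOVZXWD;
         // without SSE4.1 the final loop instead emits an UNPCKL of InputV with
         // a zero vector.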
   8208 
   8209 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
   8210 ///
   8211 /// This routine will try to do everything in its power to cleverly lower
   8212 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
    8213 /// check for the profitability of this lowering; it tries to aggressively
   8214 /// match this pattern. It will use all of the micro-architectural details it
   8215 /// can to emit an efficient lowering. It handles both blends with all-zero
    8216 /// inputs (to explicitly zero-extend) and undef lanes (sometimes undef due to
    8217 /// later masking).
   8218 ///
   8219 /// The reason we have dedicated lowering for zext-style shuffles is that they
   8220 /// are both incredibly common and often quite performance sensitive.
   8221 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
   8222     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   8223     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   8224   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   8225 
   8226   int Bits = VT.getSizeInBits();
   8227   int NumLanes = Bits / 128;
   8228   int NumElements = VT.getVectorNumElements();
   8229   int NumEltsPerLane = NumElements / NumLanes;
   8230   assert(VT.getScalarSizeInBits() <= 32 &&
   8231          "Exceeds 32-bit integer zero extension limit");
   8232   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
   8233 
   8234   // Define a helper function to check a particular ext-scale and lower to it if
   8235   // valid.
   8236   auto Lower = [&](int Scale) -> SDValue {
   8237     SDValue InputV;
   8238     bool AnyExt = true;
   8239     int Offset = 0;
   8240     int Matches = 0;
   8241     for (int i = 0; i < NumElements; ++i) {
   8242       int M = Mask[i];
   8243       if (M < 0)
   8244         continue; // Valid anywhere but doesn't tell us anything.
   8245       if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
   8247         if (!Zeroable[i])
   8248           return SDValue();
   8249 
        // We are no longer in the anyext case.
   8251         AnyExt = false;
   8252         continue;
   8253       }
   8254 
      // The base elements need to form consecutive indices into the same
      // input vector.
   8257       SDValue V = M < NumElements ? V1 : V2;
   8258       M = M % NumElements;
   8259       if (!InputV) {
   8260         InputV = V;
   8261         Offset = M - (i / Scale);
   8262       } else if (InputV != V)
   8263         return SDValue(); // Flip-flopping inputs.
   8264 
   8265       // Offset must start in the lowest 128-bit lane or at the start of an
   8266       // upper lane.
   8267       // FIXME: Is it ever worth allowing a negative base offset?
   8268       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
   8269             (Offset % NumEltsPerLane) == 0))
   8270         return SDValue();
   8271 
   8272       // If we are offsetting, all referenced entries must come from the same
   8273       // lane.
   8274       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
   8275         return SDValue();
   8276 
   8277       if ((M % NumElements) != (Offset + (i / Scale)))
   8278         return SDValue(); // Non-consecutive strided elements.
   8279       Matches++;
   8280     }
   8281 
    // If we fail to find an input, we have a zero-shuffle, which should
    // always have already been handled.
    // FIXME: Maybe handle this here in case we end up with one during blending?
   8285     if (!InputV)
   8286       return SDValue();
   8287 
    // If we are offsetting, don't extend if we only match a single input; we
    // can always do better by using a basic PSHUF or PUNPCK.
   8290     if (Offset != 0 && Matches < 2)
   8291       return SDValue();
   8292 
   8293     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   8294         DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
   8295   };
   8296 
   8297   // The widest scale possible for extending is to a 64-bit integer.
   8298   assert(Bits % 64 == 0 &&
   8299          "The number of bits in a vector must be divisible by 64 on x86!");
   8300   int NumExtElements = Bits / 64;
   8301 
   8302   // Each iteration, try extending the elements half as much, but into twice as
   8303   // many elements.
   8304   for (; NumExtElements < NumElements; NumExtElements *= 2) {
   8305     assert(NumElements % NumExtElements == 0 &&
   8306            "The input vector size must be divisible by the extended size.");
   8307     if (SDValue V = Lower(NumElements / NumExtElements))
   8308       return V;
   8309   }
   8310 
   8311   // General extends failed, but 128-bit vectors may be able to use MOVQ.
   8312   if (Bits != 128)
   8313     return SDValue();
   8314 
  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64 bits and zeroing the upper 64 bits.
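  // For example (sketch), a v4i32 mask <0, 1, z, z>, where 'z' denotes a
  // zeroable lane, reduces to a MOVQ of V1: copy the low 64 bits and zero
  // the rest.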
   8317   auto CanZExtLowHalf = [&]() {
   8318     for (int i = NumElements / 2; i != NumElements; ++i)
   8319       if (!Zeroable[i])
   8320         return SDValue();
   8321     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
   8322       return V1;
   8323     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
   8324       return V2;
   8325     return SDValue();
   8326   };
   8327 
   8328   if (SDValue V = CanZExtLowHalf()) {
   8329     V = DAG.getBitcast(MVT::v2i64, V);
   8330     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
   8331     return DAG.getBitcast(VT, V);
   8332   }
   8333 
   8334   // No viable ext lowering found.
   8335   return SDValue();
   8336 }
   8337 
   8338 /// \brief Try to get a scalar value for a specific element of a vector.
   8339 ///
   8340 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
   8341 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
   8342                                               SelectionDAG &DAG) {
   8343   MVT VT = V.getSimpleValueType();
   8344   MVT EltVT = VT.getVectorElementType();
   8345   V = peekThroughBitcasts(V);
   8346 
  // If the bitcasts change the element size, we can't extract an equivalent
  // element from it.
   8349   MVT NewVT = V.getSimpleValueType();
   8350   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
   8351     return SDValue();
   8352 
   8353   if (V.getOpcode() == ISD::BUILD_VECTOR ||
   8354       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
   8355     // Ensure the scalar operand is the same size as the destination.
   8356     // FIXME: Add support for scalar truncation where possible.
   8357     SDValue S = V.getOperand(Idx);
   8358     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
   8359       return DAG.getBitcast(EltVT, S);
   8360   }
   8361 
   8362   return SDValue();
   8363 }
   8364 
   8365 /// \brief Helper to test for a load that can be folded with x86 shuffles.
   8366 ///
   8367 /// This is particularly important because the set of instructions varies
   8368 /// significantly based on whether the operand is a load or not.
   8369 static bool isShuffleFoldableLoad(SDValue V) {
   8370   V = peekThroughBitcasts(V);
   8371   return ISD::isNON_EXTLoad(V.getNode());
   8372 }
   8373 
   8374 /// \brief Try to lower insertion of a single element into a zero vector.
   8375 ///
/// This is a common pattern for which we have especially efficient lowerings
/// across all subtarget feature sets.
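/// As a rough example, a v4i32 shuffle with mask <4, z, z, z> (where 'z' is a
/// zeroable lane) inserts element 0 of V2 into an otherwise zero vector; this
/// typically lowers to a VZEXT_MOVL, i.e. a MOVD/MOVQ-style move that zeroes
/// the upper lanes.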
   8378 static SDValue lowerVectorShuffleAsElementInsertion(
   8379     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   8380     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   8381   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   8382   MVT ExtVT = VT;
   8383   MVT EltVT = VT.getVectorElementType();
   8384 
   8385   int V2Index = std::find_if(Mask.begin(), Mask.end(),
   8386                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
   8387                 Mask.begin();
   8388   bool IsV1Zeroable = true;
   8389   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   8390     if (i != V2Index && !Zeroable[i]) {
   8391       IsV1Zeroable = false;
   8392       break;
   8393     }
   8394 
   8395   // Check for a single input from a SCALAR_TO_VECTOR node.
   8396   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
   8397   // all the smarts here sunk into that routine. However, the current
   8398   // lowering of BUILD_VECTOR makes that nearly impossible until the old
   8399   // vector shuffle lowering is dead.
   8400   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
   8401                                                DAG);
   8402   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
   8403     // We need to zext the scalar if it is smaller than an i32.
   8404     V2S = DAG.getBitcast(EltVT, V2S);
   8405     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
   8406       // Using zext to expand a narrow element won't work for non-zero
   8407       // insertions.
   8408       if (!IsV1Zeroable)
   8409         return SDValue();
   8410 
   8411       // Zero-extend directly to i32.
   8412       ExtVT = MVT::v4i32;
   8413       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
   8414     }
   8415     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
   8416   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
   8417              EltVT == MVT::i16) {
    // We are either not inserting from the low element of the input, or the
    // input element size is too small to use VZEXT_MOVL to clear the high bits.
   8420     return SDValue();
   8421   }
   8422 
   8423   if (!IsV1Zeroable) {
   8424     // If V1 can't be treated as a zero vector we have fewer options to lower
   8425     // this. We can't support integer vectors or non-zero targets cheaply, and
   8426     // the V1 elements can't be permuted in any way.
   8427     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
   8428     if (!VT.isFloatingPoint() || V2Index != 0)
   8429       return SDValue();
   8430     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
   8431     V1Mask[V2Index] = -1;
   8432     if (!isNoopShuffleMask(V1Mask))
   8433       return SDValue();
   8434     // This is essentially a special case blend operation, but if we have
   8435     // general purpose blend operations, they are always faster. Bail and let
   8436     // the rest of the lowering handle these as blends.
   8437     if (Subtarget.hasSSE41())
   8438       return SDValue();
   8439 
   8440     // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two floating point element types to handle!");
   8443     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
   8444                        ExtVT, V1, V2);
   8445   }
   8446 
   8447   // This lowering only works for the low element with floating point vectors.
   8448   if (VT.isFloatingPoint() && V2Index != 0)
   8449     return SDValue();
   8450 
   8451   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   8452   if (ExtVT != VT)
   8453     V2 = DAG.getBitcast(VT, V2);
   8454 
   8455   if (V2Index != 0) {
   8456     // If we have 4 or fewer lanes we can cheaply shuffle the element into
   8457     // the desired position. Otherwise it is more efficient to do a vector
   8458     // shift left. We know that we can do a vector shift left because all
   8459     // the inputs are zero.
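    // For example (sketch), inserting into byte lane 3 of a v16i8 result uses
    // the PSLLDQ-style byte shift (VSHLDQ) below with a shift amount of 3
    // bytes.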
   8460     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
   8461       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
   8462       V2Shuffle[V2Index] = 0;
   8463       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
   8464     } else {
   8465       V2 = DAG.getBitcast(MVT::v16i8, V2);
   8466       V2 = DAG.getNode(
   8467           X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
   8468           DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
   8469                           DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
   8470                               DAG.getDataLayout(), VT)));
   8471       V2 = DAG.getBitcast(VT, V2);
   8472     }
   8473   }
   8474   return V2;
   8475 }
   8476 
/// Try to lower a broadcast of a single (truncated) integer element,
   8478 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
   8479 ///
   8480 /// This assumes we have AVX2.
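/// As a rough example, broadcasting i32 element 1 out of a build_vector of
/// i64 elements uses element 0 of \p V0 with an offset: the scalar is shifted
/// right by 32 bits and truncated to i32 before the VBROADCAST.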
   8481 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
   8482                                                   SDValue V0, int BroadcastIdx,
   8483                                                   const X86Subtarget &Subtarget,
   8484                                                   SelectionDAG &DAG) {
   8485   assert(Subtarget.hasAVX2() &&
   8486          "We can only lower integer broadcasts with AVX2!");
   8487 
   8488   EVT EltVT = VT.getVectorElementType();
   8489   EVT V0VT = V0.getValueType();
   8490 
   8491   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
   8492   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
   8493 
   8494   EVT V0EltVT = V0VT.getVectorElementType();
   8495   if (!V0EltVT.isInteger())
   8496     return SDValue();
   8497 
   8498   const unsigned EltSize = EltVT.getSizeInBits();
   8499   const unsigned V0EltSize = V0EltVT.getSizeInBits();
   8500 
   8501   // This is only a truncation if the original element type is larger.
   8502   if (V0EltSize <= EltSize)
   8503     return SDValue();
   8504 
   8505   assert(((V0EltSize % EltSize) == 0) &&
   8506          "Scalar type sizes must all be powers of 2 on x86!");
   8507 
   8508   const unsigned V0Opc = V0.getOpcode();
   8509   const unsigned Scale = V0EltSize / EltSize;
   8510   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
   8511 
   8512   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
   8513       V0Opc != ISD::BUILD_VECTOR)
   8514     return SDValue();
   8515 
   8516   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
   8517 
   8518   // If we're extracting non-least-significant bits, shift so we can truncate.
   8519   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
   8520   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
   8521   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
   8522   if (const int OffsetIdx = BroadcastIdx % Scale)
   8523     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
   8524             DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
   8525 
   8526   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
   8527                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
   8528 }
   8529 
   8530 /// \brief Try to lower broadcast of a single element.
   8531 ///
   8532 /// For convenience, this code also bundles all of the subtarget feature set
   8533 /// filtering. While a little annoying to re-dispatch on type here, there isn't
   8534 /// a convenient way to factor it out.
   8535 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
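/// As a rough example, a v8f32 shuffle whose mask is all zeros typically
/// becomes a single VBROADCASTSS: from a register with AVX2, or directly from
/// memory when the input is a foldable load.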
   8536 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
   8537                                              SDValue V1, SDValue V2,
   8538                                              ArrayRef<int> Mask,
   8539                                              const X86Subtarget &Subtarget,
   8540                                              SelectionDAG &DAG) {
   8541   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
   8542         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
   8543         (Subtarget.hasAVX2() && VT.isInteger())))
   8544     return SDValue();
   8545 
  // With MOVDDUP (v2f64) we can broadcast from a register or a load; otherwise
  // broadcasting from a register requires AVX2.
   8548   unsigned NumElts = Mask.size();
   8549   unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
   8550   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
   8551 
   8552   // Check that the mask is a broadcast.
   8553   int BroadcastIdx = -1;
   8554   for (int i = 0; i != (int)NumElts; ++i) {
   8555     SmallVector<int, 8> BroadcastMask(NumElts, i);
   8556     if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
   8557       BroadcastIdx = i;
   8558       break;
   8559     }
   8560   }
   8561 
   8562   if (BroadcastIdx < 0)
   8563     return SDValue();
   8564   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
   8565                                             "a sorted mask where the broadcast "
   8566                                             "comes from V1.");
   8567 
   8568   // Go up the chain of (vector) values to find a scalar load that we can
   8569   // combine with the broadcast.
   8570   SDValue V = V1;
   8571   for (;;) {
   8572     switch (V.getOpcode()) {
   8573     case ISD::BITCAST: {
   8574       SDValue VSrc = V.getOperand(0);
   8575       MVT SrcVT = VSrc.getSimpleValueType();
   8576       if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
   8577         break;
   8578       V = VSrc;
   8579       continue;
   8580     }
   8581     case ISD::CONCAT_VECTORS: {
   8582       int OperandSize = Mask.size() / V.getNumOperands();
   8583       V = V.getOperand(BroadcastIdx / OperandSize);
   8584       BroadcastIdx %= OperandSize;
   8585       continue;
   8586     }
   8587     case ISD::INSERT_SUBVECTOR: {
   8588       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
   8589       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
   8590       if (!ConstantIdx)
   8591         break;
   8592 
   8593       int BeginIdx = (int)ConstantIdx->getZExtValue();
   8594       int EndIdx =
   8595           BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
   8596       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
   8597         BroadcastIdx -= BeginIdx;
   8598         V = VInner;
   8599       } else {
   8600         V = VOuter;
   8601       }
   8602       continue;
   8603     }
   8604     }
   8605     break;
   8606   }
   8607 
   8608   // Check if this is a broadcast of a scalar. We special case lowering
   8609   // for scalars so that we can more effectively fold with loads.
   8610   // First, look through bitcast: if the original value has a larger element
   8611   // type than the shuffle, the broadcast element is in essence truncated.
   8612   // Make that explicit to ease folding.
   8613   if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
   8614     if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
   8615             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
   8616       return TruncBroadcast;
   8617 
   8618   MVT BroadcastVT = VT;
   8619 
   8620   // Peek through any bitcast (only useful for loads).
   8621   SDValue BC = peekThroughBitcasts(V);
   8622 
   8623   // Also check the simpler case, where we can directly reuse the scalar.
   8624   if (V.getOpcode() == ISD::BUILD_VECTOR ||
   8625       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
   8626     V = V.getOperand(BroadcastIdx);
   8627 
   8628     // If we can't broadcast from a register, check that the input is a load.
   8629     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
   8630       return SDValue();
   8631   } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
   8632     // 32-bit targets need to load i64 as a f64 and then bitcast the result.
   8633     if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
   8634       BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
   8635       Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
   8636     }
   8637 
   8638     // If we are broadcasting a load that is only used by the shuffle
   8639     // then we can reduce the vector load to the broadcasted scalar load.
   8640     LoadSDNode *Ld = cast<LoadSDNode>(BC);
   8641     SDValue BaseAddr = Ld->getOperand(1);
   8642     EVT SVT = BroadcastVT.getScalarType();
   8643     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
   8644     SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
   8645     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
   8646                     DAG.getMachineFunction().getMachineMemOperand(
   8647                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
   8648   } else if (!BroadcastFromReg) {
   8649     // We can't broadcast from a vector register.
   8650     return SDValue();
   8651   } else if (BroadcastIdx != 0) {
   8652     // We can only broadcast from the zero-element of a vector register,
   8653     // but it can be advantageous to broadcast from the zero-element of a
   8654     // subvector.
   8655     if (!VT.is256BitVector() && !VT.is512BitVector())
   8656       return SDValue();
   8657 
   8658     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
   8659     if (VT == MVT::v4f64 || VT == MVT::v4i64)
   8660       return SDValue();
   8661 
   8662     // Only broadcast the zero-element of a 128-bit subvector.
   8663     unsigned EltSize = VT.getScalarSizeInBits();
   8664     if (((BroadcastIdx * EltSize) % 128) != 0)
   8665       return SDValue();
   8666 
   8667     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
   8668     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
   8669                     DAG.getIntPtrConstant(BroadcastIdx, DL));
   8670   }
   8671 
   8672   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
   8673     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
   8674                     DAG.getBitcast(MVT::f64, V));
   8675 
   8676   // Bitcast back to the same scalar type as BroadcastVT.
   8677   MVT SrcVT = V.getSimpleValueType();
   8678   if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
   8679     assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
   8680            "Unexpected vector element size");
   8681     if (SrcVT.isVector()) {
   8682       unsigned NumSrcElts = SrcVT.getVectorNumElements();
   8683       SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
   8684     } else {
   8685       SrcVT = BroadcastVT.getScalarType();
   8686     }
   8687     V = DAG.getBitcast(SrcVT, V);
   8688   }
   8689 
   8690   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
   8691 }
   8692 
// Check whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can always just use two SHUFPS instructions, which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
   8697 // perform INSERTPS if a single V1 element is out of place and all V2
   8698 // elements are zeroable.
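// The INSERTPS immediate encodes this directly: bits [7:6] select the source
// element of V2, bits [5:4] select the destination lane, and bits [3:0] are a
// zero mask. For example (sketch), inserting V2[2] into lane 1 while zeroing
// lane 3 gives an immediate of (2 << 6) | (1 << 4) | 0b1000 = 0x98.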
   8699 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
   8700                                          unsigned &InsertPSMask,
   8701                                          const SmallBitVector &Zeroable,
   8702                                          ArrayRef<int> Mask,
   8703                                          SelectionDAG &DAG) {
   8704   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
   8705   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
   8706   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   8707   unsigned ZMask = 0;
   8708   int V1DstIndex = -1;
   8709   int V2DstIndex = -1;
   8710   bool V1UsedInPlace = false;
   8711 
   8712   for (int i = 0; i < 4; ++i) {
   8713     // Synthesize a zero mask from the zeroable elements (includes undefs).
   8714     if (Zeroable[i]) {
   8715       ZMask |= 1 << i;
   8716       continue;
   8717     }
   8718 
   8719     // Flag if we use any V1 inputs in place.
   8720     if (i == Mask[i]) {
   8721       V1UsedInPlace = true;
   8722       continue;
   8723     }
   8724 
   8725     // We can only insert a single non-zeroable element.
   8726     if (V1DstIndex >= 0 || V2DstIndex >= 0)
   8727       return false;
   8728 
   8729     if (Mask[i] < 4) {
   8730       // V1 input out of place for insertion.
   8731       V1DstIndex = i;
   8732     } else {
   8733       // V2 input for insertion.
   8734       V2DstIndex = i;
   8735     }
   8736   }
   8737 
   8738   // Don't bother if we have no (non-zeroable) element for insertion.
   8739   if (V1DstIndex < 0 && V2DstIndex < 0)
   8740     return false;
   8741 
   8742   // Determine element insertion src/dst indices. The src index is from the
   8743   // start of the inserted vector, not the start of the concatenated vector.
   8744   unsigned V2SrcIndex = 0;
   8745   if (V1DstIndex >= 0) {
   8746     // If we have a V1 input out of place, we use V1 as the V2 element insertion
   8747     // and don't use the original V2 at all.
   8748     V2SrcIndex = Mask[V1DstIndex];
   8749     V2DstIndex = V1DstIndex;
   8750     V2 = V1;
   8751   } else {
   8752     V2SrcIndex = Mask[V2DstIndex] - 4;
   8753   }
   8754 
  // If no V1 inputs are used in place, then the result is created only from
  // the zero mask and the V2 insertion, so remove the V1 dependency.
   8757   if (!V1UsedInPlace)
   8758     V1 = DAG.getUNDEF(MVT::v4f32);
   8759 
   8760   // Insert the V2 element into the desired position.
   8761   InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
   8762   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
   8763   return true;
   8764 }
   8765 
   8766 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
   8767                                             SDValue V2, ArrayRef<int> Mask,
   8768                                             SelectionDAG &DAG) {
   8769   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   8770   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   8771   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   8772 
   8773   // Attempt to match the insertps pattern.
   8774   unsigned InsertPSMask;
   8775   if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
   8776     return SDValue();
   8777 
   8778   // Insert the V2 element into the desired position.
   8779   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
   8780                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
   8781 }
   8782 
   8783 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
   8784 /// UNPCK instruction.
   8785 ///
/// This specifically targets cases where we end up alternating between the
/// two inputs, and so can permute them into something that feeds a single
   8788 /// UNPCK instruction. Note that this routine only targets integer vectors
   8789 /// because for floating point vectors we have a generalized SHUFPS lowering
   8790 /// strategy that handles everything that doesn't *exactly* match an unpack,
   8791 /// making this clever lowering unnecessary.
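/// As a rough example, a v8i16 mask <0, 10, 1, 11, 2, 12, 3, 13> alternates
/// between the inputs; after shuffling V2's elements 2..5 down into its low
/// half, a single UNPCKLWD of V1 and the shuffled V2 produces the result.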
   8792 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
   8793                                                     SDValue V1, SDValue V2,
   8794                                                     ArrayRef<int> Mask,
   8795                                                     SelectionDAG &DAG) {
   8796   assert(!VT.isFloatingPoint() &&
   8797          "This routine only supports integer vectors.");
   8798   assert(VT.is128BitVector() &&
   8799          "This routine only works on 128-bit vectors.");
   8800   assert(!V2.isUndef() &&
   8801          "This routine should only be used when blending two inputs.");
   8802   assert(Mask.size() >= 2 && "Single element masks are invalid.");
   8803 
   8804   int Size = Mask.size();
   8805 
   8806   int NumLoInputs =
   8807       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
   8808   int NumHiInputs =
   8809       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
   8810 
   8811   bool UnpackLo = NumLoInputs >= NumHiInputs;
   8812 
   8813   auto TryUnpack = [&](MVT UnpackVT, int Scale) {
   8814     SmallVector<int, 16> V1Mask(Mask.size(), -1);
   8815     SmallVector<int, 16> V2Mask(Mask.size(), -1);
   8816 
   8817     for (int i = 0; i < Size; ++i) {
   8818       if (Mask[i] < 0)
   8819         continue;
   8820 
   8821       // Each element of the unpack contains Scale elements from this mask.
   8822       int UnpackIdx = i / Scale;
   8823 
   8824       // We only handle the case where V1 feeds the first slots of the unpack.
   8825       // We rely on canonicalization to ensure this is the case.
   8826       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
   8827         return SDValue();
   8828 
      // Set up the mask for this input. The indexing is tricky as we have to
   8830       // handle the unpack stride.
   8831       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
   8832       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
   8833           Mask[i] % Size;
   8834     }
   8835 
   8836     // If we will have to shuffle both inputs to use the unpack, check whether
   8837     // we can just unpack first and shuffle the result. If so, skip this unpack.
   8838     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
   8839         !isNoopShuffleMask(V2Mask))
   8840       return SDValue();
   8841 
   8842     // Shuffle the inputs into place.
   8843     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   8844     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   8845 
   8846     // Cast the inputs to the type we will use to unpack them.
   8847     V1 = DAG.getBitcast(UnpackVT, V1);
   8848     V2 = DAG.getBitcast(UnpackVT, V2);
   8849 
   8850     // Unpack the inputs and cast the result back to the desired type.
   8851     return DAG.getBitcast(
   8852         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
   8853                         UnpackVT, V1, V2));
   8854   };
   8855 
  // We try each unpack, from the largest to the smallest, to find one that
  // fits this mask.
   8858   int OrigNumElements = VT.getVectorNumElements();
   8859   int OrigScalarSize = VT.getScalarSizeInBits();
   8860   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
   8861     int Scale = ScalarSize / OrigScalarSize;
   8862     int NumElements = OrigNumElements / Scale;
   8863     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
   8864     if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
   8865       return Unpack;
   8866   }
   8867 
   8868   // If none of the unpack-rooted lowerings worked (or were profitable) try an
   8869   // initial unpack.
   8870   if (NumLoInputs == 0 || NumHiInputs == 0) {
   8871     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
   8872            "We have to have *some* inputs!");
   8873     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
   8874 
   8875     // FIXME: We could consider the total complexity of the permute of each
   8876     // possible unpacking. Or at the least we should consider how many
   8877     // half-crossings are created.
   8878     // FIXME: We could consider commuting the unpacks.
   8879 
   8880     SmallVector<int, 32> PermMask((unsigned)Size, -1);
   8881     for (int i = 0; i < Size; ++i) {
   8882       if (Mask[i] < 0)
   8883         continue;
   8884 
   8885       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
   8886 
   8887       PermMask[i] =
   8888           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
   8889     }
   8890     return DAG.getVectorShuffle(
   8891         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
   8892                             DL, VT, V1, V2),
   8893         DAG.getUNDEF(VT), PermMask);
   8894   }
   8895 
   8896   return SDValue();
   8897 }
   8898 
   8899 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
   8900 ///
   8901 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
   8902 /// support for floating point shuffles but not integer shuffles. These
   8903 /// instructions will incur a domain crossing penalty on some chips though so
   8904 /// it is better to avoid lowering through this for integer vectors where
   8905 /// possible.
   8906 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   8907                                        SDValue V1, SDValue V2,
   8908                                        const X86Subtarget &Subtarget,
   8909                                        SelectionDAG &DAG) {
   8910   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   8911   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   8912   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   8913 
   8914   if (V2.isUndef()) {
   8915     // Check for being able to broadcast a single element.
   8916     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   8917             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
   8918       return Broadcast;
   8919 
   8920     // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
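    // SHUFPD's immediate uses one bit per result lane: bit 0 selects element
    // 0 or 1 of the first source for the low lane, and bit 1 does the same
    // from the second source for the high lane. For example (sketch), a
    // <1, 1> mask gives an immediate of 0b11.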
   8922     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
   8923 
   8924     if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPD, which will allow folding a load
   8926       // into the shuffle.
   8927       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
   8928                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   8929     }
   8930 
   8931     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
   8932                        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   8933   }
   8934   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   8935   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
   8936 
   8937   // If we have a single input, insert that into V1 if we can do so cheaply.
   8938   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
   8939     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   8940             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
   8941       return Insertion;
   8942     // Try inverting the insertion since for v2 masks it is easy to do and we
   8943     // can't reliably sort the mask one way or the other.
   8944     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
   8945                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
   8946     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   8947             DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
   8948       return Insertion;
   8949   }
   8950 
   8951   // Try to use one of the special instruction patterns to handle two common
   8952   // blend patterns if a zero-blend above didn't work.
   8953   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
   8954       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
   8955     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
   8956       // We can either use a special instruction to load over the low double or
   8957       // to move just the low double.
   8958       return DAG.getNode(
   8959           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
   8960           DL, MVT::v2f64, V2,
   8961           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
   8962 
   8963   if (Subtarget.hasSSE41())
   8964     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
   8965                                                   Subtarget, DAG))
   8966       return Blend;
   8967 
   8968   // Use dedicated unpack instructions for masks that match their pattern.
   8969   if (SDValue V =
   8970           lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
   8971     return V;
   8972 
   8973   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   8974   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
   8975                      DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   8976 }
   8977 
   8978 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
   8979 ///
   8980 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
   8981 /// the integer unit to minimize domain crossing penalties. However, for blends
   8982 /// it falls back to the floating point shuffle operation with appropriate bit
   8983 /// casting.
   8984 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   8985                                        SDValue V1, SDValue V2,
   8986                                        const X86Subtarget &Subtarget,
   8987                                        SelectionDAG &DAG) {
   8988   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   8989   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   8990   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   8991 
   8992   if (V2.isUndef()) {
   8993     // Check for being able to broadcast a single element.
   8994     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   8995             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   8996       return Broadcast;
   8997 
   8998     // Straight shuffle of a single input vector. For everything from SSE2
   8999     // onward this has a single fast instruction with no scary immediates.
   9000     // We have to map the mask as it is actually a v4i32 shuffle instruction.
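    // For example (sketch), a v2i64 mask of <1, 0> widens to the v4i32 mask
    // <2, 3, 0, 1> that is handed to PSHUFD below.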
   9001     V1 = DAG.getBitcast(MVT::v4i32, V1);
   9002     int WidenedMask[4] = {
   9003         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
   9004         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
   9005     return DAG.getBitcast(
   9006         MVT::v2i64,
   9007         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
   9008                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
   9009   }
   9010   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
   9011   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
   9012   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
   9013   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
   9014 
   9015   // If we have a blend of two same-type PACKUS operations and the blend aligns
   9016   // with the low and high halves, we can just merge the PACKUS operations.
   9017   // This is particularly important as it lets us merge shuffles that this
   9018   // routine itself creates.
   9019   auto GetPackNode = [](SDValue V) {
   9020     V = peekThroughBitcasts(V);
   9021     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
   9022   };
   9023   if (SDValue V1Pack = GetPackNode(V1))
   9024     if (SDValue V2Pack = GetPackNode(V2)) {
   9025       EVT PackVT = V1Pack.getValueType();
   9026       if (PackVT == V2Pack.getValueType())
   9027         return DAG.getBitcast(MVT::v2i64,
   9028                               DAG.getNode(X86ISD::PACKUS, DL, PackVT,
   9029                                           Mask[0] == 0 ? V1Pack.getOperand(0)
   9030                                                        : V1Pack.getOperand(1),
   9031                                           Mask[1] == 2 ? V2Pack.getOperand(0)
   9032                                                        : V2Pack.getOperand(1)));
   9033     }
   9034 
   9035   // Try to use shift instructions.
   9036   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
   9037                                                 Subtarget, DAG))
   9038     return Shift;
   9039 
   9040   // When loading a scalar and then shuffling it into a vector we can often do
   9041   // the insertion cheaply.
   9042   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   9043           DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   9044     return Insertion;
   9045   // Try inverting the insertion since for v2 masks it is easy to do and we
   9046   // can't reliably sort the mask one way or the other.
   9047   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
   9048   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   9049           DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
   9050     return Insertion;
   9051 
   9052   // We have different paths for blend lowering, but they all must use the
   9053   // *exact* same predicate.
   9054   bool IsBlendSupported = Subtarget.hasSSE41();
   9055   if (IsBlendSupported)
   9056     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
   9057                                                   Subtarget, DAG))
   9058       return Blend;
   9059 
   9060   // Use dedicated unpack instructions for masks that match their pattern.
   9061   if (SDValue V =
   9062           lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
   9063     return V;
   9064 
   9065   // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
   9067   if (Subtarget.hasSSSE3())
   9068     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   9069             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   9070       return Rotate;
   9071 
   9072   // If we have direct support for blends, we should lower by decomposing into
   9073   // a permute. That will be faster than the domain cross.
   9074   if (IsBlendSupported)
   9075     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
   9076                                                       Mask, DAG);
   9077 
   9078   // We implement this with SHUFPD which is pretty lame because it will likely
   9079   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   9080   // However, all the alternatives are still more cycles and newer chips don't
   9081   // have this problem. It would be really nice if x86 had better shuffles here.
   9082   V1 = DAG.getBitcast(MVT::v2f64, V1);
   9083   V2 = DAG.getBitcast(MVT::v2f64, V2);
   9084   return DAG.getBitcast(MVT::v2i64,
   9085                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
   9086 }
   9087 
   9088 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
   9089 ///
   9090 /// This is used to disable more specialized lowerings when the shufps lowering
   9091 /// will happen to be efficient.
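/// As a rough example, <0, 1, 4, 5> is a single-SHUFPS mask (the low half
/// reads only V1 and the high half only V2), while <0, 4, 1, 5> is not, since
/// its low half mixes both inputs.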
   9092 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
   9093   // This routine only handles 128-bit shufps.
   9094   assert(Mask.size() == 4 && "Unsupported mask size!");
   9095   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
   9096   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
   9097   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
   9098   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
   9099 
  // To lower with a single SHUFPS we need the low half and the high half to
  // each use only a single input.
   9102   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
   9103     return false;
   9104   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
   9105     return false;
   9106 
   9107   return true;
   9108 }
   9109 
   9110 /// \brief Lower a vector shuffle using the SHUFPS instruction.
   9111 ///
   9112 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
   9113 /// It makes no assumptions about whether this is the *best* lowering, it simply
   9114 /// uses it.
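/// Roughly, SHUFPS selects the two low result lanes from its first operand
/// and the two high result lanes from its second operand, with two immediate
/// bits per lane; the code below arranges LowV, HighV and NewMask to fit that
/// shape.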
   9115 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
   9116                                             ArrayRef<int> Mask, SDValue V1,
   9117                                             SDValue V2, SelectionDAG &DAG) {
   9118   SDValue LowV = V1, HighV = V2;
   9119   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
   9120 
   9121   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
   9122 
   9123   if (NumV2Elements == 1) {
   9124     int V2Index =
   9125         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
   9126         Mask.begin();
   9127 
   9128     // Compute the index adjacent to V2Index and in the same half by toggling
   9129     // the low bit.
   9130     int V2AdjIndex = V2Index ^ 1;
   9131 
   9132     if (Mask[V2AdjIndex] < 0) {
   9133       // Handles all the cases where we have a single V2 element and an undef.
   9134       // This will only ever happen in the high lanes because we commute the
   9135       // vector otherwise.
   9136       if (V2Index < 2)
   9137         std::swap(LowV, HighV);
   9138       NewMask[V2Index] -= 4;
   9139     } else {
   9140       // Handle the case where the V2 element ends up adjacent to a V1 element.
   9141       // To make this work, blend them together as the first step.
   9142       int V1Index = V2AdjIndex;
   9143       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
   9144       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
   9145                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
   9146 
   9147       // Now proceed to reconstruct the final blend as we have the necessary
   9148       // high or low half formed.
   9149       if (V2Index < 2) {
   9150         LowV = V2;
   9151         HighV = V1;
   9152       } else {
   9153         HighV = V2;
   9154       }
   9155       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
   9156       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
   9157     }
   9158   } else if (NumV2Elements == 2) {
   9159     if (Mask[0] < 4 && Mask[1] < 4) {
   9160       // Handle the easy case where we have V1 in the low lanes and V2 in the
   9161       // high lanes.
   9162       NewMask[2] -= 4;
   9163       NewMask[3] -= 4;
   9164     } else if (Mask[2] < 4 && Mask[3] < 4) {
   9165       // We also handle the reversed case because this utility may get called
   9166       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
   9167       // arrange things in the right direction.
   9168       NewMask[0] -= 4;
   9169       NewMask[1] -= 4;
   9170       HighV = V1;
   9171       LowV = V2;
   9172     } else {
   9173       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
   9174       // trying to place elements directly, just blend them and set up the final
   9175       // shuffle to place them.
   9176 
   9177       // The first two blend mask elements are for V1, the second two are for
   9178       // V2.
   9179       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
   9180                           Mask[2] < 4 ? Mask[2] : Mask[3],
   9181                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
   9182                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
   9183       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
   9184                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
   9185 
   9186       // Now we do a normal shuffle of V1 by giving V1 as both operands to
   9187       // a blend.
   9188       LowV = HighV = V1;
   9189       NewMask[0] = Mask[0] < 4 ? 0 : 2;
   9190       NewMask[1] = Mask[0] < 4 ? 2 : 0;
   9191       NewMask[2] = Mask[2] < 4 ? 1 : 3;
   9192       NewMask[3] = Mask[2] < 4 ? 3 : 1;
   9193     }
   9194   }
   9195   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
   9196                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
   9197 }
   9198 
   9199 /// \brief Lower 4-lane 32-bit floating point shuffles.
   9200 ///
   9201 /// Uses instructions exclusively from the floating point unit to minimize
   9202 /// domain crossing penalties, as these are sufficient to implement all v4f32
   9203 /// shuffles.
   9204 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   9205                                        SDValue V1, SDValue V2,
   9206                                        const X86Subtarget &Subtarget,
   9207                                        SelectionDAG &DAG) {
   9208   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   9209   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   9210   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   9211 
   9212   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
   9213 
   9214   if (NumV2Elements == 0) {
   9215     // Check for being able to broadcast a single element.
   9216     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   9217             DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
   9218       return Broadcast;
   9219 
   9220     // Use even/odd duplicate instructions for masks that match their pattern.
   9221     if (Subtarget.hasSSE3()) {
   9222       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
   9223         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
   9224       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
   9225         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
   9226     }
   9227 
   9228     if (Subtarget.hasAVX()) {
   9229       // If we have AVX, we can use VPERMILPS which will allow folding a load
   9230       // into the shuffle.
   9231       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
   9232                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   9233     }
   9234 
   9235     // Otherwise, use a straight shuffle of a single input vector. We pass the
   9236     // input vector to both operands to simulate this with a SHUFPS.
   9237     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
   9238                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   9239   }
  // There are special ways we can lower some single-element blends. However,
  // we have custom lowerings for more complex single-element blends below
  // that we defer to if both this and BLENDPS fail to match, so restrict this
  // to the case where the V2 input targets element 0 of the mask, which is
  // the fast case here.
   9245   // case here.
   9246   if (NumV2Elements == 1 && Mask[0] >= 4)
   9247     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
   9248                                                          Mask, Subtarget, DAG))
   9249       return V;
   9250 
   9251   if (Subtarget.hasSSE41()) {
   9252     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
   9253                                                   Subtarget, DAG))
   9254       return Blend;
   9255 
   9256     // Use INSERTPS if we can complete the shuffle efficiently.
   9257     if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
   9258       return V;
   9259 
   9260     if (!isSingleSHUFPSMask(Mask))
   9261       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
   9262               DL, MVT::v4f32, V1, V2, Mask, DAG))
   9263         return BlendPerm;
   9264   }
   9265 
   9266   // Use low/high mov instructions.
   9267   if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
   9268     return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
   9269   if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
   9270     return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
   9271 
   9272   // Use dedicated unpack instructions for masks that match their pattern.
   9273   if (SDValue V =
   9274           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
   9275     return V;
   9276 
   9277   // Otherwise fall back to a SHUFPS lowering strategy.
   9278   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
   9279 }
   9280 
   9281 /// \brief Lower 4-lane i32 vector shuffles.
   9282 ///
   9283 /// We try to handle these with integer-domain shuffles where we can, but for
   9284 /// blends we use the floating point domain blend instructions.
   9285 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   9286                                        SDValue V1, SDValue V2,
   9287                                        const X86Subtarget &Subtarget,
   9288                                        SelectionDAG &DAG) {
   9289   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   9290   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   9291   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   9292 
   9293   // Whenever we can lower this as a zext, that instruction is strictly faster
   9294   // than any alternative. It also allows us to fold memory operands into the
   9295   // shuffle in many cases.
   9296   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
   9297                                                          Mask, Subtarget, DAG))
   9298     return ZExt;
   9299 
   9300   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
   9301 
   9302   if (NumV2Elements == 0) {
   9303     // Check for being able to broadcast a single element.
   9304     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   9305             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
   9306       return Broadcast;
   9307 
   9308     // Straight shuffle of a single input vector. For everything from SSE2
   9309     // onward this has a single fast instruction with no scary immediates.
   9310     // We coerce the shuffle pattern to be compatible with UNPCK instructions
   9311     // but we aren't actually going to use the UNPCK instruction because doing
   9312     // so prevents folding a load into this instruction or making a copy.
   9313     const int UnpackLoMask[] = {0, 0, 1, 1};
   9314     const int UnpackHiMask[] = {2, 2, 3, 3};
   9315     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
   9316       Mask = UnpackLoMask;
   9317     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
   9318       Mask = UnpackHiMask;
   9319 
   9320     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
   9321                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   9322   }
   9323 
   9324   // Try to use shift instructions.
   9325   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
   9326                                                 Subtarget, DAG))
   9327     return Shift;
   9328 
   9329   // There are special ways we can lower some single-element blends.
   9330   if (NumV2Elements == 1)
   9331     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
   9332                                                          Mask, Subtarget, DAG))
   9333       return V;
   9334 
   9335   // We have different paths for blend lowering, but they all must use the
   9336   // *exact* same predicate.
   9337   bool IsBlendSupported = Subtarget.hasSSE41();
   9338   if (IsBlendSupported)
   9339     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
   9340                                                   Subtarget, DAG))
   9341       return Blend;
   9342 
   9343   if (SDValue Masked =
   9344           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
   9345     return Masked;
   9346 
   9347   // Use dedicated unpack instructions for masks that match their pattern.
   9348   if (SDValue V =
   9349           lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
   9350     return V;
   9351 
   9352   // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
   9354   if (Subtarget.hasSSSE3())
   9355     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   9356             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
   9357       return Rotate;
   9358 
   9359   // If we have direct support for blends, we should lower by decomposing into
   9360   // a permute. That will be faster than the domain cross.
   9361   if (IsBlendSupported)
   9362     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
   9363                                                       Mask, DAG);
   9364 
   9365   // Try to lower by permuting the inputs into an unpack instruction.
   9366   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
   9367                                                             V2, Mask, DAG))
   9368     return Unpack;
   9369 
   9370   // We implement this with SHUFPS because it can blend from two vectors.
   9371   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
   9372   // up the inputs, bypassing domain shift penalties that we would incur if we
   9373   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
   9374   // relevant.
   9375   return DAG.getBitcast(
   9376       MVT::v4i32,
   9377       DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
   9378                            DAG.getBitcast(MVT::v4f32, V2), Mask));
   9379 }
   9380 
   9381 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
   9382 /// shuffle lowering, and the most complex part.
   9383 ///
   9384 /// The lowering strategy is to try to form pairs of input lanes which are
   9385 /// targeted at the same half of the final vector, and then use a dword shuffle
   9386 /// to place them onto the right half, and finally unpack the paired lanes into
   9387 /// their final position.
   9388 ///
   9389 /// The exact breakdown of how to form these dword pairs and align them on the
   9390 /// correct sides is really tricky. See the comments within the function for
   9391 /// more of the details.
   9392 ///
   9393 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
   9394 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
   9395 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
   9396 /// vector, form the analogous 128-bit 8-element Mask.
   9397 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
   9398     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
   9399     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   9400   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
   9401   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   9402 
   9403   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
   9404   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
   9405   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
   9406 
   9407   SmallVector<int, 4> LoInputs;
   9408   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
   9409                [](int M) { return M >= 0; });
   9410   std::sort(LoInputs.begin(), LoInputs.end());
   9411   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
   9412   SmallVector<int, 4> HiInputs;
   9413   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
   9414                [](int M) { return M >= 0; });
   9415   std::sort(HiInputs.begin(), HiInputs.end());
   9416   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
   9417   int NumLToL =
   9418       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
   9419   int NumHToL = LoInputs.size() - NumLToL;
   9420   int NumLToH =
   9421       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
   9422   int NumHToH = HiInputs.size() - NumLToH;
   9423   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
   9424   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
   9425   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   9426   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
   9427 
   9428   // If we are splatting two values from one half - one to each half, then
   9429   // we can shuffle that half so each is splatted to a dword, then splat those
   9430   // to their respective halves.
   9431   auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
   9432                         int DOffset) {
   9433     int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
   9434     int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
   9435     V = DAG.getNode(ShufWOp, DL, VT, V,
   9436                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
   9437     V = DAG.getBitcast(PSHUFDVT, V);
   9438     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
   9439                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
   9440     return DAG.getBitcast(VT, V);
   9441   };
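           // Illustrative example: for the mask [0, 0, 0, 0, 2, 2, 2, 2] there is one
           // low-half input targeted at each result half, so SplatHalfs emits
           // PSHUFLW {0, 0, 2, 2} followed by PSHUFD {0, 0, 1, 1}, splatting word 0
           // across the low half and word 2 across the high half.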
   9442 
   9443   if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
   9444     return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
   9445   if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
   9446     return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
   9447 
   9448   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
   9449   // such inputs we can swap two of the dwords across the half mark and end up
   9450   // with <=2 inputs to each half in each half. Once there, we can fall through
   9451   // to the generic code below. For example:
   9452   //
   9453   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   9454   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
   9455   //
   9456   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
   9457   // and an existing 2-into-2 on the other half. In this case we may have to
   9458   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
   9459   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
   9460   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
   9461   // because any other situation (including a 3-into-1 or 1-into-3 in the other
   9462   // half than the one we target for fixing) will be fixed when we re-enter this
   9463   // path. Any sequence of PSHUFD instructions that results will later be
   9464   // combined into one instruction. Here is an example of the tricky case:
   9465   //
   9466   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   9467   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
   9468   //
   9469   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
   9470   //
   9471   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
   9472   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
   9473   //
   9474   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
   9475   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
   9476   //
   9477   // The result is fine to be handled by the generic logic.
   9478   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
   9479                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
   9480                           int AOffset, int BOffset) {
   9481     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
   9482            "Must call this with A having 3 or 1 inputs from the A half.");
   9483     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
   9484            "Must call this with B having 1 or 3 inputs from the B half.");
   9485     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
   9486            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
   9487 
   9488     bool ThreeAInputs = AToAInputs.size() == 3;
   9489 
   9490     // Compute the index of the dword holding only one word among the three
   9491     // inputs in a half by taking the sum of all indices in the half with
   9492     // three inputs and subtracting the sum of the actual three inputs. The
   9493     // difference is the remaining slot.
   9494     int ADWord, BDWord;
   9495     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
   9496     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
   9497     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
   9498     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
   9499     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
   9500     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
   9501     int TripleNonInputIdx =
   9502         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
   9503     TripleDWord = TripleNonInputIdx / 2;
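             // Illustrative example: with AOffset == 0 and three inputs {0, 1, 3},
             // TripleInputSum is 0+1+2+3 = 6, the remaining slot is 6 - (0+1+3) = 2,
             // and TripleDWord is 2 / 2 = 1.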
   9504 
   9505     // We use xor with one to compute the adjacent DWord to whichever one the
   9506     // OneInput is in.
   9507     OneInputDWord = (OneInput / 2) ^ 1;
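             // Illustrative example: if OneInput is word 5 it lives in dword 2, and
             // 2 ^ 1 == 3 is the adjacent dword within the same half.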
   9508 
   9509     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
   9510     // and BToA inputs. If there is also such a problem with the BToB and AToB
   9511     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
   9512     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
   9513     // is essential that we don't *create* a 3<-1 as then we might oscillate.
   9514     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
   9515       // Compute how many inputs will be flipped by swapping these DWords. We
   9516       // need to balance this to ensure we don't form a 3-1 shuffle in the
   9517       // other half.
   9519       int NumFlippedAToBInputs =
   9520           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
   9521           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
   9522       int NumFlippedBToBInputs =
   9523           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
   9524           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
   9525       if ((NumFlippedAToBInputs == 1 &&
   9526            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
   9527           (NumFlippedBToBInputs == 1 &&
   9528            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
   9529         // We choose whether to fix the A half or B half based on whether that
   9530         // half has zero flipped inputs. At zero, we may not be able to fix it
   9531         // with that half. We also bias towards fixing the B half because that
   9532         // will more commonly be the high half, and we have to bias one way.
   9533         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
   9534                                                        ArrayRef<int> Inputs) {
   9535           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
   9536           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
   9537                                          PinnedIdx ^ 1) != Inputs.end();
   9538           // Determine whether the free index is in the flipped dword or the
   9539           // unflipped dword based on where the pinned index is. We use this bit
   9540           // in an xor to conditionally select the adjacent dword.
   9541           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
   9542           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
   9543                                              FixFreeIdx) != Inputs.end();
   9544           if (IsFixIdxInput == IsFixFreeIdxInput)
   9545             FixFreeIdx += 1;
   9546           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
   9547                                         FixFreeIdx) != Inputs.end();
   9548           assert(IsFixIdxInput != IsFixFreeIdxInput &&
   9549                  "We need to be changing the number of flipped inputs!");
   9550           int PSHUFHalfMask[] = {0, 1, 2, 3};
   9551           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
   9552           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
   9553                           MVT::v8i16, V,
   9554                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
   9555 
   9556           for (int &M : Mask)
   9557             if (M >= 0 && M == FixIdx)
   9558               M = FixFreeIdx;
   9559             else if (M >= 0 && M == FixFreeIdx)
   9560               M = FixIdx;
   9561         };
   9562         if (NumFlippedBToBInputs != 0) {
   9563           int BPinnedIdx =
   9564               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
   9565           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
   9566         } else {
   9567           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
   9568           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
   9569           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
   9570         }
   9571       }
   9572     }
   9573 
   9574     int PSHUFDMask[] = {0, 1, 2, 3};
   9575     PSHUFDMask[ADWord] = BDWord;
   9576     PSHUFDMask[BDWord] = ADWord;
   9577     V = DAG.getBitcast(
   9578         VT,
   9579         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
   9580                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   9581 
   9582     // Adjust the mask to match the new locations of A and B.
   9583     for (int &M : Mask)
   9584       if (M >= 0 && M/2 == ADWord)
   9585         M = 2 * BDWord + M % 2;
   9586       else if (M >= 0 && M/2 == BDWord)
   9587         M = 2 * ADWord + M % 2;
   9588 
   9589     // Recurse back into this routine to re-compute state now that this isn't
   9590     // a 3 and 1 problem.
   9591     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
   9592                                                      DAG);
   9593   };
   9594   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
   9595     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
   9596   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
   9597     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
   9598 
   9599   // At this point there are at most two inputs to the low and high halves from
   9600   // each half. That means the inputs can always be grouped into dwords and
   9601   // those dwords can then be moved to the correct half with a dword shuffle.
   9602   // We use at most one low and one high word shuffle to collect these paired
   9603   // inputs into dwords, and finally a dword shuffle to place them.
   9604   int PSHUFLMask[4] = {-1, -1, -1, -1};
   9605   int PSHUFHMask[4] = {-1, -1, -1, -1};
   9606   int PSHUFDMask[4] = {-1, -1, -1, -1};
   9607 
   9608   // First fix the masks for all the inputs that are staying in their
   9609   // original halves. This will then dictate the targets of the cross-half
   9610   // shuffles.
   9611   auto fixInPlaceInputs =
   9612       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
   9613                     MutableArrayRef<int> SourceHalfMask,
   9614                     MutableArrayRef<int> HalfMask, int HalfOffset) {
   9615     if (InPlaceInputs.empty())
   9616       return;
   9617     if (InPlaceInputs.size() == 1) {
   9618       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   9619           InPlaceInputs[0] - HalfOffset;
   9620       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
   9621       return;
   9622     }
   9623     if (IncomingInputs.empty()) {
   9624       // Just fix all of the in place inputs.
   9625       for (int Input : InPlaceInputs) {
   9626         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
   9627         PSHUFDMask[Input / 2] = Input / 2;
   9628       }
   9629       return;
   9630     }
   9631 
   9632     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
   9633     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   9634         InPlaceInputs[0] - HalfOffset;
   9635     // Put the second input next to the first so that they are packed into
   9636     // a dword. We find the adjacent index by toggling the low bit.
   9637     int AdjIndex = InPlaceInputs[0] ^ 1;
   9638     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
   9639     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
   9640     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
   9641   };
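           // Illustrative example: with InPlaceInputs == {1, 3} and HalfOffset == 0,
           // the word shuffle keeps word 1 in place and pulls word 3 into the
           // adjacent slot 0, packing both into dword 0, and PSHUFDMask[0] is
           // pinned to 0.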
   9642   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
   9643   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
   9644 
   9645   // Now gather the cross-half inputs and place them into a free dword of
   9646   // their target half.
   9647   // FIXME: This operation could almost certainly be simplified dramatically to
   9648   // look more like the 3-1 fixing operation.
   9649   auto moveInputsToRightHalf = [&PSHUFDMask](
   9650       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
   9651       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
   9652       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
   9653       int DestOffset) {
   9654     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
   9655       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
   9656     };
   9657     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
   9658                                                int Word) {
   9659       int LowWord = Word & ~1;
   9660       int HighWord = Word | 1;
   9661       return isWordClobbered(SourceHalfMask, LowWord) ||
   9662              isWordClobbered(SourceHalfMask, HighWord);
   9663     };
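             // Illustrative note: for half-relative word 3 this checks slots 2 and 3
             // of the source half mask; the dword counts as clobbered if either slot
             // already maps a different word.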
   9664 
   9665     if (IncomingInputs.empty())
   9666       return;
   9667 
   9668     if (ExistingInputs.empty()) {
   9669       // Map any dwords with inputs from them into the right half.
   9670       for (int Input : IncomingInputs) {
   9671         // If the source half mask maps over the inputs, turn those into
   9672         // swaps and use the swapped lane.
   9673         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
   9674           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
   9675             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
   9676                 Input - SourceOffset;
   9677             // We have to swap the uses in our half mask in one sweep.
   9678             for (int &M : HalfMask)
   9679               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
   9680                 M = Input;
   9681               else if (M == Input)
   9682                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   9683           } else {
   9684             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
   9685                        Input - SourceOffset &&
   9686                    "Previous placement doesn't match!");
   9687           }
   9688           // Note that this correctly re-maps both when we do a swap and when
   9689           // we observe the other side of the swap above. We rely on that to
   9690           // avoid swapping the members of the input list directly.
   9691           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   9692         }
   9693 
   9694         // Map the input's dword into the correct half.
   9695         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
   9696           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
   9697         else
   9698           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
   9699                      Input / 2 &&
   9700                  "Previous placement doesn't match!");
   9701       }
   9702 
   9703       // And just directly shift any other-half mask elements to be same-half
   9704       // as we will have mirrored the dword containing the element into the
   9705       // same position within that half.
   9706       for (int &M : HalfMask)
   9707         if (M >= SourceOffset && M < SourceOffset + 4) {
   9708           M = M - SourceOffset + DestOffset;
   9709           assert(M >= 0 && "This should never wrap below zero!");
   9710         }
   9711       return;
   9712     }
   9713 
   9714     // Ensure we have the input in a viable dword of its current half. This
   9715     // is particularly tricky because the original position may be clobbered
   9716     // by inputs being moved and *staying* in that half.
   9717     if (IncomingInputs.size() == 1) {
   9718       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   9719         int InputFixed = std::find(std::begin(SourceHalfMask),
   9720                                    std::end(SourceHalfMask), -1) -
   9721                          std::begin(SourceHalfMask) + SourceOffset;
   9722         SourceHalfMask[InputFixed - SourceOffset] =
   9723             IncomingInputs[0] - SourceOffset;
   9724         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
   9725                      InputFixed);
   9726         IncomingInputs[0] = InputFixed;
   9727       }
   9728     } else if (IncomingInputs.size() == 2) {
   9729       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
   9730           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   9731         // We have two non-adjacent or clobbered inputs we need to extract from
   9732         // the source half. To do this, we need to map them into some adjacent
   9733         // dword slot in the source mask.
   9734         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
   9735                               IncomingInputs[1] - SourceOffset};
   9736 
   9737         // If there is a free slot in the source half mask adjacent to one of
   9738         // the inputs, place the other input in it. We use (Index XOR 1) to
   9739         // compute an adjacent index.
   9740         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
   9741             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
   9742           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
   9743           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
   9744           InputsFixed[1] = InputsFixed[0] ^ 1;
   9745         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
   9746                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
   9747           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
   9748           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
   9749           InputsFixed[0] = InputsFixed[1] ^ 1;
   9750         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
   9751                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
   9752           // The two inputs are in the same DWord but it is clobbered and the
   9753           // adjacent DWord isn't used at all. Move both inputs to the free
   9754           // slot.
   9755           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
   9756           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
   9757           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
   9758           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
   9759         } else {
   9760           // The only way we hit this point is if there is no clobbering
   9761           // (because there are no off-half inputs to this half) and there is no
   9762           // free slot adjacent to one of the inputs. In this case, we have to
   9763           // swap an input with a non-input.
   9764           for (int i = 0; i < 4; ++i)
   9765             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
   9766                    "We can't handle any clobbers here!");
   9767           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
   9768                  "Cannot have adjacent inputs here!");
   9769 
   9770           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
   9771           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
   9772 
   9773           // We also have to update the final source mask in this case because
   9774           // it may need to undo the above swap.
   9775           for (int &M : FinalSourceHalfMask)
   9776             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
   9777               M = InputsFixed[1] + SourceOffset;
   9778             else if (M == InputsFixed[1] + SourceOffset)
   9779               M = (InputsFixed[0] ^ 1) + SourceOffset;
   9780 
   9781           InputsFixed[1] = InputsFixed[0] ^ 1;
   9782         }
   9783 
   9784         // Point everything at the fixed inputs.
   9785         for (int &M : HalfMask)
   9786           if (M == IncomingInputs[0])
   9787             M = InputsFixed[0] + SourceOffset;
   9788           else if (M == IncomingInputs[1])
   9789             M = InputsFixed[1] + SourceOffset;
   9790 
   9791         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
   9792         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
   9793       }
   9794     } else {
   9795       llvm_unreachable("Unhandled input size!");
   9796     }
   9797 
   9798     // Now hoist the DWord down to the right half.
   9799     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
   9800     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
   9801     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
   9802     for (int &M : HalfMask)
   9803       for (int Input : IncomingInputs)
   9804         if (M == Input)
   9805           M = FreeDWord * 2 + Input % 2;
   9806   };
   9807   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
   9808                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
   9809   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
   9810                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
   9811 
   9812   // Now enact all the shuffles we've computed to move the inputs into their
   9813   // target half.
   9814   if (!isNoopShuffleMask(PSHUFLMask))
   9815     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   9816                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
   9817   if (!isNoopShuffleMask(PSHUFHMask))
   9818     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   9819                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
   9820   if (!isNoopShuffleMask(PSHUFDMask))
   9821     V = DAG.getBitcast(
   9822         VT,
   9823         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
   9824                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   9825 
   9826   // At this point, each half should contain all its inputs, and we can then
   9827   // just shuffle them into their final position.
   9828   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
   9829          "Failed to lift all the high half inputs to the low mask!");
   9830   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
   9831          "Failed to lift all the low half inputs to the high mask!");
   9832 
   9833   // Do a half shuffle for the low mask.
   9834   if (!isNoopShuffleMask(LoMask))
   9835     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   9836                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
   9837 
   9838   // Do a half shuffle with the high mask after shifting its values down.
   9839   for (int &M : HiMask)
   9840     if (M >= 0)
   9841       M -= 4;
   9842   if (!isNoopShuffleMask(HiMask))
   9843     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   9844                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
   9845 
   9846   return V;
   9847 }
   9848 
   9849 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
   9850 /// blend if only one input is used.
   9851 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
   9852     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   9853     SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
   9854   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   9855   SDValue V1Mask[16];
   9856   SDValue V2Mask[16];
   9857   V1InUse = false;
   9858   V2InUse = false;
   9859 
   9860   int Size = Mask.size();
   9861   int Scale = 16 / Size;
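           // Illustrative example: lowering a v8i16 shuffle gives Size == 8 and
           // Scale == 2; a mask element of 9 (word 1 of V2) makes the two covering
           // bytes of V2Mask select source bytes 2 and 3, while the matching V1Mask
           // bytes are 0x80 (zero the lane).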
   9862   for (int i = 0; i < 16; ++i) {
   9863     if (Mask[i / Scale] < 0) {
   9864       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
   9865     } else {
   9866       const int ZeroMask = 0x80;
   9867       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
   9868                                           : ZeroMask;
   9869       int V2Idx = Mask[i / Scale] < Size
   9870                       ? ZeroMask
   9871                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
   9872       if (Zeroable[i / Scale])
   9873         V1Idx = V2Idx = ZeroMask;
   9874       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
   9875       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
   9876       V1InUse |= (ZeroMask != V1Idx);
   9877       V2InUse |= (ZeroMask != V2Idx);
   9878     }
   9879   }
   9880 
   9881   if (V1InUse)
   9882     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
   9883                      DAG.getBitcast(MVT::v16i8, V1),
   9884                      DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
   9885   if (V2InUse)
   9886     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
   9887                      DAG.getBitcast(MVT::v16i8, V2),
   9888                      DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
   9889 
   9890   // If we need shuffled inputs from both, blend the two.
   9891   SDValue V;
   9892   if (V1InUse && V2InUse)
   9893     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
   9894   else
   9895     V = V1InUse ? V1 : V2;
   9896 
   9897   // Cast the result back to the correct type.
   9898   return DAG.getBitcast(VT, V);
   9899 }
   9900 
   9901 /// \brief Generic lowering of 8-lane i16 shuffles.
   9902 ///
   9903 /// This handles both single-input shuffles and combined shuffle/blends with
   9904 /// two inputs. The single input shuffles are immediately delegated to
   9905 /// a dedicated lowering routine.
   9906 ///
   9907 /// The blends are lowered in one of three fundamental ways. If there are few
   9908 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
   9909 /// of the input is significantly cheaper when lowered as an interleaving of
   9910 /// the two inputs, try to interleave them. Otherwise, blend the low and high
   9911 /// halves of the inputs separately (making them have relatively few inputs)
   9912 /// and then concatenate them.
   9913 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   9914                                        SDValue V1, SDValue V2,
   9915                                        const X86Subtarget &Subtarget,
   9916                                        SelectionDAG &DAG) {
   9917   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   9918   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   9919   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   9920 
   9921   // Whenever we can lower this as a zext, that instruction is strictly faster
   9922   // than any alternative.
   9923   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   9924           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
   9925     return ZExt;
   9926 
   9927   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
   9928 
   9929   if (NumV2Inputs == 0) {
   9930     // Check for being able to broadcast a single element.
   9931     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   9932             DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
   9933       return Broadcast;
   9934 
   9935     // Try to use shift instructions.
   9936     if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
   9937                                                   Subtarget, DAG))
   9938       return Shift;
   9939 
   9940     // Use dedicated unpack instructions for masks that match their pattern.
   9941     if (SDValue V =
   9942             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
   9943       return V;
   9944 
   9945     // Try to use byte rotation instructions.
   9946     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
   9947                                                         Mask, Subtarget, DAG))
   9948       return Rotate;
   9949 
   9950     // Make a copy of the mask so it can be modified.
   9951     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
   9952     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
   9953                                                      MutableMask, Subtarget,
   9954                                                      DAG);
   9955   }
   9956 
   9957   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
   9958          "All single-input shuffles should be canonicalized to be V1-input "
   9959          "shuffles.");
   9960 
   9961   // Try to use shift instructions.
   9962   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
   9963                                                 Subtarget, DAG))
   9964     return Shift;
   9965 
   9966   // See if we can use SSE4A Extraction / Insertion.
   9967   if (Subtarget.hasSSE4A())
   9968     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
   9969       return V;
   9970 
   9971   // There are special ways we can lower some single-element blends.
   9972   if (NumV2Inputs == 1)
   9973     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
   9974                                                          Mask, Subtarget, DAG))
   9975       return V;
   9976 
   9977   // We have different paths for blend lowering, but they all must use the
   9978   // *exact* same predicate.
   9979   bool IsBlendSupported = Subtarget.hasSSE41();
   9980   if (IsBlendSupported)
   9981     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
   9982                                                   Subtarget, DAG))
   9983       return Blend;
   9984 
   9985   if (SDValue Masked =
   9986           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
   9987     return Masked;
   9988 
   9989   // Use dedicated unpack instructions for masks that match their pattern.
   9990   if (SDValue V =
   9991           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
   9992     return V;
   9993 
   9994   // Try to use byte rotation instructions.
   9995   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   9996           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
   9997     return Rotate;
   9998 
   9999   if (SDValue BitBlend =
   10000           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
   10001     return BitBlend;
   10002 
   10003   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
   10004                                                             V2, Mask, DAG))
   10005     return Unpack;
   10006 
   10007   // If we can't directly blend but can use PSHUFB, that will be better as it
   10008   // can both shuffle and set up the inefficient blend.
   10009   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
   10010     bool V1InUse, V2InUse;
   10011     return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
   10012                                               V1InUse, V2InUse);
   10013   }
   10014 
   10015   // We can always bit-blend if we have to, so the fallback strategy is to
   10016   // decompose into single-input permutes and blends.
   10017   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
   10018                                                     Mask, DAG);
   10019 }
   10020 
   10021 /// \brief Check whether a compaction lowering can be done by dropping even
   10022 /// elements and compute how many times even elements must be dropped.
   10023 ///
   10024 /// This handles shuffles which take every Nth element where N is a power of
   10025 /// two. Example shuffle masks:
   10026 ///
   10027 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
   10028 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
   10029 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
   10030 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
   10031 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
   10032 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
   10033 ///
   10034 /// Any of these lanes can of course be undef.
   10035 ///
   10036 /// This routine only supports N <= 3.
   10037 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
   10038 /// for larger N.
   10039 ///
   10040 /// \returns N above, or the number of times even elements must be dropped if
   10041 /// there is such a number. Otherwise returns zero.
   10042 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
   10043                                           bool IsSingleInput) {
   10044   // The modulus for the shuffle vector entries is based on whether this is
   10045   // a single input or not.
   10046   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
   10047   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
   10048          "We should only be called with masks with a power-of-2 size!");
   10049 
   10050   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
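           // Illustrative example: for a single-input v16i8 mask, ShuffleModulus is
           // 16, so the N == 2 check below requires Mask[3] == ((3 << 2) & 15) == 12,
           // matching the example rows in the comment above.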
   10051 
   10052   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
   10053   // and 2^3 simultaneously. This is because we may have ambiguity with
   10054   // partially undef inputs.
   10055   bool ViableForN[3] = {true, true, true};
   10056 
   10057   for (int i = 0, e = Mask.size(); i < e; ++i) {
   10058     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
   10059     // want.
   10060     if (Mask[i] < 0)
   10061       continue;
   10062 
   10063     bool IsAnyViable = false;
   10064     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
   10065       if (ViableForN[j]) {
   10066         uint64_t N = j + 1;
   10067 
   10068         // The shuffle mask must be equal to (i * 2^N) % M.
   10069         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
   10070           IsAnyViable = true;
   10071         else
   10072           ViableForN[j] = false;
   10073       }
   10074     // Early exit if we exhaust the possible powers of two.
   10075     if (!IsAnyViable)
   10076       break;
   10077   }
   10078 
   10079   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
   10080     if (ViableForN[j])
   10081       return j + 1;
   10082 
   10083   // Return 0 as there is no viable power of two.
   10084   return 0;
   10085 }
   10086 
   10087 /// \brief Generic lowering of v16i8 shuffles.
   10088 ///
   10089 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
   10090 /// detect any complexity reducing interleaving. If that doesn't help, it uses
   10091 /// detect any complexity-reducing interleaving. If that doesn't help, it uses
   10092 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
   10093 /// back together.
   10094 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   10095                                        SDValue V1, SDValue V2,
   10096                                        const X86Subtarget &Subtarget,
   10097                                        SelectionDAG &DAG) {
   10098   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   10099   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   10100   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   10101 
   10102   // Try to use shift instructions.
   10103   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
   10104                                                 Subtarget, DAG))
   10105     return Shift;
   10106 
   10107   // Try to use byte rotation instructions.
   10108   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   10109           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   10110     return Rotate;
   10111 
   10112   // Try to use a zext lowering.
   10113   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   10114           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   10115     return ZExt;
   10116 
   10117   // See if we can use SSE4A Extraction / Insertion.
   10118   if (Subtarget.hasSSE4A())
   10119     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
   10120       return V;
   10121 
   10122   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
   10123 
   10124   // For single-input shuffles, there are some nicer lowering tricks we can use.
   10125   if (NumV2Elements == 0) {
   10126     // Check for being able to broadcast a single element.
   10127     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   10128             DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   10129       return Broadcast;
   10130 
   10131     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
   10132     // Notably, this handles splat and partial-splat shuffles more efficiently.
   10133     // However, it only makes sense if the pre-duplication shuffle simplifies
   10134     // things significantly. Currently, this means we need to be able to
   10135     // express the pre-duplication shuffle as an i16 shuffle.
   10136     //
   10137     // FIXME: We should check for other patterns which can be widened into an
   10138     // i16 shuffle as well.
   10139     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
   10140       for (int i = 0; i < 16; i += 2)
   10141         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
   10142           return false;
   10143 
   10144       return true;
   10145     };
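             // Illustrative example: a mask such as <5, 5, 7, 7, 1, 1, ...> passes the
             // check above because each byte pair repeats one value, while <0, 1, ...>
             // fails since the pair (0, 1) names two different bytes.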
   10146     auto tryToWidenViaDuplication = [&]() -> SDValue {
   10147       if (!canWidenViaDuplication(Mask))
   10148         return SDValue();
   10149       SmallVector<int, 4> LoInputs;
   10150       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
   10151                    [](int M) { return M >= 0 && M < 8; });
   10152       std::sort(LoInputs.begin(), LoInputs.end());
   10153       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
   10154                      LoInputs.end());
   10155       SmallVector<int, 4> HiInputs;
   10156       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
   10157                    [](int M) { return M >= 8; });
   10158       std::sort(HiInputs.begin(), HiInputs.end());
   10159       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
   10160                      HiInputs.end());
   10161 
   10162       bool TargetLo = LoInputs.size() >= HiInputs.size();
   10163       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
   10164       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
   10165 
   10166       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
   10167       SmallDenseMap<int, int, 8> LaneMap;
   10168       for (int I : InPlaceInputs) {
   10169         PreDupI16Shuffle[I/2] = I/2;
   10170         LaneMap[I] = I;
   10171       }
   10172       int j = TargetLo ? 0 : 4, je = j + 4;
   10173       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
   10174         // Check if j is already a shuffle of this input. This happens when
   10175         // there are two adjacent bytes after we move the low one.
   10176         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
   10177           // If we haven't yet mapped the input, search for a slot into which
   10178           // we can map it.
   10179           while (j < je && PreDupI16Shuffle[j] >= 0)
   10180             ++j;
   10181 
   10182           if (j == je)
   10183             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
   10184             return SDValue();
   10185 
   10186           // Map this input with the i16 shuffle.
   10187           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
   10188         }
   10189 
   10190         // Update the lane map based on the mapping we ended up with.
   10191         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
   10192       }
   10193       V1 = DAG.getBitcast(
   10194           MVT::v16i8,
   10195           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
   10196                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
   10197 
   10198       // Unpack the bytes to form the i16s that will be shuffled into place.
   10199       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
   10200                        MVT::v16i8, V1, V1);
   10201 
   10202       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   10203       for (int i = 0; i < 16; ++i)
   10204         if (Mask[i] >= 0) {
   10205           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
   10206           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
   10207           if (PostDupI16Shuffle[i / 2] < 0)
   10208             PostDupI16Shuffle[i / 2] = MappedMask;
   10209           else
   10210             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
   10211                    "Conflicting entries in the original shuffle!");
   10212         }
   10213       return DAG.getBitcast(
   10214           MVT::v16i8,
   10215           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
   10216                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
   10217     };
   10218     if (SDValue V = tryToWidenViaDuplication())
   10219       return V;
   10220   }
   10221 
   10222   if (SDValue Masked =
   10223           lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
   10224     return Masked;
   10225 
   10226   // Use dedicated unpack instructions for masks that match their pattern.
   10227   if (SDValue V =
   10228           lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
   10229     return V;
   10230 
   10231   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   10232   // with PSHUFB. It is important to do this before we attempt to generate any
   10233   // blends but after all of the single-input lowerings. If the single input
   10234   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
   10235   // want to preserve that and we can DAG combine any longer sequences into
   10236   // a PSHUFB in the end. But once we start blending from multiple inputs,
   10237   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
   10238   // and there are *very* few patterns that would actually be faster than the
   10239   // PSHUFB approach because of its ability to zero lanes.
   10240   //
   10241   // FIXME: The only exceptions to the above are blends which are exact
   10242   // interleavings with direct instructions supporting them. We currently don't
   10243   // handle those well here.
   10244   if (Subtarget.hasSSSE3()) {
   10245     bool V1InUse = false;
   10246     bool V2InUse = false;
   10247 
   10248     SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
   10249         DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
   10250 
   10251     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
   10252     // do so. This avoids using them to handle blends-with-zero which is
   10253     // important as a single pshufb is significantly faster for that.
   10254     if (V1InUse && V2InUse) {
   10255       if (Subtarget.hasSSE41())
   10256         if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
   10257                                                       Mask, Subtarget, DAG))
   10258           return Blend;
   10259 
   10260       // We can use an unpack to do the blending rather than an or in some
   10261       // cases. Even though the or may be (very minorly) more efficient, we
   10262       // prefer this lowering because there are common cases where part of
   10263       // the complexity of the shuffles goes away when we do the final blend as
   10264       // an unpack.
   10265       // FIXME: It might be worth trying to detect if the unpack-feeding
   10266       // shuffles will both be pshufb, in which case we shouldn't bother with
   10267       // this.
   10268       if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
   10269               DL, MVT::v16i8, V1, V2, Mask, DAG))
   10270         return Unpack;
   10271     }
   10272 
   10273     return PSHUFB;
   10274   }
   10275 
   10276   // There are special ways we can lower some single-element blends.
   10277   if (NumV2Elements == 1)
   10278     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
   10279                                                          Mask, Subtarget, DAG))
   10280       return V;
   10281 
   10282   if (SDValue BitBlend =
   10283           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
   10284     return BitBlend;
   10285 
   10286   // Check whether a compaction lowering can be done. This handles shuffles
   10287   // which take every Nth element for some even N. See the helper function for
   10288   // details.
   10289   //
   10290   // We special case these as they can be particularly efficiently handled with
   10291   // the PACKUSWB instruction on x86 and they show up in common patterns of
   10292   // rearranging bytes to truncate wide elements.
   10293   bool IsSingleInput = V2.isUndef();
   10294   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
   10295     // NumEvenDrops is the power of two stride of the elements. Another way of
   10296     // thinking about it is that we need to drop the even elements this many
   10297     // times to get the original input.
   10298 
   10299     // First we need to zero all the dropped bytes.
   10300     assert(NumEvenDrops <= 3 &&
   10301            "No support for dropping even elements more than 3 times.");
   10302     // We use the mask type to pick which bytes are preserved based on how many
   10303     // elements are dropped.
   10304     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
   10305     SDValue ByteClearMask = DAG.getBitcast(
   10306         MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
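             // Illustrative note: with NumEvenDrops == 1 this is a v8i16 splat of
             // 0x00FF, i.e. the byte pattern <0xFF, 0x00, 0xFF, 0x00, ...> after the
             // bitcast, which clears every odd byte so the PACKUS below keeps only
             // the surviving even bytes.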
   10307     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
   10308     if (!IsSingleInput)
   10309       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
   10310 
   10311     // Now pack things back together.
   10312     V1 = DAG.getBitcast(MVT::v8i16, V1);
   10313     V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
   10314     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
   10315     for (int i = 1; i < NumEvenDrops; ++i) {
   10316       Result = DAG.getBitcast(MVT::v8i16, Result);
   10317       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
   10318     }
   10319 
   10320     return Result;
   10321   }
   10322 
   10323   // Handle multi-input cases by blending single-input shuffles.
   10324   if (NumV2Elements > 0)
   10325     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
   10326                                                       Mask, DAG);
   10327 
   10328   // The fallback path for single-input shuffles widens this into two v8i16
   10329   // vectors with unpacks, shuffles those, and then pulls them back together
   10330   // with a pack.
   10331   SDValue V = V1;
   10332 
   10333   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   10334   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   10335   for (int i = 0; i < 16; ++i)
   10336     if (Mask[i] >= 0)
   10337       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
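           // Illustrative example for the loop above: Mask[10] lands in
           // HiBlendMask[2] since 10 % 8 == 2; the two half masks are shuffled as
           // v8i16 below and then packed back together.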
   10338 
   10339   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
   10340 
   10341   SDValue VLoHalf, VHiHalf;
   10342   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
   10343   // them out and avoid using UNPCK{L,H} to extract the elements of V as
   10344   // i16s.
   10345   if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
   10346                    [](int M) { return M >= 0 && M % 2 == 1; }) &&
   10347       std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
   10348                    [](int M) { return M >= 0 && M % 2 == 1; })) {
   10349     // Use a mask to drop the high bytes.
   10350     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
   10351     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
   10352                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
   10353 
   10354     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
   10355     VHiHalf = DAG.getUNDEF(MVT::v8i16);
   10356 
   10357     // Squash the masks to point directly into VLoHalf.
   10358     for (int &M : LoBlendMask)
   10359       if (M >= 0)
   10360         M /= 2;
   10361     for (int &M : HiBlendMask)
   10362       if (M >= 0)
   10363         M /= 2;
   10364   } else {
   10365     // Otherwise just unpack the low half of V into VLoHalf and the high half into
   10366     // VHiHalf so that we can blend them as i16s.
   10367     VLoHalf = DAG.getBitcast(
   10368         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
   10369     VHiHalf = DAG.getBitcast(
   10370         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
   10371   }
   10372 
   10373   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
   10374   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
   10375 
   10376   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
   10377 }
   10378 
   10379 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
   10380 ///
   10381 /// This routine breaks down the specific type of 128-bit shuffle and
   10382 /// dispatches to the lowering routines accordingly.
   10383 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   10384                                         MVT VT, SDValue V1, SDValue V2,
   10385                                         const X86Subtarget &Subtarget,
   10386                                         SelectionDAG &DAG) {
   10387   switch (VT.SimpleTy) {
   10388   case MVT::v2i64:
   10389     return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   10390   case MVT::v2f64:
   10391     return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   10392   case MVT::v4i32:
   10393     return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   10394   case MVT::v4f32:
   10395     return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   10396   case MVT::v8i16:
   10397     return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   10398   case MVT::v16i8:
   10399     return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   10400 
   10401   default:
   10402     llvm_unreachable("Unimplemented!");
   10403   }
   10404 }
   10405 
   10406 /// \brief Helper function to test whether a shuffle mask could be
   10407 /// simplified by widening the elements being shuffled.
   10408 ///
   10409 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
   10410 /// leaves it in an unspecified state.
   10411 ///
   10412 /// NOTE: This must handle normal vector shuffle masks and *target* vector
   10413 /// shuffle masks. The latter have the special property of a '-2' representing
   10414 /// a zeroed lane of a vector.
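         /// For example (illustrative), the mask <2, 3, 0, 1> widens to <1, 0>, while
         /// <1, 0, 3, 2> cannot be widened because its pairs do not form whole wider
         /// elements.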
   10415 static bool canWidenShuffleElements(ArrayRef<int> Mask,
   10416                                     SmallVectorImpl<int> &WidenedMask) {
   10417   WidenedMask.assign(Mask.size() / 2, 0);
   10418   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    10419     // If both elements are undef, it's trivial.
   10420     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
   10421       WidenedMask[i/2] = SM_SentinelUndef;
   10422       continue;
   10423     }
   10424 
   10425     // Check for an undef mask and a mask value properly aligned to fit with
   10426     // a pair of values. If we find such a case, use the non-undef mask's value.
   10427     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
   10428       WidenedMask[i/2] = Mask[i + 1] / 2;
   10429       continue;
   10430     }
   10431     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
   10432       WidenedMask[i/2] = Mask[i] / 2;
   10433       continue;
   10434     }
   10435 
   10436     // When zeroing, we need to spread the zeroing across both lanes to widen.
   10437     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
   10438       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
   10439           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
   10440         WidenedMask[i/2] = SM_SentinelZero;
   10441         continue;
   10442       }
   10443       return false;
   10444     }
   10445 
   10446     // Finally check if the two mask values are adjacent and aligned with
   10447     // a pair.
   10448     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
   10449       WidenedMask[i/2] = Mask[i] / 2;
   10450       continue;
   10451     }
   10452 
   10453     // Otherwise we can't safely widen the elements used in this shuffle.
   10454     return false;
   10455   }
   10456   assert(WidenedMask.size() == Mask.size() / 2 &&
   10457          "Incorrect size of mask after widening the elements!");
   10458 
   10459   return true;
   10460 }
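          // A small illustrative sketch of the widening rules above, using the mask
          // notation found elsewhere in this file (SM_SentinelUndef == -1,
          // SM_SentinelZero == -2); these masks are examples, not taken from a test:
          //
          //   {0, 1, 6, 7}    -> {0, 3}    // adjacent, even-aligned pairs
          //   {-1, 3, 4, -1}  -> {1, 2}    // an undef adopts its partner's pair
          //   {-2, -2, 2, 3}  -> {-2, 1}   // zeroing must cover the whole pair
          //   {0, 2, 4, 5}    -> fails     // {0, 2} is not an adjacent pair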
   10461 
   10462 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
   10463 ///
   10464 /// This routine just extracts two subvectors, shuffles them independently, and
   10465 /// then concatenates them back together. This should work effectively with all
   10466 /// AVX vector shuffle types.
   10467 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
   10468                                           SDValue V2, ArrayRef<int> Mask,
   10469                                           SelectionDAG &DAG) {
   10470   assert(VT.getSizeInBits() >= 256 &&
   10471          "Only for 256-bit or wider vector shuffles!");
   10472   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
   10473   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
   10474 
   10475   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
   10476   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
   10477 
   10478   int NumElements = VT.getVectorNumElements();
   10479   int SplitNumElements = NumElements / 2;
   10480   MVT ScalarVT = VT.getVectorElementType();
   10481   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
   10482 
   10483   // Rather than splitting build-vectors, just build two narrower build
   10484   // vectors. This helps shuffling with splats and zeros.
   10485   auto SplitVector = [&](SDValue V) {
   10486     V = peekThroughBitcasts(V);
   10487 
   10488     MVT OrigVT = V.getSimpleValueType();
   10489     int OrigNumElements = OrigVT.getVectorNumElements();
   10490     int OrigSplitNumElements = OrigNumElements / 2;
   10491     MVT OrigScalarVT = OrigVT.getVectorElementType();
   10492     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
   10493 
   10494     SDValue LoV, HiV;
   10495 
   10496     auto *BV = dyn_cast<BuildVectorSDNode>(V);
   10497     if (!BV) {
   10498       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
   10499                         DAG.getIntPtrConstant(0, DL));
   10500       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
   10501                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
   10502     } else {
   10503 
   10504       SmallVector<SDValue, 16> LoOps, HiOps;
   10505       for (int i = 0; i < OrigSplitNumElements; ++i) {
   10506         LoOps.push_back(BV->getOperand(i));
   10507         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
   10508       }
   10509       LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
   10510       HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
   10511     }
   10512     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
   10513                           DAG.getBitcast(SplitVT, HiV));
   10514   };
   10515 
   10516   SDValue LoV1, HiV1, LoV2, HiV2;
   10517   std::tie(LoV1, HiV1) = SplitVector(V1);
   10518   std::tie(LoV2, HiV2) = SplitVector(V2);
   10519 
   10520   // Now create two 4-way blends of these half-width vectors.
   10521   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
   10522     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
   10523     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
   10524     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
   10525     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
   10526     for (int i = 0; i < SplitNumElements; ++i) {
   10527       int M = HalfMask[i];
   10528       if (M >= NumElements) {
   10529         if (M >= NumElements + SplitNumElements)
   10530           UseHiV2 = true;
   10531         else
   10532           UseLoV2 = true;
   10533         V2BlendMask[i] = M - NumElements;
   10534         BlendMask[i] = SplitNumElements + i;
   10535       } else if (M >= 0) {
   10536         if (M >= SplitNumElements)
   10537           UseHiV1 = true;
   10538         else
   10539           UseLoV1 = true;
   10540         V1BlendMask[i] = M;
   10541         BlendMask[i] = i;
   10542       }
   10543     }
   10544 
   10545     // Because the lowering happens after all combining takes place, we need to
   10546     // manually combine these blend masks as much as possible so that we create
   10547     // a minimal number of high-level vector shuffle nodes.
   10548 
   10549     // First try just blending the halves of V1 or V2.
   10550     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
   10551       return DAG.getUNDEF(SplitVT);
   10552     if (!UseLoV2 && !UseHiV2)
   10553       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
   10554     if (!UseLoV1 && !UseHiV1)
   10555       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
   10556 
   10557     SDValue V1Blend, V2Blend;
   10558     if (UseLoV1 && UseHiV1) {
   10559       V1Blend =
   10560         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
   10561     } else {
   10562       // We only use half of V1 so map the usage down into the final blend mask.
   10563       V1Blend = UseLoV1 ? LoV1 : HiV1;
   10564       for (int i = 0; i < SplitNumElements; ++i)
   10565         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
   10566           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
   10567     }
   10568     if (UseLoV2 && UseHiV2) {
   10569       V2Blend =
   10570         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
   10571     } else {
   10572       // We only use half of V2 so map the usage down into the final blend mask.
   10573       V2Blend = UseLoV2 ? LoV2 : HiV2;
   10574       for (int i = 0; i < SplitNumElements; ++i)
   10575         if (BlendMask[i] >= SplitNumElements)
   10576           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
   10577     }
   10578     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
   10579   };
   10580   SDValue Lo = HalfBlend(LoMask);
   10581   SDValue Hi = HalfBlend(HiMask);
   10582   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   10583 }
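          // As a rough sketch of the half-blend logic above: a v8i32 two-input shuffle
          // with mask {0, 8, 1, 9, 4, 12, 5, 13} splits into
          //   LoMask = {0, 8, 1, 9}   -> only LoV1/LoV2 are used, so the 4-way blend
          //                              collapses to shuffle(LoV1, LoV2, {0, 4, 1, 5})
          //   HiMask = {4, 12, 5, 13} -> only HiV1/HiV2 are used, giving
          //                              shuffle(HiV1, HiV2, {0, 4, 1, 5})
          // and the result is the CONCAT_VECTORS of the two v4i32 shuffles.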
   10584 
   10585 /// \brief Either split a vector in halves or decompose the shuffles and the
   10586 /// blend.
   10587 ///
   10588 /// This is provided as a good fallback for many lowerings of non-single-input
   10589 /// shuffles with more than one 128-bit lane. In those cases, we want to select
   10590 /// between splitting the shuffle into 128-bit components and stitching those
   10591 /// back together vs. extracting the single-input shuffles and blending those
   10592 /// results.
   10593 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
   10594                                                 SDValue V1, SDValue V2,
   10595                                                 ArrayRef<int> Mask,
   10596                                                 SelectionDAG &DAG) {
   10597   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
   10598          "shuffles as it could then recurse on itself.");
   10599   int Size = Mask.size();
   10600 
   10601   // If this can be modeled as a broadcast of two elements followed by a blend,
   10602   // prefer that lowering. This is especially important because broadcasts can
   10603   // often fold with memory operands.
   10604   auto DoBothBroadcast = [&] {
   10605     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
   10606     for (int M : Mask)
   10607       if (M >= Size) {
   10608         if (V2BroadcastIdx < 0)
   10609           V2BroadcastIdx = M - Size;
   10610         else if (M - Size != V2BroadcastIdx)
   10611           return false;
   10612       } else if (M >= 0) {
   10613         if (V1BroadcastIdx < 0)
   10614           V1BroadcastIdx = M;
   10615         else if (M != V1BroadcastIdx)
   10616           return false;
   10617       }
   10618     return true;
   10619   };
   10620   if (DoBothBroadcast())
   10621     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
   10622                                                       DAG);
   10623 
    10624   // If the used elements all stem from a single 128-bit lane of each input,
    10625   // then we split rather than blend because the split will decompose into
    10626   // unusually few instructions.
   10627   int LaneCount = VT.getSizeInBits() / 128;
   10628   int LaneSize = Size / LaneCount;
   10629   SmallBitVector LaneInputs[2];
   10630   LaneInputs[0].resize(LaneCount, false);
   10631   LaneInputs[1].resize(LaneCount, false);
   10632   for (int i = 0; i < Size; ++i)
   10633     if (Mask[i] >= 0)
   10634       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
   10635   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
   10636     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   10637 
   10638   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
   10639   // that the decomposed single-input shuffles don't end up here.
   10640   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
   10641 }
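          // Two illustrative masks for the decision above (sketched for v8f32, not tied
          // to a specific test): {0, 12, 0, 0, 12, 12, 0, 12} reads only element 0 of V1
          // and element 4 of V2, so DoBothBroadcast() is true and we decompose into two
          // broadcasts plus a blend; {0, 1, 2, 3, 8, 9, 10, 11} uses a single 128-bit
          // lane of each input, so we split into two 128-bit shuffles instead.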
   10642 
   10643 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
   10644 /// a permutation and blend of those lanes.
   10645 ///
   10646 /// This essentially blends the out-of-lane inputs to each lane into the lane
   10647 /// from a permuted copy of the vector. This lowering strategy results in four
    10648 /// instructions in the worst case for a single-input cross-lane shuffle, which
    10649 /// is fewer than any other fully general cross-lane shuffle strategy I'm aware
   10650 /// of. Special cases for each particular shuffle pattern should be handled
   10651 /// prior to trying this lowering.
   10652 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
   10653                                                        SDValue V1, SDValue V2,
   10654                                                        ArrayRef<int> Mask,
   10655                                                        SelectionDAG &DAG) {
   10656   // FIXME: This should probably be generalized for 512-bit vectors as well.
   10657   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
   10658   int Size = Mask.size();
   10659   int LaneSize = Size / 2;
   10660 
   10661   // If there are only inputs from one 128-bit lane, splitting will in fact be
   10662   // less expensive. The flags track whether the given lane contains an element
   10663   // that crosses to another lane.
   10664   bool LaneCrossing[2] = {false, false};
   10665   for (int i = 0; i < Size; ++i)
   10666     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
   10667       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
   10668   if (!LaneCrossing[0] || !LaneCrossing[1])
   10669     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   10670 
   10671   assert(V2.isUndef() &&
    10672          "The last part of this routine only works on single-input shuffles");
   10673 
   10674   SmallVector<int, 32> FlippedBlendMask(Size);
   10675   for (int i = 0; i < Size; ++i)
   10676     FlippedBlendMask[i] =
   10677         Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
   10678                                 ? Mask[i]
   10679                                 : Mask[i] % LaneSize +
   10680                                       (i / LaneSize) * LaneSize + Size);
   10681 
   10682   // Flip the vector, and blend the results which should now be in-lane. The
   10683   // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
   10684   // 5 for the high source. The value 3 selects the high half of source 2 and
   10685   // the value 2 selects the low half of source 2. We only use source 2 to
   10686   // allow folding it into a memory operand.
   10687   unsigned PERMMask = 3 | 2 << 4;
   10688   SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
   10689                                 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
   10690   return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
   10691 }
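          // As a concrete sketch, a single-input v4f64 shuffle with mask {3, 2, 1, 0}
          // crosses lanes in both directions, so it reaches the flip path above:
          //   FlippedBlendMask = {5, 4, 7, 6}
          //   Flipped          = VPERM2X128(undef, V1, 0x23)  // V1 with halves swapped
          //   result           = shuffle(V1, Flipped, {5, 4, 7, 6})
          // i.e. one lane permute plus one in-lane shuffle.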
   10692 
   10693 /// \brief Handle lowering 2-lane 128-bit shuffles.
   10694 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
   10695                                         SDValue V2, ArrayRef<int> Mask,
   10696                                         const X86Subtarget &Subtarget,
   10697                                         SelectionDAG &DAG) {
   10698   // TODO: If minimizing size and one of the inputs is a zero vector and the
    10699   // zero vector has only one use, we could use a VPERM2X128 to save the
   10700   // instruction bytes needed to explicitly generate the zero vector.
   10701 
   10702   // Blends are faster and handle all the non-lane-crossing cases.
   10703   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
   10704                                                 Subtarget, DAG))
   10705     return Blend;
   10706 
   10707   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
   10708   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
   10709 
   10710   // If either input operand is a zero vector, use VPERM2X128 because its mask
   10711   // allows us to replace the zero input with an implicit zero.
   10712   if (!IsV1Zero && !IsV2Zero) {
   10713     // Check for patterns which can be matched with a single insert of a 128-bit
   10714     // subvector.
   10715     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
   10716     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
   10717       // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
   10718       if (Subtarget.hasAVX2() && V2.isUndef())
   10719         return SDValue();
   10720 
   10721       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
   10722                                    VT.getVectorNumElements() / 2);
   10723       SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
   10724                                 DAG.getIntPtrConstant(0, DL));
   10725       SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
   10726                                 OnlyUsesV1 ? V1 : V2,
   10727                                 DAG.getIntPtrConstant(0, DL));
   10728       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
   10729     }
   10730   }
   10731 
   10732   // Otherwise form a 128-bit permutation. After accounting for undefs,
   10733   // convert the 64-bit shuffle mask selection values into 128-bit
   10734   // selection bits by dividing the indexes by 2 and shifting into positions
   10735   // defined by a vperm2*128 instruction's immediate control byte.
   10736 
   10737   // The immediate permute control byte looks like this:
   10738   //    [1:0] - select 128 bits from sources for low half of destination
   10739   //    [2]   - ignore
   10740   //    [3]   - zero low half of destination
   10741   //    [5:4] - select 128 bits from sources for high half of destination
   10742   //    [6]   - ignore
   10743   //    [7]   - zero high half of destination
   10744 
   10745   int MaskLO = Mask[0];
   10746   if (MaskLO == SM_SentinelUndef)
   10747     MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
   10748 
   10749   int MaskHI = Mask[2];
   10750   if (MaskHI == SM_SentinelUndef)
   10751     MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
   10752 
   10753   unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
   10754 
   10755   // If either input is a zero vector, replace it with an undef input.
   10756   // Shuffle mask values <  4 are selecting elements of V1.
   10757   // Shuffle mask values >= 4 are selecting elements of V2.
   10758   // Adjust each half of the permute mask by clearing the half that was
   10759   // selecting the zero vector and setting the zero mask bit.
   10760   if (IsV1Zero) {
   10761     V1 = DAG.getUNDEF(VT);
   10762     if (MaskLO < 4)
   10763       PermMask = (PermMask & 0xf0) | 0x08;
   10764     if (MaskHI < 4)
   10765       PermMask = (PermMask & 0x0f) | 0x80;
   10766   }
   10767   if (IsV2Zero) {
   10768     V2 = DAG.getUNDEF(VT);
   10769     if (MaskLO >= 4)
   10770       PermMask = (PermMask & 0xf0) | 0x08;
   10771     if (MaskHI >= 4)
   10772       PermMask = (PermMask & 0x0f) | 0x80;
   10773   }
   10774 
   10775   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
   10776                      DAG.getConstant(PermMask, DL, MVT::i8));
   10777 }
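          // For instance (a sketch for v4f64 with neither input all-zeros), the
          // widenable mask {2, 3, 4, 5} gives MaskLO = 2 and MaskHI = 4, so
          //   PermMask = 2/2 | (4/2) << 4 = 0x21
          // which selects the high 128 bits of V1 for the low half of the result and
          // the low 128 bits of V2 for the high half.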
   10778 
   10779 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
   10780 /// shuffling each lane.
   10781 ///
    10782 /// This will only succeed when fixing the 128-bit lanes results in a
    10783 /// single-input non-lane-crossing shuffle with a repeating shuffle mask in
    10784 /// each 128-bit lane. This handles many cases where we can quickly blend away
   10785 /// the lane crosses early and then use simpler shuffles within each lane.
   10786 ///
   10787 /// FIXME: It might be worthwhile at some point to support this without
   10788 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
   10789 /// in x86 only floating point has interesting non-repeating shuffles, and even
   10790 /// those are still *marginally* more expensive.
   10791 static SDValue lowerVectorShuffleByMerging128BitLanes(
   10792     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   10793     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   10794   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
   10795 
   10796   int Size = Mask.size();
   10797   int LaneSize = 128 / VT.getScalarSizeInBits();
   10798   int NumLanes = Size / LaneSize;
   10799   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
   10800 
   10801   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
   10802   // check whether the in-128-bit lane shuffles share a repeating pattern.
   10803   SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
   10804   SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
   10805   for (int i = 0; i < Size; ++i) {
   10806     if (Mask[i] < 0)
   10807       continue;
   10808 
   10809     int j = i / LaneSize;
   10810 
   10811     if (Lanes[j] < 0) {
   10812       // First entry we've seen for this lane.
   10813       Lanes[j] = Mask[i] / LaneSize;
   10814     } else if (Lanes[j] != Mask[i] / LaneSize) {
   10815       // This doesn't match the lane selected previously!
   10816       return SDValue();
   10817     }
   10818 
   10819     // Check that within each lane we have a consistent shuffle mask.
   10820     int k = i % LaneSize;
   10821     if (InLaneMask[k] < 0) {
   10822       InLaneMask[k] = Mask[i] % LaneSize;
   10823     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
   10824       // This doesn't fit a repeating in-lane mask.
   10825       return SDValue();
   10826     }
   10827   }
   10828 
   10829   // First shuffle the lanes into place.
   10830   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
   10831                                 VT.getSizeInBits() / 64);
   10832   SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
   10833   for (int i = 0; i < NumLanes; ++i)
   10834     if (Lanes[i] >= 0) {
   10835       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
   10836       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
   10837     }
   10838 
   10839   V1 = DAG.getBitcast(LaneVT, V1);
   10840   V2 = DAG.getBitcast(LaneVT, V2);
   10841   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
   10842 
   10843   // Cast it back to the type we actually want.
   10844   LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
   10845 
   10846   // Now do a simple shuffle that isn't lane crossing.
   10847   SmallVector<int, 8> NewMask((unsigned)Size, -1);
   10848   for (int i = 0; i < Size; ++i)
   10849     if (Mask[i] >= 0)
   10850       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
   10851   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
   10852          "Must not introduce lane crosses at this point!");
   10853 
   10854   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
   10855 }
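          // A sketch of what this produces: for a v8f32 two-input shuffle with mask
          // {0, 3, 1, 2, 8, 11, 9, 10}, every lane repeats the in-lane pattern
          // {0, 3, 1, 2}; lane 0 reads V1's low lane (Lanes[0] == 0) and lane 1 reads
          // V2's low lane (Lanes[1] == 2). We first emit the v4f64 lane-fixing shuffle
          // {0, 1, 4, 5} to merge the two source lanes, then apply the non-crossing
          // mask {0, 3, 1, 2, 4, 7, 5, 6} to the merged vector.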
   10856 
   10857 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
   10858 /// This allows for fast cases such as subvector extraction/insertion
   10859 /// or shuffling smaller vector types which can lower more efficiently.
   10860 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
   10861                                                SDValue V1, SDValue V2,
   10862                                                ArrayRef<int> Mask,
   10863                                                const X86Subtarget &Subtarget,
   10864                                                SelectionDAG &DAG) {
   10865   assert(VT.is256BitVector() && "Expected 256-bit vector");
   10866 
   10867   unsigned NumElts = VT.getVectorNumElements();
   10868   unsigned HalfNumElts = NumElts / 2;
   10869   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
   10870 
   10871   bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
   10872   bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
   10873   if (!UndefLower && !UndefUpper)
   10874     return SDValue();
   10875 
   10876   // Upper half is undef and lower half is whole upper subvector.
   10877   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   10878   if (UndefUpper &&
   10879       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
   10880     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
   10881                              DAG.getIntPtrConstant(HalfNumElts, DL));
   10882     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
   10883                        DAG.getIntPtrConstant(0, DL));
   10884   }
   10885 
   10886   // Lower half is undef and upper half is whole lower subvector.
   10887   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   10888   if (UndefLower &&
   10889       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
   10890     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
   10891                              DAG.getIntPtrConstant(0, DL));
   10892     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
   10893                        DAG.getIntPtrConstant(HalfNumElts, DL));
   10894   }
   10895 
   10896   // If the shuffle only uses two of the four halves of the input operands,
   10897   // then extract them and perform the 'half' shuffle at half width.
   10898   // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
   10899   int HalfIdx1 = -1, HalfIdx2 = -1;
   10900   SmallVector<int, 8> HalfMask(HalfNumElts);
   10901   unsigned Offset = UndefLower ? HalfNumElts : 0;
   10902   for (unsigned i = 0; i != HalfNumElts; ++i) {
   10903     int M = Mask[i + Offset];
   10904     if (M < 0) {
   10905       HalfMask[i] = M;
   10906       continue;
   10907     }
   10908 
   10909     // Determine which of the 4 half vectors this element is from.
   10910     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
   10911     int HalfIdx = M / HalfNumElts;
   10912 
   10913     // Determine the element index into its half vector source.
   10914     int HalfElt = M % HalfNumElts;
   10915 
    10916     // We can shuffle with up to 2 half vectors; set the new 'half'
   10917     // shuffle mask accordingly.
   10918     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
   10919       HalfMask[i] = HalfElt;
   10920       HalfIdx1 = HalfIdx;
   10921       continue;
   10922     }
   10923     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
   10924       HalfMask[i] = HalfElt + HalfNumElts;
   10925       HalfIdx2 = HalfIdx;
   10926       continue;
   10927     }
   10928 
   10929     // Too many half vectors referenced.
   10930     return SDValue();
   10931   }
   10932   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
   10933 
   10934   // Only shuffle the halves of the inputs when useful.
   10935   int NumLowerHalves =
   10936       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
   10937   int NumUpperHalves =
   10938       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
   10939 
   10940   // uuuuXXXX - don't extract uppers just to insert again.
   10941   if (UndefLower && NumUpperHalves != 0)
   10942     return SDValue();
   10943 
   10944   // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
   10945   if (UndefUpper && NumUpperHalves == 2)
   10946     return SDValue();
   10947 
   10948   // AVX2 - XXXXuuuu - always extract lowers.
   10949   if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
   10950     // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
   10951     if (VT == MVT::v4f64 || VT == MVT::v4i64)
   10952       return SDValue();
   10953     // AVX2 supports variable 32-bit element cross-lane shuffles.
   10954     if (VT == MVT::v8f32 || VT == MVT::v8i32) {
   10955       // XXXXuuuu - don't extract lowers and uppers.
   10956       if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
   10957         return SDValue();
   10958     }
   10959   }
   10960 
   10961   auto GetHalfVector = [&](int HalfIdx) {
   10962     if (HalfIdx < 0)
   10963       return DAG.getUNDEF(HalfVT);
   10964     SDValue V = (HalfIdx < 2 ? V1 : V2);
   10965     HalfIdx = (HalfIdx % 2) * HalfNumElts;
   10966     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
   10967                        DAG.getIntPtrConstant(HalfIdx, DL));
   10968   };
   10969 
   10970   SDValue Half1 = GetHalfVector(HalfIdx1);
   10971   SDValue Half2 = GetHalfVector(HalfIdx2);
   10972   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
   10973   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
   10974                      DAG.getIntPtrConstant(Offset, DL));
   10975 }
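          // As an illustration, a v8f32 shuffle with mask {0, 8, 1, 9, u, u, u, u} has
          // an undef upper half and only references the low halves of V1 and V2
          // (HalfIdx1 == 0, HalfIdx2 == 2), so it becomes a v4f32 shuffle of the two
          // extracted low halves with HalfMask = {0, 4, 1, 5}, inserted back into the
          // low half of an undef v8f32.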
   10976 
   10977 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
   10978 /// given mask.
   10979 ///
   10980 /// This returns true if the elements from a particular input are already in the
    10981 /// slots required by the given mask and require no permutation.
   10982 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
   10983   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
   10984   int Size = Mask.size();
   10985   for (int i = 0; i < Size; ++i)
   10986     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
   10987       return false;
   10988 
   10989   return true;
   10990 }
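          // For example, with a v4f64 mask {0, 5, 2, 7} both inputs are in place:
          // V1 supplies elements 0 and 2 in their original slots and V2 supplies its
          // elements 1 and 3. A mask of {1, 5, 2, 7} leaves input 0 out of place,
          // since element 1 of V1 would have to move into slot 0.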
   10991 
   10992 /// Handle case where shuffle sources are coming from the same 128-bit lane and
   10993 /// every lane can be represented as the same repeating mask - allowing us to
   10994 /// shuffle the sources with the repeating shuffle and then permute the result
   10995 /// to the destination lanes.
   10996 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
   10997     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   10998     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   10999   int NumElts = VT.getVectorNumElements();
   11000   int NumLanes = VT.getSizeInBits() / 128;
   11001   int NumLaneElts = NumElts / NumLanes;
   11002 
   11003   // On AVX2 we may be able to just shuffle the lowest elements and then
   11004   // broadcast the result.
   11005   if (Subtarget.hasAVX2()) {
   11006     for (unsigned BroadcastSize : {16, 32, 64}) {
   11007       if (BroadcastSize <= VT.getScalarSizeInBits())
   11008         continue;
   11009       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
   11010 
   11011       // Attempt to match a repeating pattern every NumBroadcastElts,
    11012       // accounting for UNDEFs but only referencing the lowest 128-bit
   11013       // lane of the inputs.
   11014       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
   11015         for (int i = 0; i != NumElts; i += NumBroadcastElts)
   11016           for (int j = 0; j != NumBroadcastElts; ++j) {
   11017             int M = Mask[i + j];
   11018             if (M < 0)
   11019               continue;
   11020             int &R = RepeatMask[j];
   11021             if (0 != ((M % NumElts) / NumLaneElts))
   11022               return false;
   11023             if (0 <= R && R != M)
   11024               return false;
   11025             R = M;
   11026           }
   11027         return true;
   11028       };
   11029 
   11030       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
   11031       if (!FindRepeatingBroadcastMask(RepeatMask))
   11032         continue;
   11033 
   11034       // Shuffle the (lowest) repeated elements in place for broadcast.
   11035       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
   11036 
   11037       // Shuffle the actual broadcast.
   11038       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
   11039       for (int i = 0; i != NumElts; i += NumBroadcastElts)
   11040         for (int j = 0; j != NumBroadcastElts; ++j)
   11041           BroadcastMask[i + j] = j;
   11042       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
   11043                                   BroadcastMask);
   11044     }
   11045   }
   11046 
   11047   // Bail if we already have a repeated lane shuffle mask.
   11048   SmallVector<int, 8> RepeatedShuffleMask;
   11049   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
   11050     return SDValue();
   11051 
   11052   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
   11053   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
   11054   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
   11055   int NumSubLanes = NumLanes * SubLaneScale;
   11056   int NumSubLaneElts = NumLaneElts / SubLaneScale;
   11057 
   11058   // Check that all the sources are coming from the same lane and see if we
   11059   // can form a repeating shuffle mask (local to each lane). At the same time,
   11060   // determine the source sub-lane for each destination sub-lane.
   11061   int TopSrcSubLane = -1;
   11062   SmallVector<int, 8> RepeatedLaneMask((unsigned)NumLaneElts, -1);
   11063   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
   11064   for (int i = 0; i != NumElts; ++i) {
   11065     int M = Mask[i];
   11066     if (M < 0)
   11067       continue;
   11068     assert(0 <= M && M < 2 * NumElts);
   11069 
   11070     // Check that the local mask index is the same for every lane. We always do
    11071     // this with 128-bit lanes to match is128BitLaneRepeatedShuffleMask.
   11072     int LocalM = M < NumElts ? (M % NumLaneElts) : (M % NumLaneElts) + NumElts;
   11073     int &RepeatM = RepeatedLaneMask[i % NumLaneElts];
   11074     if (0 <= RepeatM && RepeatM != LocalM)
   11075       return SDValue();
   11076     RepeatM = LocalM;
   11077 
    11078     // Check that the whole of each destination sub-lane comes from the same
    11079     // source sub-lane; we calculate the source based on where the repeated
    11080     // lane mask will have left it.
   11081     int SrcLane = (M % NumElts) / NumLaneElts;
   11082     int SrcSubLane = (SrcLane * SubLaneScale) +
   11083                      ((i % NumLaneElts) / NumSubLaneElts);
   11084     int &Dst2SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
   11085     if (0 <= Dst2SrcSubLane && SrcSubLane != Dst2SrcSubLane)
   11086       return SDValue();
   11087     Dst2SrcSubLane = SrcSubLane;
   11088 
    11089     // Track the topmost source sub-lane - by setting the remaining to UNDEF
   11090     // we can greatly simplify shuffle matching.
   11091     TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
   11092   }
   11093   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
   11094          "Unexpected source lane");
   11095 
   11096   // Create a repeating shuffle mask for the entire vector.
   11097   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
   11098   for (int i = 0, e = ((TopSrcSubLane + 1) * NumSubLaneElts); i != e; ++i) {
   11099     int M = RepeatedLaneMask[i % NumLaneElts];
   11100     if (M < 0)
   11101       continue;
   11102     int Lane = i / NumLaneElts;
   11103     RepeatedMask[i] = M + (Lane * NumLaneElts);
   11104   }
   11105   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
   11106 
   11107   // Shuffle each source sub-lane to its destination.
   11108   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
   11109   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
   11110     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
   11111     if (SrcSubLane < 0)
   11112       continue;
   11113     for (int j = 0; j != NumSubLaneElts; ++j)
   11114       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
   11115   }
   11116 
   11117   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
   11118                               SubLaneMask);
   11119 }
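          // A sketch of the AVX2 broadcast path above (ignoring whether an earlier
          // lowering strategy would already have matched such a mask): a v8i32
          // single-input shuffle with mask {1, 0, 1, 0, 1, 0, 1, 0} repeats the pair
          // {1, 0} drawn from the lowest 128-bit lane, so we first shuffle {1, 0} into
          // the low elements and then broadcast that 64-bit pair with
          // BroadcastMask = {0, 1, 0, 1, 0, 1, 0, 1}.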
   11120 
   11121 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
   11122                                             ArrayRef<int> Mask, SDValue V1,
   11123                                             SDValue V2, SelectionDAG &DAG) {
   11124 
    11125   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ...
    11126   // Mask for V4F64: 0/1,  4/5,  2/3,  6/7, ...
   11127   assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
   11128   int NumElts = VT.getVectorNumElements();
   11129   bool ShufpdMask = true;
   11130   bool CommutableMask = true;
   11131   unsigned Immediate = 0;
   11132   for (int i = 0; i < NumElts; ++i) {
   11133     if (Mask[i] < 0)
   11134       continue;
   11135     int Val = (i & 6) + NumElts * (i & 1);
   11136     int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
   11137     if (Mask[i] < Val ||  Mask[i] > Val + 1)
   11138       ShufpdMask = false;
   11139     if (Mask[i] < CommutVal ||  Mask[i] > CommutVal + 1)
   11140       CommutableMask = false;
   11141     Immediate |= (Mask[i] % 2) << i;
   11142   }
   11143   if (ShufpdMask)
   11144     return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
   11145                        DAG.getConstant(Immediate, DL, MVT::i8));
   11146   if (CommutableMask)
   11147     return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
   11148                        DAG.getConstant(Immediate, DL, MVT::i8));
   11149   return SDValue();
   11150 }
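          // For example (a sketch for v4f64), the mask {1, 5, 2, 6} alternates V1/V2
          // within each 128-bit lane, so ShufpdMask stays true and
          //   Immediate = (1 % 2) | (5 % 2) << 1 | (2 % 2) << 2 | (6 % 2) << 3 = 0x3
          // giving SHUFPD(V1, V2, 0x3). A mask such as {4, 0, 6, 2} only fits the
          // commuted form and is emitted as SHUFPD(V2, V1, 0x0) instead.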
   11151 
   11152 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
   11153 ///
   11154 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
   11155 /// isn't available.
   11156 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11157                                        SDValue V1, SDValue V2,
   11158                                        const X86Subtarget &Subtarget,
   11159                                        SelectionDAG &DAG) {
   11160   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
   11161   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
   11162   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   11163 
   11164   SmallVector<int, 4> WidenedMask;
   11165   if (canWidenShuffleElements(Mask, WidenedMask))
   11166     if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
   11167                                              Subtarget, DAG))
   11168       return V;
   11169 
   11170   if (V2.isUndef()) {
   11171     // Check for being able to broadcast a single element.
   11172     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   11173             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   11174       return Broadcast;
   11175 
   11176     // Use low duplicate instructions for masks that match their pattern.
   11177     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
   11178       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
   11179 
   11180     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
   11181       // Non-half-crossing single input shuffles can be lowered with an
   11182       // interleaved permutation.
   11183       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
   11184                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
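                // As an example of this encoding (an illustrative sketch), the in-lane
                // mask {0, 1, 3, 2} yields VPERMILPMask = 0x6: bit 1 is set because
                // Mask[1] == 1 and bit 2 because Mask[2] == 3, i.e. keep the low lane
                // and swap the two elements of the high lane.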
   11185       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
   11186                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
   11187     }
   11188 
   11189     // With AVX2 we have direct support for this permutation.
   11190     if (Subtarget.hasAVX2())
   11191       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
   11192                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   11193 
   11194     // Try to create an in-lane repeating shuffle mask and then shuffle the
    11195     // results into the target lanes.
   11196     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   11197             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   11198       return V;
   11199 
   11200     // Otherwise, fall back.
   11201     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
   11202                                                    DAG);
   11203   }
   11204 
   11205   // Use dedicated unpack instructions for masks that match their pattern.
   11206   if (SDValue V =
   11207           lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
   11208     return V;
   11209 
   11210   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
   11211                                                 Subtarget, DAG))
   11212     return Blend;
   11213 
   11214   // Check if the blend happens to exactly fit that of SHUFPD.
   11215   if (SDValue Op =
   11216       lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
   11217     return Op;
   11218 
   11219   // Try to create an in-lane repeating shuffle mask and then shuffle the
    11220   // results into the target lanes.
   11221   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   11222           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    11223     return V;
   11224 
   11225   // Try to simplify this by merging 128-bit lanes to enable a lane-based
    11226   // shuffle. However, if we have AVX2 and either input is already in place,
    11227   // we will be able to shuffle the other input even across lanes in a single
    11228   // instruction, so skip this pattern.
   11229   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
   11230                                 isShuffleMaskInputInPlace(1, Mask))))
   11231     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   11232             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   11233       return Result;
   11234 
    11235   // If we have AVX2 then we always want to lower with a blend because at v4 we
   11236   // can fully permute the elements.
   11237   if (Subtarget.hasAVX2())
   11238     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
   11239                                                       Mask, DAG);
   11240 
   11241   // Otherwise fall back on generic lowering.
   11242   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
   11243 }
   11244 
   11245 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
   11246 ///
   11247 /// This routine is only called when we have AVX2 and thus a reasonable
    11248 /// instruction set for v4i64 shuffling.
   11249 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11250                                        SDValue V1, SDValue V2,
   11251                                        const X86Subtarget &Subtarget,
   11252                                        SelectionDAG &DAG) {
   11253   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
   11254   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
   11255   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   11256   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
   11257 
   11258   SmallVector<int, 4> WidenedMask;
   11259   if (canWidenShuffleElements(Mask, WidenedMask))
   11260     if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
   11261                                              Subtarget, DAG))
   11262       return V;
   11263 
   11264   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
   11265                                                 Subtarget, DAG))
   11266     return Blend;
   11267 
   11268   // Check for being able to broadcast a single element.
   11269   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
   11270                                                         Mask, Subtarget, DAG))
   11271     return Broadcast;
   11272 
   11273   if (V2.isUndef()) {
    11274     // When the shuffle is mirrored between the 128-bit lanes of the input, we
   11275     // can use lower latency instructions that will operate on both lanes.
   11276     SmallVector<int, 2> RepeatedMask;
   11277     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
   11278       SmallVector<int, 4> PSHUFDMask;
   11279       scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
   11280       return DAG.getBitcast(
   11281           MVT::v4i64,
   11282           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
   11283                       DAG.getBitcast(MVT::v8i32, V1),
   11284                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   11285     }
   11286 
   11287     // AVX2 provides a direct instruction for permuting a single input across
   11288     // lanes.
   11289     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
   11290                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   11291   }
   11292 
   11293   // Try to use shift instructions.
   11294   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
   11295                                                 Subtarget, DAG))
   11296     return Shift;
   11297 
   11298   // Use dedicated unpack instructions for masks that match their pattern.
   11299   if (SDValue V =
   11300           lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
   11301     return V;
   11302 
   11303   // Try to simplify this by merging 128-bit lanes to enable a lane-based
    11304   // shuffle. However, if we have AVX2 and either input is already in place,
    11305   // we will be able to shuffle the other input even across lanes in a single
    11306   // instruction, so skip this pattern.
   11307   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
   11308                                  isShuffleMaskInputInPlace(1, Mask))))
   11309     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   11310             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
   11311       return Result;
   11312 
   11313   // Otherwise fall back on generic blend lowering.
   11314   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
   11315                                                     Mask, DAG);
   11316 }
   11317 
   11318 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
   11319 ///
   11320 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
   11321 /// isn't available.
   11322 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11323                                        SDValue V1, SDValue V2,
   11324                                        const X86Subtarget &Subtarget,
   11325                                        SelectionDAG &DAG) {
   11326   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
   11327   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
   11328   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   11329 
   11330   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
   11331                                                 Subtarget, DAG))
   11332     return Blend;
   11333 
   11334   // Check for being able to broadcast a single element.
   11335   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
   11336                                                         Mask, Subtarget, DAG))
   11337     return Broadcast;
   11338 
   11339   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   11340   // options to efficiently lower the shuffle.
   11341   SmallVector<int, 4> RepeatedMask;
   11342   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
   11343     assert(RepeatedMask.size() == 4 &&
   11344            "Repeated masks must be half the mask width!");
   11345 
   11346     // Use even/odd duplicate instructions for masks that match their pattern.
   11347     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
   11348       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
   11349     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
   11350       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
   11351 
   11352     if (V2.isUndef())
   11353       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
   11354                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   11355 
   11356     // Use dedicated unpack instructions for masks that match their pattern.
   11357     if (SDValue V =
   11358             lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
   11359       return V;
   11360 
   11361     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
   11362     // have already handled any direct blends.
   11363     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
   11364   }
   11365 
   11366   // Try to create an in-lane repeating shuffle mask and then shuffle the
    11367   // results into the target lanes.
   11368   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   11369           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
   11370     return V;
   11371 
   11372   // If we have a single input shuffle with different shuffle patterns in the
    11373   // two 128-bit lanes, use the variable-mask form of VPERMILPS.
   11374   if (V2.isUndef()) {
   11375     SDValue VPermMask[8];
   11376     for (int i = 0; i < 8; ++i)
   11377       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
   11378                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
   11379     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
   11380       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
   11381                          DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
   11382 
   11383     if (Subtarget.hasAVX2())
   11384       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
   11385                          DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
   11386 
   11387     // Otherwise, fall back.
   11388     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
   11389                                                    DAG);
   11390   }
   11391 
   11392   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   11393   // shuffle.
   11394   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   11395           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
   11396     return Result;
   11397 
   11398   // If we have AVX2 then we always want to lower with a blend because at v8 we
   11399   // can fully permute the elements.
   11400   if (Subtarget.hasAVX2())
   11401     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
   11402                                                       Mask, DAG);
   11403 
   11404   // Otherwise fall back on generic lowering.
   11405   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
   11406 }
   11407 
   11408 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
   11409 ///
   11410 /// This routine is only called when we have AVX2 and thus a reasonable
    11411 /// instruction set for v8i32 shuffling.
   11412 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11413                                        SDValue V1, SDValue V2,
   11414                                        const X86Subtarget &Subtarget,
   11415                                        SelectionDAG &DAG) {
   11416   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
   11417   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
   11418   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   11419   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
   11420 
   11421   // Whenever we can lower this as a zext, that instruction is strictly faster
   11422   // than any alternative. It also allows us to fold memory operands into the
   11423   // shuffle in many cases.
   11424   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
   11425                                                          Mask, Subtarget, DAG))
   11426     return ZExt;
   11427 
   11428   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
   11429                                                 Subtarget, DAG))
   11430     return Blend;
   11431 
   11432   // Check for being able to broadcast a single element.
   11433   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
   11434                                                         Mask, Subtarget, DAG))
   11435     return Broadcast;
   11436 
   11437   // If the shuffle mask is repeated in each 128-bit lane we can use more
   11438   // efficient instructions that mirror the shuffles across the two 128-bit
   11439   // lanes.
   11440   SmallVector<int, 4> RepeatedMask;
   11441   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
   11442     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
   11443     if (V2.isUndef())
   11444       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
   11445                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   11446 
   11447     // Use dedicated unpack instructions for masks that match their pattern.
   11448     if (SDValue V =
   11449             lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
   11450       return V;
   11451   }
   11452 
   11453   // Try to use shift instructions.
   11454   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
   11455                                                 Subtarget, DAG))
   11456     return Shift;
   11457 
   11458   // Try to use byte rotation instructions.
   11459   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   11460           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   11461     return Rotate;
   11462 
   11463   // Try to create an in-lane repeating shuffle mask and then shuffle the
    11464   // results into the target lanes.
   11465   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   11466           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   11467     return V;
   11468 
   11469   // If the shuffle patterns aren't repeated but it is a single input, directly
   11470   // generate a cross-lane VPERMD instruction.
   11471   if (V2.isUndef()) {
   11472     SDValue VPermMask[8];
   11473     for (int i = 0; i < 8; ++i)
   11474       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
   11475                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
   11476     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
   11477                        DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
   11478   }
   11479 
   11480   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   11481   // shuffle.
   11482   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   11483           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   11484     return Result;
   11485 
   11486   // Otherwise fall back on generic blend lowering.
   11487   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
   11488                                                     Mask, DAG);
   11489 }
   11490 
   11491 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
   11492 ///
   11493 /// This routine is only called when we have AVX2 and thus a reasonable
    11494 /// instruction set for v16i16 shuffling.
   11495 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11496                                         SDValue V1, SDValue V2,
   11497                                         const X86Subtarget &Subtarget,
   11498                                         SelectionDAG &DAG) {
   11499   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
   11500   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
   11501   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   11502   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
   11503 
   11504   // Whenever we can lower this as a zext, that instruction is strictly faster
   11505   // than any alternative. It also allows us to fold memory operands into the
   11506   // shuffle in many cases.
   11507   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
   11508                                                          Mask, Subtarget, DAG))
   11509     return ZExt;
   11510 
   11511   // Check for being able to broadcast a single element.
   11512   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
   11513                                                         Mask, Subtarget, DAG))
   11514     return Broadcast;
   11515 
   11516   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
   11517                                                 Subtarget, DAG))
   11518     return Blend;
   11519 
   11520   // Use dedicated unpack instructions for masks that match their pattern.
   11521   if (SDValue V =
   11522           lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
   11523     return V;
   11524 
   11525   // Try to use shift instructions.
   11526   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
   11527                                                 Subtarget, DAG))
   11528     return Shift;
   11529 
   11530   // Try to use byte rotation instructions.
   11531   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   11532           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   11533     return Rotate;
   11534 
   11535   // Try to create an in-lane repeating shuffle mask and then shuffle the
    11536   // results into the target lanes.
   11537   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   11538           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   11539     return V;
   11540 
   11541   if (V2.isUndef()) {
   11542     // There are no generalized cross-lane shuffle operations available on i16
   11543     // element types.
   11544     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
   11545       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
   11546                                                      Mask, DAG);
   11547 
   11548     SmallVector<int, 8> RepeatedMask;
   11549     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
   11550       // As this is a single-input shuffle, the repeated mask should be
   11551       // a strictly valid v8i16 mask that we can pass through to the v8i16
   11552       // lowering to handle even the v16 case.
   11553       return lowerV8I16GeneralSingleInputVectorShuffle(
   11554           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
   11555     }
   11556   }
   11557 
   11558   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
   11559                                                     V2, Subtarget, DAG))
   11560     return PSHUFB;
   11561 
   11562   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   11563   // shuffle.
   11564   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   11565           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   11566     return Result;
   11567 
   11568   // Otherwise fall back on generic lowering.
   11569   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
   11570 }
   11571 
   11572 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
   11573 ///
   11574 /// This routine is only called when we have AVX2 and thus a reasonable
    11575 /// instruction set for v32i8 shuffling.
   11576 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11577                                        SDValue V1, SDValue V2,
   11578                                        const X86Subtarget &Subtarget,
   11579                                        SelectionDAG &DAG) {
   11580   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
   11581   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
   11582   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   11583   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
   11584 
   11585   // Whenever we can lower this as a zext, that instruction is strictly faster
   11586   // than any alternative. It also allows us to fold memory operands into the
   11587   // shuffle in many cases.
   11588   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
   11589                                                          Mask, Subtarget, DAG))
   11590     return ZExt;
   11591 
   11592   // Check for being able to broadcast a single element.
   11593   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
   11594                                                         Mask, Subtarget, DAG))
   11595     return Broadcast;
   11596 
   11597   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
   11598                                                 Subtarget, DAG))
   11599     return Blend;
   11600 
   11601   // Use dedicated unpack instructions for masks that match their pattern.
   11602   if (SDValue V =
   11603           lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
   11604     return V;
   11605 
   11606   // Try to use shift instructions.
   11607   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
   11608                                                 Subtarget, DAG))
   11609     return Shift;
   11610 
   11611   // Try to use byte rotation instructions.
   11612   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   11613           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   11614     return Rotate;
   11615 
   11616   // Try to create an in-lane repeating shuffle mask and then shuffle the
    11617   // results into the target lanes.
   11618   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   11619           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   11620     return V;
   11621 
   11622   // There are no generalized cross-lane shuffle operations available on i8
   11623   // element types.
   11624   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
   11625     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
   11626                                                    DAG);
   11627 
   11628   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
   11629                                                     V2, Subtarget, DAG))
   11630     return PSHUFB;
   11631 
   11632   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   11633   // shuffle.
   11634   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   11635           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   11636     return Result;
   11637 
   11638   // Otherwise fall back on generic lowering.
   11639   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
   11640 }
   11641 
   11642 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
   11643 ///
   11644 /// This routine either breaks down the specific type of a 256-bit x86 vector
   11645 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
   11646 /// together based on the available instructions.
   11647 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11648                                         MVT VT, SDValue V1, SDValue V2,
   11649                                         const X86Subtarget &Subtarget,
   11650                                         SelectionDAG &DAG) {
   11651   // If we have a single input to the zero element, insert that into V1 if we
   11652   // can do so cheaply.
   11653   int NumElts = VT.getVectorNumElements();
   11654   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
   11655 
   11656   if (NumV2Elements == 1 && Mask[0] >= NumElts)
   11657     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   11658                               DL, VT, V1, V2, Mask, Subtarget, DAG))
   11659       return Insertion;
   11660 
   11661   // Handle special cases where the lower or upper half is UNDEF.
   11662   if (SDValue V =
   11663           lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
   11664     return V;
   11665 
   11666   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
   11667   // can check for those subtargets here and avoid much of the subtarget
   11668   // querying in the per-vector-type lowering routines. With AVX1 we have
   11669   // essentially *zero* ability to manipulate a 256-bit vector with integer
   11670   // types. Since we'll use floating point types there eventually, just
   11671   // immediately cast everything to a float and operate entirely in that domain.
   11672   if (VT.isInteger() && !Subtarget.hasAVX2()) {
   11673     int ElementBits = VT.getScalarSizeInBits();
   11674     if (ElementBits < 32) {
    11675       // No floating point type is available; if we can't use the bit
    11676       // operations for masking/blending, then decompose into 128-bit vectors.
   11677       if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
   11678         return V;
   11679       if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
   11680         return V;
   11681       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   11682     }
   11683 
   11684     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
   11685                                 VT.getVectorNumElements());
   11686     V1 = DAG.getBitcast(FpVT, V1);
   11687     V2 = DAG.getBitcast(FpVT, V2);
   11688     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
   11689   }
   11690 
   11691   switch (VT.SimpleTy) {
   11692   case MVT::v4f64:
   11693     return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   11694   case MVT::v4i64:
   11695     return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   11696   case MVT::v8f32:
   11697     return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   11698   case MVT::v8i32:
   11699     return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   11700   case MVT::v16i16:
   11701     return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   11702   case MVT::v32i8:
   11703     return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   11704 
   11705   default:
   11706     llvm_unreachable("Not a valid 256-bit x86 vector type!");
   11707   }
   11708 }
   11709 
    11710 /// \brief Try to lower a vector shuffle as 128-bit shuffles.
   11711 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
   11712                                         ArrayRef<int> Mask, SDValue V1,
   11713                                         SDValue V2, SelectionDAG &DAG) {
   11714   assert(VT.getScalarSizeInBits() == 64 &&
   11715          "Unexpected element type size for 128bit shuffle.");
   11716 
    11717   // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
    11718   // is most probably a better solution for that case.
   11719   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
   11720 
   11721   SmallVector<int, 4> WidenedMask;
   11722   if (!canWidenShuffleElements(Mask, WidenedMask))
   11723     return SDValue();
   11724 
   11725   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
    11726   // Ensure each half of the result uses elements from a single source op.
   11727   int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
   11728   for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
   11729     if (WidenedMask[i] == SM_SentinelZero)
   11730       return SDValue();
   11731     if (WidenedMask[i] == SM_SentinelUndef)
   11732       continue;
   11733 
   11734     SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
   11735     unsigned OpIndex = (i < Size/2) ? 0 : 1;
   11736     if (Ops[OpIndex].isUndef())
   11737       Ops[OpIndex] = Op;
   11738     else if (Ops[OpIndex] != Op)
   11739       return SDValue();
   11740   }
   11741 
   11742   // Form a 128-bit permutation.
   11743   // Convert the 64-bit shuffle mask selection values into 128-bit selection
   11744   // bits defined by a vshuf64x2 instruction's immediate control byte.
   11745   unsigned PermMask = 0, Imm = 0;
   11746   unsigned ControlBitsNum = WidenedMask.size() / 2;
   11747 
   11748   for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
   11749     // Use first element in place of undef mask.
   11750     Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
   11751     PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
   11752   }
   11753 
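            // For example (as computed by the loop above), a widened mask of
            // {0, 1, 4, 5} takes the two low 128-bit lanes of V1 followed by the
            // two low 128-bit lanes of V2 and encodes as the immediate 0x44
            // (two control bits per 128-bit lane of the result).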
   11754   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
   11755                      DAG.getConstant(PermMask, DL, MVT::i8));
   11756 }
   11757 
   11758 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
   11759                                            ArrayRef<int> Mask, SDValue V1,
   11760                                            SDValue V2, SelectionDAG &DAG) {
   11761 
   11762   assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
   11763 
   11764   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
   11765   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
   11766 
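            // Materialize the shuffle mask as a constant vector of matching element
            // width; it becomes the index operand of VPERMV (one source) or VPERMV3
            // (two sources).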
   11767   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
   11768   if (V2.isUndef())
   11769     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
   11770 
   11771   return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
   11772 }
   11773 
   11774 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
   11775 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11776                                        SDValue V1, SDValue V2,
   11777                                        const X86Subtarget &Subtarget,
   11778                                        SelectionDAG &DAG) {
   11779   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
   11780   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
   11781   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   11782 
   11783   if (V2.isUndef()) {
   11784     // Use low duplicate instructions for masks that match their pattern.
   11785     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
   11786       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
   11787 
   11788     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
   11789       // Non-half-crossing single input shuffles can be lowered with an
   11790       // interleaved permutation.
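                // Bit i of the immediate picks the odd (high) element of the
                // 128-bit lane holding result element i. For example, the identity
                // mask {0, 1, 2, 3, 4, 5, 6, 7} encodes as the immediate 0xAA.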
   11791       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
   11792                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
   11793                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
   11794                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
   11795       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
   11796                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
   11797     }
   11798 
   11799     SmallVector<int, 4> RepeatedMask;
   11800     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
   11801       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
   11802                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   11803   }
   11804 
   11805   if (SDValue Shuf128 =
   11806           lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
   11807     return Shuf128;
   11808 
   11809   if (SDValue Unpck =
   11810           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
   11811     return Unpck;
   11812 
   11813   // Check if the blend happens to exactly fit that of SHUFPD.
   11814   if (SDValue Op =
   11815       lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
   11816     return Op;
   11817 
   11818   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
   11819 }
   11820 
   11821 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
    11822 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11823                                         SDValue V1, SDValue V2,
   11824                                         const X86Subtarget &Subtarget,
   11825                                         SelectionDAG &DAG) {
   11826   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
   11827   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
   11828   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   11829 
   11830   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   11831   // options to efficiently lower the shuffle.
   11832   SmallVector<int, 4> RepeatedMask;
   11833   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
   11834     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
   11835 
   11836     // Use even/odd duplicate instructions for masks that match their pattern.
   11837     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
   11838       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
   11839     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
   11840       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
   11841 
   11842     if (V2.isUndef())
   11843       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
   11844                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   11845 
   11846     // Use dedicated unpack instructions for masks that match their pattern.
   11847     if (SDValue Unpck =
   11848             lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
   11849       return Unpck;
   11850 
   11851     // Otherwise, fall back to a SHUFPS sequence.
   11852     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
   11853   }
   11854 
   11855   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
   11856 }
   11857 
   11858 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
   11859 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11860                                        SDValue V1, SDValue V2,
   11861                                        const X86Subtarget &Subtarget,
   11862                                        SelectionDAG &DAG) {
   11863   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   11864   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   11865   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   11866 
   11867   if (SDValue Shuf128 =
   11868           lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
   11869     return Shuf128;
   11870 
   11871   if (V2.isUndef()) {
   11872     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
   11873     // can use lower latency instructions that will operate on all four
   11874     // 128-bit lanes.
   11875     SmallVector<int, 2> Repeated128Mask;
   11876     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
   11877       SmallVector<int, 4> PSHUFDMask;
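                // Scale the repeated 2 x i64 mask up to a 4 x i32 PSHUFD mask;
                // e.g. a repeated mask of {1, 0} becomes {2, 3, 0, 1}.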
   11878       scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
   11879       return DAG.getBitcast(
   11880           MVT::v8i64,
   11881           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
   11882                       DAG.getBitcast(MVT::v16i32, V1),
   11883                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   11884     }
   11885 
   11886     SmallVector<int, 4> Repeated256Mask;
   11887     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
   11888       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
   11889                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
   11890   }
   11891 
   11892   // Try to use shift instructions.
   11893   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
   11894                                                 Subtarget, DAG))
   11895     return Shift;
   11896 
   11897   if (SDValue Unpck =
   11898           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
   11899     return Unpck;
   11900 
   11901   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
   11902 }
   11903 
   11904 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
   11905 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11906                                         SDValue V1, SDValue V2,
   11907                                         const X86Subtarget &Subtarget,
   11908                                         SelectionDAG &DAG) {
   11909   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   11910   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   11911   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   11912 
   11913   // If the shuffle mask is repeated in each 128-bit lane we can use more
   11914   // efficient instructions that mirror the shuffles across the four 128-bit
   11915   // lanes.
   11916   SmallVector<int, 4> RepeatedMask;
   11917   if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
   11918     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
   11919     if (V2.isUndef())
   11920       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
   11921                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   11922 
   11923     // Use dedicated unpack instructions for masks that match their pattern.
   11924     if (SDValue V =
   11925             lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
   11926       return V;
   11927   }
   11928 
   11929   // Try to use shift instructions.
   11930   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
   11931                                                 Subtarget, DAG))
   11932     return Shift;
   11933 
   11934   // Try to use byte rotation instructions.
   11935   if (Subtarget.hasBWI())
   11936     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   11937             DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
   11938       return Rotate;
   11939 
   11940   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
   11941 }
   11942 
   11943 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
   11944 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11945                                         SDValue V1, SDValue V2,
   11946                                         const X86Subtarget &Subtarget,
   11947                                         SelectionDAG &DAG) {
   11948   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
   11949   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
   11950   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   11951   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
   11952 
   11953   // Use dedicated unpack instructions for masks that match their pattern.
   11954   if (SDValue V =
   11955           lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
   11956     return V;
   11957 
   11958   // Try to use shift instructions.
   11959   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
   11960                                                 Subtarget, DAG))
   11961     return Shift;
   11962 
   11963   // Try to use byte rotation instructions.
   11964   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   11965           DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
   11966     return Rotate;
   11967 
   11968   if (V2.isUndef()) {
   11969     SmallVector<int, 8> RepeatedMask;
   11970     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
   11971       // As this is a single-input shuffle, the repeated mask should be
   11972       // a strictly valid v8i16 mask that we can pass through to the v8i16
   11973       // lowering to handle even the v32 case.
   11974       return lowerV8I16GeneralSingleInputVectorShuffle(
   11975           DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
   11976     }
   11977   }
   11978 
   11979   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
   11980 }
   11981 
   11982 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
   11983 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11984                                        SDValue V1, SDValue V2,
   11985                                        const X86Subtarget &Subtarget,
   11986                                        SelectionDAG &DAG) {
   11987   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
   11988   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
   11989   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
   11990   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
   11991 
   11992   // Use dedicated unpack instructions for masks that match their pattern.
   11993   if (SDValue V =
   11994           lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
   11995     return V;
   11996 
   11997   // Try to use shift instructions.
   11998   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
   11999                                                 Subtarget, DAG))
   12000     return Shift;
   12001 
   12002   // Try to use byte rotation instructions.
   12003   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   12004           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
   12005     return Rotate;
   12006 
   12007   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
   12008                                                     V2, Subtarget, DAG))
   12009     return PSHUFB;
   12010 
   12011   // FIXME: Implement direct support for this type!
   12012   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
   12013 }
   12014 
   12015 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
   12016 ///
   12017 /// This routine either breaks down the specific type of a 512-bit x86 vector
   12018 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
   12019 /// together based on the available instructions.
   12020 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   12021                                         MVT VT, SDValue V1, SDValue V2,
   12022                                         const X86Subtarget &Subtarget,
   12023                                         SelectionDAG &DAG) {
   12024   assert(Subtarget.hasAVX512() &&
    12025          "Cannot lower 512-bit vectors w/o basic ISA!");
   12026 
   12027   // Check for being able to broadcast a single element.
   12028   if (SDValue Broadcast =
   12029           lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
   12030     return Broadcast;
   12031 
   12032   // Dispatch to each element type for lowering. If we don't have support for
   12033   // specific element type shuffles at 512 bits, immediately split them and
   12034   // lower them. Each lowering routine of a given type is allowed to assume that
   12035   // the requisite ISA extensions for that element type are available.
   12036   switch (VT.SimpleTy) {
   12037   case MVT::v8f64:
   12038     return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   12039   case MVT::v16f32:
   12040     return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   12041   case MVT::v8i64:
   12042     return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   12043   case MVT::v16i32:
   12044     return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   12045   case MVT::v32i16:
   12046     return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   12047   case MVT::v64i8:
   12048     return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   12049 
   12050   default:
   12051     llvm_unreachable("Not a valid 512-bit x86 vector type!");
   12052   }
   12053 }
   12054 
   12055 // Lower vXi1 vector shuffles.
    12056 // There is no dedicated instruction on AVX-512 that shuffles the masks.
    12057 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
    12058 // vector, shuffle it, and then truncate the result back.
   12059 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   12060                                       MVT VT, SDValue V1, SDValue V2,
   12061                                       const X86Subtarget &Subtarget,
   12062                                       SelectionDAG &DAG) {
   12063   assert(Subtarget.hasAVX512() &&
   12064          "Cannot lower 512-bit vectors w/o basic ISA!");
   12065   MVT ExtVT;
   12066   switch (VT.SimpleTy) {
   12067   default:
   12068     llvm_unreachable("Expected a vector of i1 elements");
   12069   case MVT::v2i1:
   12070     ExtVT = MVT::v2i64;
   12071     break;
   12072   case MVT::v4i1:
   12073     ExtVT = MVT::v4i32;
   12074     break;
   12075   case MVT::v8i1:
   12076     ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
   12077     break;
   12078   case MVT::v16i1:
   12079     ExtVT = MVT::v16i32;
   12080     break;
   12081   case MVT::v32i1:
   12082     ExtVT = MVT::v32i16;
   12083     break;
   12084   case MVT::v64i1:
   12085     ExtVT = MVT::v64i8;
   12086     break;
   12087   }
   12088 
   12089   if (ISD::isBuildVectorAllZeros(V1.getNode()))
   12090     V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
   12091   else if (ISD::isBuildVectorAllOnes(V1.getNode()))
   12092     V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
   12093   else
   12094     V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
   12095 
   12096   if (V2.isUndef())
   12097     V2 = DAG.getUNDEF(ExtVT);
   12098   else if (ISD::isBuildVectorAllZeros(V2.getNode()))
   12099     V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
   12100   else if (ISD::isBuildVectorAllOnes(V2.getNode()))
   12101     V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
   12102   else
   12103     V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
   12104   return DAG.getNode(ISD::TRUNCATE, DL, VT,
   12105                      DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
    12106 }

   12107 /// \brief Top-level lowering for x86 vector shuffles.
   12108 ///
   12109 /// This handles decomposition, canonicalization, and lowering of all x86
   12110 /// vector shuffles. Most of the specific lowering strategies are encapsulated
   12111 /// above in helper routines. The canonicalization attempts to widen shuffles
   12112 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
   12113 /// s.t. only one of the two inputs needs to be tested, etc.
   12114 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   12115                                   SelectionDAG &DAG) {
   12116   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   12117   ArrayRef<int> Mask = SVOp->getMask();
   12118   SDValue V1 = Op.getOperand(0);
   12119   SDValue V2 = Op.getOperand(1);
   12120   MVT VT = Op.getSimpleValueType();
   12121   int NumElements = VT.getVectorNumElements();
   12122   SDLoc DL(Op);
   12123   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
   12124 
   12125   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
   12126          "Can't lower MMX shuffles");
   12127 
   12128   bool V1IsUndef = V1.isUndef();
   12129   bool V2IsUndef = V2.isUndef();
   12130   if (V1IsUndef && V2IsUndef)
   12131     return DAG.getUNDEF(VT);
   12132 
    12133   // When we create a shuffle node, the UNDEF node goes in the second operand,
   12134   // but in some cases the first operand may be transformed to UNDEF.
   12135   // In this case we should just commute the node.
   12136   if (V1IsUndef)
   12137     return DAG.getCommutedVectorShuffle(*SVOp);
   12138 
   12139   // Check for non-undef masks pointing at an undef vector and make the masks
   12140   // undef as well. This makes it easier to match the shuffle based solely on
   12141   // the mask.
   12142   if (V2IsUndef)
   12143     for (int M : Mask)
   12144       if (M >= NumElements) {
   12145         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
   12146         for (int &M : NewMask)
   12147           if (M >= NumElements)
   12148             M = -1;
   12149         return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
   12150       }
   12151 
   12152   // We actually see shuffles that are entirely re-arrangements of a set of
   12153   // zero inputs. This mostly happens while decomposing complex shuffles into
   12154   // simple ones. Directly lower these as a buildvector of zeros.
   12155   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   12156   if (Zeroable.all())
   12157     return getZeroVector(VT, Subtarget, DAG, DL);
   12158 
   12159   // Try to collapse shuffles into using a vector type with fewer elements but
   12160   // wider element types. We cap this to not form integers or floating point
   12161   // elements wider than 64 bits, but it might be interesting to form i128
   12162   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
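            // For example, a v4i32 shuffle with the mask {0, 1, 4, 5} can be widened
            // to a v2i64 shuffle with the mask {0, 2}.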
   12163   SmallVector<int, 16> WidenedMask;
   12164   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
   12165       canWidenShuffleElements(Mask, WidenedMask)) {
   12166     MVT NewEltVT = VT.isFloatingPoint()
   12167                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
   12168                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
   12169     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
   12170     // Make sure that the new vector type is legal. For example, v2f64 isn't
   12171     // legal on SSE1.
   12172     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
   12173       V1 = DAG.getBitcast(NewVT, V1);
   12174       V2 = DAG.getBitcast(NewVT, V2);
   12175       return DAG.getBitcast(
   12176           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
   12177     }
   12178   }
   12179 
   12180   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
   12181   for (int M : Mask)
   12182     if (M < 0)
   12183       ++NumUndefElements;
   12184     else if (M < NumElements)
   12185       ++NumV1Elements;
   12186     else
   12187       ++NumV2Elements;
   12188 
   12189   // Commute the shuffle as needed such that more elements come from V1 than
   12190   // V2. This allows us to match the shuffle pattern strictly on how many
   12191   // elements come from V1 without handling the symmetric cases.
   12192   if (NumV2Elements > NumV1Elements)
   12193     return DAG.getCommutedVectorShuffle(*SVOp);
   12194 
   12195   assert(NumV1Elements > 0 && "No V1 indices");
   12196   assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
   12197 
    12198   // When the number of V1 and V2 elements is the same, try to minimize the
    12199   // number of uses of V2 in the low half of the vector. When that is tied,
    12200   // ensure that the sum of indices for V1 is equal to or lower than the sum
    12201   // of indices for V2. When those are equal, try to ensure that the number
    12202   // of odd indices for V1 is lower than the number of odd indices for V2.
   12203   if (NumV1Elements == NumV2Elements) {
   12204     int LowV1Elements = 0, LowV2Elements = 0;
   12205     for (int M : Mask.slice(0, NumElements / 2))
   12206       if (M >= NumElements)
   12207         ++LowV2Elements;
   12208       else if (M >= 0)
   12209         ++LowV1Elements;
   12210     if (LowV2Elements > LowV1Elements)
   12211       return DAG.getCommutedVectorShuffle(*SVOp);
   12212     if (LowV2Elements == LowV1Elements) {
   12213       int SumV1Indices = 0, SumV2Indices = 0;
   12214       for (int i = 0, Size = Mask.size(); i < Size; ++i)
   12215         if (Mask[i] >= NumElements)
   12216           SumV2Indices += i;
   12217         else if (Mask[i] >= 0)
   12218           SumV1Indices += i;
   12219       if (SumV2Indices < SumV1Indices)
   12220         return DAG.getCommutedVectorShuffle(*SVOp);
   12221       if (SumV2Indices == SumV1Indices) {
   12222         int NumV1OddIndices = 0, NumV2OddIndices = 0;
   12223         for (int i = 0, Size = Mask.size(); i < Size; ++i)
   12224           if (Mask[i] >= NumElements)
   12225             NumV2OddIndices += i % 2;
   12226           else if (Mask[i] >= 0)
   12227             NumV1OddIndices += i % 2;
   12228         if (NumV2OddIndices < NumV1OddIndices)
   12229           return DAG.getCommutedVectorShuffle(*SVOp);
   12230       }
   12231     }
   12232   }
   12233 
   12234   // For each vector width, delegate to a specialized lowering routine.
   12235   if (VT.is128BitVector())
   12236     return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
   12237 
   12238   if (VT.is256BitVector())
   12239     return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
   12240 
   12241   if (VT.is512BitVector())
   12242     return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
   12243 
   12244   if (Is1BitVector)
   12245     return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
   12246 
   12247   llvm_unreachable("Unimplemented!");
   12248 }
   12249 
   12250 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
   12251 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
   12252                                            const X86Subtarget &Subtarget,
   12253                                            SelectionDAG &DAG) {
   12254   SDValue Cond = Op.getOperand(0);
   12255   SDValue LHS = Op.getOperand(1);
   12256   SDValue RHS = Op.getOperand(2);
   12257   SDLoc dl(Op);
   12258   MVT VT = Op.getSimpleValueType();
   12259 
   12260   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   12261     return SDValue();
   12262   auto *CondBV = cast<BuildVectorSDNode>(Cond);
   12263 
    12264   // Only non-legal VSELECTs reach this lowering; convert those into generic
   12265   // shuffles and re-use the shuffle lowering path for blends.
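            // A constant-zero condition element selects the RHS lane (shuffle index
            // i + Size), any other constant selects the LHS lane (index i), and a
            // non-constant element becomes undef.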
   12266   SmallVector<int, 32> Mask;
   12267   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
   12268     SDValue CondElt = CondBV->getOperand(i);
   12269     Mask.push_back(
   12270         isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
   12271                                      : -1);
   12272   }
   12273   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
   12274 }
   12275 
   12276 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   12277   // A vselect where all conditions and data are constants can be optimized into
   12278   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   12279   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
   12280       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
   12281       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
   12282     return SDValue();
   12283 
   12284   // Try to lower this to a blend-style vector shuffle. This can handle all
   12285   // constant condition cases.
   12286   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
   12287     return BlendOp;
   12288 
   12289   // Variable blends are only legal from SSE4.1 onward.
   12290   if (!Subtarget.hasSSE41())
   12291     return SDValue();
   12292 
   12293   // Only some types will be legal on some subtargets. If we can emit a legal
    12294   // VSELECT-matching blend, return Op, but if we need to expand, return
   12295   // a null value.
   12296   switch (Op.getSimpleValueType().SimpleTy) {
   12297   default:
   12298     // Most of the vector types have blends past SSE4.1.
   12299     return Op;
   12300 
   12301   case MVT::v32i8:
   12302     // The byte blends for AVX vectors were introduced only in AVX2.
   12303     if (Subtarget.hasAVX2())
   12304       return Op;
   12305 
   12306     return SDValue();
   12307 
   12308   case MVT::v8i16:
   12309   case MVT::v16i16:
   12310     // AVX-512 BWI and VLX features support VSELECT with i16 elements.
   12311     if (Subtarget.hasBWI() && Subtarget.hasVLX())
   12312       return Op;
   12313 
   12314     // FIXME: We should custom lower this by fixing the condition and using i8
   12315     // blends.
   12316     return SDValue();
   12317   }
   12318 }
   12319 
   12320 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   12321   MVT VT = Op.getSimpleValueType();
   12322   SDLoc dl(Op);
   12323 
   12324   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
   12325     return SDValue();
   12326 
   12327   if (VT.getSizeInBits() == 8) {
   12328     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
   12329                                   Op.getOperand(0), Op.getOperand(1));
   12330     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   12331                                   DAG.getValueType(VT));
   12332     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   12333   }
   12334 
   12335   if (VT.getSizeInBits() == 16) {
   12336     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
   12337     if (isNullConstant(Op.getOperand(1)))
   12338       return DAG.getNode(
   12339           ISD::TRUNCATE, dl, MVT::i16,
   12340           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   12341                       DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
   12342                       Op.getOperand(1)));
   12343     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
   12344                                   Op.getOperand(0), Op.getOperand(1));
   12345     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   12346                                   DAG.getValueType(VT));
   12347     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   12348   }
   12349 
   12350   if (VT == MVT::f32) {
   12351     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    12352     // the result back to an FR32 register. It's only worth matching if the
   12353     // result has a single use which is a store or a bitcast to i32.  And in
   12354     // the case of a store, it's not worth it if the index is a constant 0,
   12355     // because a MOVSSmr can be used instead, which is smaller and faster.
   12356     if (!Op.hasOneUse())
   12357       return SDValue();
   12358     SDNode *User = *Op.getNode()->use_begin();
   12359     if ((User->getOpcode() != ISD::STORE ||
   12360          isNullConstant(Op.getOperand(1))) &&
   12361         (User->getOpcode() != ISD::BITCAST ||
   12362          User->getValueType(0) != MVT::i32))
   12363       return SDValue();
   12364     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   12365                                   DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
   12366                                   Op.getOperand(1));
   12367     return DAG.getBitcast(MVT::f32, Extract);
   12368   }
   12369 
   12370   if (VT == MVT::i32 || VT == MVT::i64) {
   12371     // ExtractPS/pextrq works with constant index.
   12372     if (isa<ConstantSDNode>(Op.getOperand(1)))
   12373       return Op;
   12374   }
   12375   return SDValue();
   12376 }
   12377 
   12378 /// Extract one bit from mask vector, like v16i1 or v8i1.
   12379 /// AVX-512 feature.
   12380 SDValue
   12381 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
   12382   SDValue Vec = Op.getOperand(0);
   12383   SDLoc dl(Vec);
   12384   MVT VecVT = Vec.getSimpleValueType();
   12385   SDValue Idx = Op.getOperand(1);
   12386   MVT EltVT = Op.getSimpleValueType();
   12387 
   12388   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
   12389   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
   12390          "Unexpected vector type in ExtractBitFromMaskVector");
   12391 
    12392   // A variable index can't be handled in mask registers;
    12393   // extend the vector to VR512.
   12394   if (!isa<ConstantSDNode>(Idx)) {
   12395     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
   12396     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
   12397     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   12398                               ExtVT.getVectorElementType(), Ext, Idx);
   12399     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
   12400   }
   12401 
   12402   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   12403   if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
   12404     // Use kshiftlw/rw instruction.
   12405     VecVT = MVT::v16i1;
   12406     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
   12407                       DAG.getUNDEF(VecVT),
   12408                       Vec,
   12409                       DAG.getIntPtrConstant(0, dl));
   12410   }
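            // Shift the requested bit up to the most significant bit of the mask and
            // then back down to bit 0, clearing every other bit, so that extracting
            // element 0 yields the requested bit.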
    12411   unsigned MaxShift = VecVT.getVectorNumElements() - 1;
    12412   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
    12413                     DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
    12414   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
    12415                     DAG.getConstant(MaxShift, dl, MVT::i8));
   12416   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
   12417                        DAG.getIntPtrConstant(0, dl));
   12418 }
   12419 
   12420 SDValue
   12421 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   12422                                            SelectionDAG &DAG) const {
   12423   SDLoc dl(Op);
   12424   SDValue Vec = Op.getOperand(0);
   12425   MVT VecVT = Vec.getSimpleValueType();
   12426   SDValue Idx = Op.getOperand(1);
   12427 
   12428   if (Op.getSimpleValueType() == MVT::i1)
   12429     return ExtractBitFromMaskVector(Op, DAG);
   12430 
   12431   if (!isa<ConstantSDNode>(Idx)) {
   12432     if (VecVT.is512BitVector() ||
   12433         (VecVT.is256BitVector() && Subtarget.hasInt256() &&
   12434          VecVT.getVectorElementType().getSizeInBits() == 32)) {
   12435 
   12436       MVT MaskEltVT =
   12437         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
   12438       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
   12439                                     MaskEltVT.getSizeInBits());
   12440 
   12441       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
   12442       auto PtrVT = getPointerTy(DAG.getDataLayout());
   12443       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
   12444                                  getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
   12445                                  DAG.getConstant(0, dl, PtrVT));
   12446       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
   12447       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
   12448                          DAG.getConstant(0, dl, PtrVT));
   12449     }
   12450     return SDValue();
   12451   }
   12452 
   12453   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   12454 
   12455   // If this is a 256-bit vector result, first extract the 128-bit vector and
   12456   // then extract the element from the 128-bit vector.
   12457   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
   12458     // Get the 128-bit vector.
   12459     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
   12460     MVT EltVT = VecVT.getVectorElementType();
   12461 
   12462     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
   12463     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   12464 
   12465     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
   12466     // this can be done with a mask.
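              // For example, extracting element 9 of a v16i32 pulls out the 128-bit
              // chunk holding elements 8-11 and then takes element 1 of that chunk.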
   12467     IdxVal &= ElemsPerChunk - 1;
   12468     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
   12469                        DAG.getConstant(IdxVal, dl, MVT::i32));
   12470   }
   12471 
   12472   assert(VecVT.is128BitVector() && "Unexpected vector length");
   12473 
   12474   if (Subtarget.hasSSE41())
   12475     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
   12476       return Res;
   12477 
   12478   MVT VT = Op.getSimpleValueType();
   12479   // TODO: handle v16i8.
   12480   if (VT.getSizeInBits() == 16) {
   12481     if (IdxVal == 0)
   12482       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   12483                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   12484                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
   12485 
    12486     // Transform it so it matches pextrw, which produces a 32-bit result.
   12487     MVT EltVT = MVT::i32;
   12488     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
   12489     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
   12490                                   DAG.getValueType(VT));
   12491     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   12492   }
   12493 
   12494   if (VT.getSizeInBits() == 32) {
   12495     if (IdxVal == 0)
   12496       return Op;
   12497 
   12498     // SHUFPS the element to the lowest double word, then movss.
   12499     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
   12500     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
   12501     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   12502                        DAG.getIntPtrConstant(0, dl));
   12503   }
   12504 
   12505   if (VT.getSizeInBits() == 64) {
   12506     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   12507     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   12508     //        to match extract_elt for f64.
   12509     if (IdxVal == 0)
   12510       return Op;
   12511 
   12512     // UNPCKHPD the element to the lowest double word, then movsd.
   12513     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
   12514     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
   12515     int Mask[2] = { 1, -1 };
   12516     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
   12517     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   12518                        DAG.getIntPtrConstant(0, dl));
   12519   }
   12520 
   12521   return SDValue();
   12522 }
   12523 
   12524 /// Insert one bit to mask vector, like v16i1 or v8i1.
   12525 /// AVX-512 feature.
   12526 SDValue
   12527 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
   12528   SDLoc dl(Op);
   12529   SDValue Vec = Op.getOperand(0);
   12530   SDValue Elt = Op.getOperand(1);
   12531   SDValue Idx = Op.getOperand(2);
   12532   MVT VecVT = Vec.getSimpleValueType();
   12533 
   12534   if (!isa<ConstantSDNode>(Idx)) {
    12535     // Non-constant index. Extend the source and destination,
   12536     // insert element and then truncate the result.
   12537     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
   12538     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
   12539     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
   12540       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
   12541       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
   12542     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
   12543   }
   12544 
   12545   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   12546   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
   12547   if (IdxVal)
   12548     EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
   12549                            DAG.getConstant(IdxVal, dl, MVT::i8));
   12550   if (Vec.isUndef())
   12551     return EltInVec;
   12552   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
   12553 }
   12554 
   12555 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   12556                                                   SelectionDAG &DAG) const {
   12557   MVT VT = Op.getSimpleValueType();
   12558   MVT EltVT = VT.getVectorElementType();
   12559   unsigned NumElts = VT.getVectorNumElements();
   12560 
   12561   if (EltVT == MVT::i1)
   12562     return InsertBitToMaskVector(Op, DAG);
   12563 
   12564   SDLoc dl(Op);
   12565   SDValue N0 = Op.getOperand(0);
   12566   SDValue N1 = Op.getOperand(1);
   12567   SDValue N2 = Op.getOperand(2);
   12568   if (!isa<ConstantSDNode>(N2))
   12569     return SDValue();
   12570   auto *N2C = cast<ConstantSDNode>(N2);
   12571   unsigned IdxVal = N2C->getZExtValue();
   12572 
    12573   // If we are clearing out an element, we do this more efficiently with a
   12574   // blend shuffle than a costly integer insertion.
   12575   // TODO: would other rematerializable values (e.g. allbits) benefit as well?
   12576   // TODO: pre-SSE41 targets will tend to use bit masking - this could still
   12577   // be beneficial if we are inserting several zeros and can combine the masks.
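            // For example, inserting zero into element 2 of a v4i32 uses the blend
            // mask {0, 1, 6, 3}, taking lane 2 from the zero vector.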
   12578   if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
   12579     SmallVector<int, 8> ClearMask;
   12580     for (unsigned i = 0; i != NumElts; ++i)
   12581       ClearMask.push_back(i == IdxVal ? i + NumElts : i);
   12582     SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
   12583     return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
   12584   }
   12585 
   12586   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
   12587   // into that, and then insert the subvector back into the result.
   12588   if (VT.is256BitVector() || VT.is512BitVector()) {
   12589     // With a 256-bit vector, we can insert into the zero element efficiently
   12590     // using a blend if we have AVX or AVX2 and the right data type.
   12591     if (VT.is256BitVector() && IdxVal == 0) {
   12592       // TODO: It is worthwhile to cast integer to floating point and back
   12593       // and incur a domain crossing penalty if that's what we'll end up
   12594       // doing anyway after extracting to a 128-bit vector.
   12595       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
   12596           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
   12597         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
   12598         N2 = DAG.getIntPtrConstant(1, dl);
   12599         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
   12600       }
   12601     }
   12602 
   12603     // Get the desired 128-bit vector chunk.
   12604     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
   12605 
   12606     // Insert the element into the desired chunk.
   12607     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
   12608     assert(isPowerOf2_32(NumEltsIn128));
    12609     // Since NumEltsIn128 is a power of 2 we can use a mask instead of modulo.
   12610     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
   12611 
   12612     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
   12613                     DAG.getConstant(IdxIn128, dl, MVT::i32));
   12614 
   12615     // Insert the changed part back into the bigger vector
   12616     return insert128BitVector(N0, V, IdxVal, DAG, dl);
   12617   }
   12618   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
   12619 
   12620   if (Subtarget.hasSSE41()) {
   12621     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
   12622       unsigned Opc;
   12623       if (VT == MVT::v8i16) {
   12624         Opc = X86ISD::PINSRW;
   12625       } else {
   12626         assert(VT == MVT::v16i8);
   12627         Opc = X86ISD::PINSRB;
   12628       }
   12629 
    12630       // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    12631       // second argument.
   12632       if (N1.getValueType() != MVT::i32)
   12633         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   12634       if (N2.getValueType() != MVT::i32)
   12635         N2 = DAG.getIntPtrConstant(IdxVal, dl);
   12636       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
   12637     }
   12638 
   12639     if (EltVT == MVT::f32) {
   12640       // Bits [7:6] of the constant are the source select. This will always be
   12641       //   zero here. The DAG Combiner may combine an extract_elt index into
   12642       //   these bits. For example (insert (extract, 3), 2) could be matched by
   12643       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
   12644       // Bits [5:4] of the constant are the destination select. This is the
   12645       //   value of the incoming immediate.
   12646       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
   12647       //   combine either bitwise AND or insert of float 0.0 to set these bits.
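                // For example, inserting into lane 2 (IdxVal == 2) uses the immediate
                // (2 << 4) == 0x20: source select 0, destination select 2, zero mask 0.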
   12648 
   12649       bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
   12650       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
   12651         // If this is an insertion of 32-bits into the low 32-bits of
   12652         // a vector, we prefer to generate a blend with immediate rather
   12653         // than an insertps. Blends are simpler operations in hardware and so
   12654         // will always have equal or better performance than insertps.
   12655         // But if optimizing for size and there's a load folding opportunity,
   12656         // generate insertps because blendps does not have a 32-bit memory
   12657         // operand form.
   12658         N2 = DAG.getIntPtrConstant(1, dl);
   12659         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   12660         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
   12661       }
   12662       N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
    12663       // Create this as a scalar to vector.
   12664       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   12665       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
   12666     }
   12667 
   12668     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
   12669       // PINSR* works with constant index.
   12670       return Op;
   12671     }
   12672   }
   12673 
   12674   if (EltVT == MVT::i8)
   12675     return SDValue();
   12676 
   12677   if (EltVT.getSizeInBits() == 16) {
    12678     // Transform it so it matches pinsrw, which expects a 16-bit value in a
    12679     // GR32 as its second argument.
   12680     if (N1.getValueType() != MVT::i32)
   12681       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   12682     if (N2.getValueType() != MVT::i32)
   12683       N2 = DAG.getIntPtrConstant(IdxVal, dl);
   12684     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   12685   }
   12686   return SDValue();
   12687 }
   12688 
   12689 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   12690   SDLoc dl(Op);
   12691   MVT OpVT = Op.getSimpleValueType();
   12692 
   12693   // If this is a 256-bit vector result, first insert into a 128-bit
   12694   // vector and then insert into the 256-bit vector.
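            // For example, (v8i32 scalar_to_vector x) becomes
            //   (insert_subvector (v8i32 undef), (v4i32 scalar_to_vector x), 0).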
   12695   if (!OpVT.is128BitVector()) {
   12696     // Insert into a 128-bit vector.
   12697     unsigned SizeFactor = OpVT.getSizeInBits()/128;
   12698     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
   12699                                  OpVT.getVectorNumElements() / SizeFactor);
   12700 
   12701     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
   12702 
   12703     // Insert the 128-bit vector.
   12704     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
   12705   }
   12706 
   12707   if (OpVT == MVT::v1i64 &&
   12708       Op.getOperand(0).getValueType() == MVT::i64)
   12709     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
   12710 
   12711   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   12712   assert(OpVT.is128BitVector() && "Expected an SSE type!");
   12713   return DAG.getBitcast(
   12714       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
   12715 }
   12716 
   12717 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
    12718 // a simple subregister reference or explicit instructions to grab the
    12719 // upper bits of a vector.
   12720 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
   12721                                       SelectionDAG &DAG) {
   12722   SDLoc dl(Op);
   12723   SDValue In =  Op.getOperand(0);
   12724   SDValue Idx = Op.getOperand(1);
   12725   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   12726   MVT ResVT   = Op.getSimpleValueType();
   12727   MVT InVT    = In.getSimpleValueType();
   12728 
   12729   if (Subtarget.hasFp256()) {
   12730     if (ResVT.is128BitVector() &&
   12731         (InVT.is256BitVector() || InVT.is512BitVector()) &&
   12732         isa<ConstantSDNode>(Idx)) {
   12733       return extract128BitVector(In, IdxVal, DAG, dl);
   12734     }
   12735     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
   12736         isa<ConstantSDNode>(Idx)) {
   12737       return extract256BitVector(In, IdxVal, DAG, dl);
   12738     }
   12739   }
   12740   return SDValue();
   12741 }
   12742 
   12743 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
   12744 // simple superregister reference or explicit instructions to insert
   12745 // the upper bits of a vector.
   12746 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
   12747                                      SelectionDAG &DAG) {
   12748   if (!Subtarget.hasAVX())
   12749     return SDValue();
   12750 
   12751   SDLoc dl(Op);
   12752   SDValue Vec = Op.getOperand(0);
   12753   SDValue SubVec = Op.getOperand(1);
   12754   SDValue Idx = Op.getOperand(2);
   12755 
   12756   if (!isa<ConstantSDNode>(Idx))
   12757     return SDValue();
   12758 
   12759   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   12760   MVT OpVT = Op.getSimpleValueType();
   12761   MVT SubVecVT = SubVec.getSimpleValueType();
   12762 
   12763   // Fold two 16-byte subvector loads into one 32-byte load:
   12764   // (insert_subvector (insert_subvector undef, (load addr), 0),
   12765   //                   (load addr + 16), Elts/2)
   12766   // --> load32 addr
   12767   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
   12768       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
   12769       OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
   12770     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
   12771     if (Idx2 && Idx2->getZExtValue() == 0) {
   12772       // If needed, look through bitcasts to get to the load.
   12773       SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
   12774       if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
   12775         bool Fast;
   12776         unsigned Alignment = FirstLd->getAlignment();
   12777         unsigned AS = FirstLd->getAddressSpace();
   12778         const X86TargetLowering *TLI = Subtarget.getTargetLowering();
   12779         if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
   12780                                     OpVT, AS, Alignment, &Fast) && Fast) {
   12781           SDValue Ops[] = { SubVec2, SubVec };
   12782           if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
   12783             return Ld;
   12784         }
   12785       }
   12786     }
   12787   }
   12788 
   12789   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
   12790       SubVecVT.is128BitVector())
   12791     return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
   12792 
   12793   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
   12794     return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
   12795 
   12796   if (OpVT.getVectorElementType() == MVT::i1)
   12797     return insert1BitVector(Op, DAG, Subtarget);
   12798 
   12799   return SDValue();
   12800 }
   12801 
   12802 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
    12803 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
    12804 // one of the above-mentioned nodes. It has to be wrapped because otherwise
    12805 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
    12806 // be used to form an addressing mode. These wrapped nodes will be selected
   12807 // into MOV32ri.
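          // In 32-bit PIC mode, for example, a constant-pool reference lowers to
          //   (add (X86ISD::GlobalBaseReg), (X86ISD::Wrapper tconstpool<cp>)),
          // i.e. the PIC base register plus a base-relative offset.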
   12808 SDValue
   12809 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   12810   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   12811 
   12812   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   12813   // global base reg.
   12814   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
   12815   unsigned WrapperKind = X86ISD::Wrapper;
   12816   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12817 
   12818   if (Subtarget.isPICStyleRIPRel() &&
   12819       (M == CodeModel::Small || M == CodeModel::Kernel))
   12820     WrapperKind = X86ISD::WrapperRIP;
   12821 
   12822   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12823   SDValue Result = DAG.getTargetConstantPool(
   12824       CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
   12825   SDLoc DL(CP);
   12826   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   12827   // With PIC, the address is actually $g + Offset.
   12828   if (OpFlag) {
   12829     Result =
   12830         DAG.getNode(ISD::ADD, DL, PtrVT,
   12831                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   12832   }
   12833 
   12834   return Result;
   12835 }
   12836 
   12837 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   12838   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   12839 
   12840   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   12841   // global base reg.
   12842   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
   12843   unsigned WrapperKind = X86ISD::Wrapper;
   12844   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12845 
   12846   if (Subtarget.isPICStyleRIPRel() &&
   12847       (M == CodeModel::Small || M == CodeModel::Kernel))
   12848     WrapperKind = X86ISD::WrapperRIP;
   12849 
   12850   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12851   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
   12852   SDLoc DL(JT);
   12853   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   12854 
   12855   // With PIC, the address is actually $g + Offset.
   12856   if (OpFlag)
   12857     Result =
   12858         DAG.getNode(ISD::ADD, DL, PtrVT,
   12859                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   12860 
   12861   return Result;
   12862 }
   12863 
   12864 SDValue
   12865 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   12866   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   12867 
   12868   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   12869   // global base reg.
   12870   const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
   12871   unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
   12872   unsigned WrapperKind = X86ISD::Wrapper;
   12873   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12874 
   12875   if (Subtarget.isPICStyleRIPRel() &&
   12876       (M == CodeModel::Small || M == CodeModel::Kernel))
   12877     WrapperKind = X86ISD::WrapperRIP;
   12878 
   12879   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12880   SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
   12881 
   12882   SDLoc DL(Op);
   12883   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   12884 
   12885   // With PIC, the address is actually $g + Offset.
   12886   if (isPositionIndependent() && !Subtarget.is64Bit()) {
   12887     Result =
   12888         DAG.getNode(ISD::ADD, DL, PtrVT,
   12889                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   12890   }
   12891 
   12892   // For symbols that require a load from a stub to get the address, emit the
   12893   // load.
   12894   if (isGlobalStubReference(OpFlag))
   12895     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
   12896                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
   12897                          false, false, false, 0);
   12898 
   12899   return Result;
   12900 }
   12901 
   12902 SDValue
   12903 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   12904   // Create the TargetBlockAddressAddress node.
   12905   unsigned char OpFlags =
   12906     Subtarget.classifyBlockAddressReference();
   12907   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12908   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   12909   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   12910   SDLoc dl(Op);
   12911   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12912   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
   12913 
   12914   if (Subtarget.isPICStyleRIPRel() &&
   12915       (M == CodeModel::Small || M == CodeModel::Kernel))
   12916     Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
   12917   else
   12918     Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
   12919 
   12920   // With PIC, the address is actually $g + Offset.
   12921   if (isGlobalRelativeToPICBase(OpFlags)) {
   12922     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
   12923                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
   12924   }
   12925 
   12926   return Result;
   12927 }
   12928 
   12929 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
   12930                                               const SDLoc &dl, int64_t Offset,
   12931                                               SelectionDAG &DAG) const {
   12932   // Create the TargetGlobalAddress node, folding in the constant
   12933   // offset if it is legal.
   12934   unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
   12935   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12936   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12937   SDValue Result;
   12938   if (OpFlags == X86II::MO_NO_FLAG &&
   12939       X86::isOffsetSuitableForCodeModel(Offset, M)) {
   12940     // A direct static reference to a global.
   12941     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
   12942     Offset = 0;
   12943   } else {
   12944     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
   12945   }
   12946 
   12947   if (Subtarget.isPICStyleRIPRel() &&
   12948       (M == CodeModel::Small || M == CodeModel::Kernel))
   12949     Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
   12950   else
   12951     Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
   12952 
   12953   // With PIC, the address is actually $g + Offset.
   12954   if (isGlobalRelativeToPICBase(OpFlags)) {
   12955     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
   12956                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
   12957   }
   12958 
   12959   // For globals that require a load from a stub to get the address, emit the
   12960   // load.
   12961   if (isGlobalStubReference(OpFlags))
   12962     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
   12963                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
   12964                          false, false, false, 0);
   12965 
   12966   // If there was a non-zero offset that we didn't fold, create an explicit
   12967   // addition for it.
   12968   if (Offset != 0)
   12969     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
   12970                          DAG.getConstant(Offset, dl, PtrVT));
   12971 
   12972   return Result;
   12973 }
   12974 
   12975 SDValue
   12976 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   12977   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   12978   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
   12979   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
   12980 }
   12981 
   12982 static SDValue
   12983 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   12984            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
   12985            unsigned char OperandFlags, bool LocalDynamic = false) {
   12986   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   12987   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   12988   SDLoc dl(GA);
   12989   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   12990                                            GA->getValueType(0),
   12991                                            GA->getOffset(),
   12992                                            OperandFlags);
   12993 
   12994   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
   12995                                            : X86ISD::TLSADDR;
   12996 
   12997   if (InFlag) {
   12998     SDValue Ops[] = { Chain,  TGA, *InFlag };
   12999     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   13000   } else {
   13001     SDValue Ops[]  = { Chain, TGA };
   13002     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   13003   }
   13004 
   13005   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
   13006   MFI->setAdjustsStack(true);
   13007   MFI->setHasCalls(true);
   13008 
   13009   SDValue Flag = Chain.getValue(1);
   13010   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
   13011 }
   13012 
   13013 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
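          // The TLSADDR pseudo built here is later expanded to (roughly) the
          // standard ELF sequence
          //   leal x@tlsgd(,%ebx,1), %eax
          //   call ___tls_get_addr@PLT
          // with %ebx holding the GOT base and the result returned in %eax.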
   13014 static SDValue
   13015 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   13016                                 const EVT PtrVT) {
   13017   SDValue InFlag;
   13018   SDLoc dl(GA);  // ? function entry point might be better
   13019   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   13020                                    DAG.getNode(X86ISD::GlobalBaseReg,
   13021                                                SDLoc(), PtrVT), InFlag);
   13022   InFlag = Chain.getValue(1);
   13023 
   13024   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
   13025 }
   13026 
   13027 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
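          // The 64-bit form expands to (roughly)
          //   leaq x@tlsgd(%rip), %rdi
          //   call __tls_get_addr@PLT
          // padded to the length the ABI mandates so the linker can relax it;
          // the result is returned in %rax.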
   13028 static SDValue
   13029 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   13030                                 const EVT PtrVT) {
   13031   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
   13032                     X86::RAX, X86II::MO_TLSGD);
   13033 }
   13034 
   13035 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   13036                                            SelectionDAG &DAG,
   13037                                            const EVT PtrVT,
   13038                                            bool is64Bit) {
   13039   SDLoc dl(GA);
   13040 
   13041   // Get the start address of the TLS block for this module.
   13042   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
   13043       .getInfo<X86MachineFunctionInfo>();
   13044   MFI->incNumLocalDynamicTLSAccesses();
   13045 
   13046   SDValue Base;
   13047   if (is64Bit) {
   13048     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
   13049                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
   13050   } else {
   13051     SDValue InFlag;
   13052     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   13053         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
   13054     InFlag = Chain.getValue(1);
   13055     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
   13056                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
   13057   }
   13058 
   13059   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
   13060   // of Base.
   13061 
   13062   // Build x@dtpoff.
   13063   unsigned char OperandFlags = X86II::MO_DTPOFF;
   13064   unsigned WrapperKind = X86ISD::Wrapper;
   13065   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   13066                                            GA->getValueType(0),
   13067                                            GA->getOffset(), OperandFlags);
   13068   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   13069 
   13070   // Add x@dtpoff with the base.
   13071   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
   13072 }
   13073 
   13074 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
   13075 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   13076                                    const EVT PtrVT, TLSModel::Model model,
   13077                                    bool is64Bit, bool isPIC) {
   13078   SDLoc dl(GA);
   13079 
   13080   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
   13081   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
   13082                                                          is64Bit ? 257 : 256));
   13083 
   13084   SDValue ThreadPointer =
   13085       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
   13086                   MachinePointerInfo(Ptr), false, false, false, 0);
   13087 
   13088   unsigned char OperandFlags = 0;
   13089   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
    13090   // initial exec.
   13091   unsigned WrapperKind = X86ISD::Wrapper;
   13092   if (model == TLSModel::LocalExec) {
   13093     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
   13094   } else if (model == TLSModel::InitialExec) {
   13095     if (is64Bit) {
   13096       OperandFlags = X86II::MO_GOTTPOFF;
   13097       WrapperKind = X86ISD::WrapperRIP;
   13098     } else {
   13099       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
   13100     }
   13101   } else {
   13102     llvm_unreachable("Unexpected model");
   13103   }
   13104 
   13105   // emit "addl x@ntpoff,%eax" (local exec)
   13106   // or "addl x@indntpoff,%eax" (initial exec)
   13107   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
   13108   SDValue TGA =
   13109       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
   13110                                  GA->getOffset(), OperandFlags);
   13111   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   13112 
   13113   if (model == TLSModel::InitialExec) {
   13114     if (isPIC && !is64Bit) {
   13115       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
   13116                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
   13117                            Offset);
   13118     }
   13119 
   13120     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
   13121                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
   13122                          false, false, false, 0);
   13123   }
   13124 
    13125   // The address of the thread-local variable is the sum of the thread
    13126   // pointer and the offset of the variable.
   13127   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
   13128 }
   13129 
   13130 SDValue
   13131 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   13132 
   13133   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   13134 
   13135   if (DAG.getTarget().Options.EmulatedTLS)
   13136     return LowerToTLSEmulatedModel(GA, DAG);
   13137 
   13138   const GlobalValue *GV = GA->getGlobal();
   13139   auto PtrVT = getPointerTy(DAG.getDataLayout());
   13140   bool PositionIndependent = isPositionIndependent();
   13141 
   13142   if (Subtarget.isTargetELF()) {
   13143     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
   13144     switch (model) {
   13145       case TLSModel::GeneralDynamic:
   13146         if (Subtarget.is64Bit())
   13147           return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
   13148         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
   13149       case TLSModel::LocalDynamic:
   13150         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
   13151                                            Subtarget.is64Bit());
   13152       case TLSModel::InitialExec:
   13153       case TLSModel::LocalExec:
   13154         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
   13155                                    PositionIndependent);
   13156     }
   13157     llvm_unreachable("Unknown TLS model.");
   13158   }
   13159 
   13160   if (Subtarget.isTargetDarwin()) {
   13161     // Darwin only has one model of TLS.  Lower to that.
   13162     unsigned char OpFlag = 0;
   13163     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
   13164                            X86ISD::WrapperRIP : X86ISD::Wrapper;
   13165 
   13166     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   13167     // global base reg.
   13168     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
   13169     if (PIC32)
   13170       OpFlag = X86II::MO_TLVP_PIC_BASE;
   13171     else
   13172       OpFlag = X86II::MO_TLVP;
   13173     SDLoc DL(Op);
   13174     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
   13175                                                 GA->getValueType(0),
   13176                                                 GA->getOffset(), OpFlag);
   13177     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   13178 
   13179     // With PIC32, the address is actually $g + Offset.
   13180     if (PIC32)
   13181       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
   13182                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
   13183                            Offset);
   13184 
    13185     // Lowering the machine ISD node will make sure everything ends up in
    13186     // the right location.
   13187     SDValue Chain = DAG.getEntryNode();
   13188     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   13189     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
   13190     SDValue Args[] = { Chain, Offset };
   13191     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
   13192     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
   13193                                DAG.getIntPtrConstant(0, DL, true),
   13194                                Chain.getValue(1), DL);
   13195 
    13196     // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
   13197     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   13198     MFI->setAdjustsStack(true);
   13199 
   13200     // And our return value (tls address) is in the standard call return value
   13201     // location.
   13202     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
   13203     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
   13204   }
   13205 
   13206   if (Subtarget.isTargetKnownWindowsMSVC() ||
   13207       Subtarget.isTargetWindowsItanium() ||
   13208       Subtarget.isTargetWindowsGNU()) {
    13209     // Just use the implicit TLS architecture.
    13210     // We need to generate something similar to:
   13211     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
   13212     //                                  ; from TEB
    13213     //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
   13214     //   mov     rcx, qword [rdx+rcx*8]
   13215     //   mov     eax, .tls$:tlsvar
   13216     //   [rax+rcx] contains the address
   13217     // Windows 64bit: gs:0x58
   13218     // Windows 32bit: fs:__tls_array
   13219 
   13220     SDLoc dl(GA);
   13221     SDValue Chain = DAG.getEntryNode();
   13222 
   13223     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
   13224     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
   13225     // use its literal value of 0x2C.
   13226     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
   13227                                         ? Type::getInt8PtrTy(*DAG.getContext(),
   13228                                                              256)
   13229                                         : Type::getInt32PtrTy(*DAG.getContext(),
   13230                                                               257));
   13231 
   13232     SDValue TlsArray = Subtarget.is64Bit()
   13233                            ? DAG.getIntPtrConstant(0x58, dl)
   13234                            : (Subtarget.isTargetWindowsGNU()
   13235                                   ? DAG.getIntPtrConstant(0x2C, dl)
   13236                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
   13237 
   13238     SDValue ThreadPointer =
   13239         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
   13240                     false, false, 0);
   13241 
   13242     SDValue res;
   13243     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
   13244       res = ThreadPointer;
   13245     } else {
   13246       // Load the _tls_index variable
   13247       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
   13248       if (Subtarget.is64Bit())
   13249         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
   13250                              MachinePointerInfo(), MVT::i32, false, false,
   13251                              false, 0);
   13252       else
   13253         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
   13254                           false, false, 0);
   13255 
   13256       auto &DL = DAG.getDataLayout();
   13257       SDValue Scale =
   13258           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
   13259       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
   13260 
   13261       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
   13262     }
   13263 
   13264     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
   13265                       false, 0);
   13266 
    13267     // Get the offset of the start of the .tls section.
   13268     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   13269                                              GA->getValueType(0),
   13270                                              GA->getOffset(), X86II::MO_SECREL);
   13271     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
   13272 
    13273     // The address of the thread-local variable is the sum of the thread
    13274     // pointer and the offset of the variable.
   13275     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
   13276   }
   13277 
   13278   llvm_unreachable("TLS not implemented for this target.");
   13279 }
   13280 
    13281 /// Lower SRA_PARTS and friends, which take the two halves of a double-width
    13282 /// value plus a shift amount and return the two halves of the shifted result.
   13283 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   13284   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   13285   MVT VT = Op.getSimpleValueType();
   13286   unsigned VTBits = VT.getSizeInBits();
   13287   SDLoc dl(Op);
   13288   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   13289   SDValue ShOpLo = Op.getOperand(0);
   13290   SDValue ShOpHi = Op.getOperand(1);
   13291   SDValue ShAmt  = Op.getOperand(2);
   13292   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
    13293   // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
   13294   // during isel.
   13295   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   13296                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
   13297   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
   13298                                      DAG.getConstant(VTBits - 1, dl, MVT::i8))
   13299                        : DAG.getConstant(0, dl, VT);
   13300 
   13301   SDValue Tmp2, Tmp3;
   13302   if (Op.getOpcode() == ISD::SHL_PARTS) {
   13303     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
   13304     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   13305   } else {
   13306     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
   13307     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   13308   }
   13309 
    13310   // If the shift amount is larger than or equal to the width of a part, we
    13311   // can't rely on the results of shld/shrd. Insert a test and select the
    13312   // appropriate values for large shift amounts.
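            // For example, shifting a 64-bit value left by 40 on a 32-bit target must
            // produce Hi = Lo << 8 and Lo = 0, which shld by itself cannot do.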
   13313   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   13314                                 DAG.getConstant(VTBits, dl, MVT::i8));
   13315   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   13316                              AndNode, DAG.getConstant(0, dl, MVT::i8));
   13317 
   13318   SDValue Hi, Lo;
   13319   SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
   13320   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
   13321   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
   13322 
   13323   if (Op.getOpcode() == ISD::SHL_PARTS) {
   13324     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
   13325     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   13326   } else {
   13327     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
   13328     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   13329   }
   13330 
   13331   SDValue Ops[2] = { Lo, Hi };
   13332   return DAG.getMergeValues(Ops, dl);
   13333 }
   13334 
   13335 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   13336                                            SelectionDAG &DAG) const {
   13337   SDValue Src = Op.getOperand(0);
   13338   MVT SrcVT = Src.getSimpleValueType();
   13339   MVT VT = Op.getSimpleValueType();
   13340   SDLoc dl(Op);
   13341 
   13342   if (SrcVT.isVector()) {
   13343     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
   13344       return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
   13345                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
   13346                          DAG.getUNDEF(SrcVT)));
   13347     }
   13348     if (SrcVT.getVectorElementType() == MVT::i1) {
   13349       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
   13350       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
   13351                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
   13352     }
   13353     return SDValue();
   13354   }
   13355 
   13356   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
   13357          "Unknown SINT_TO_FP to lower!");
   13358 
   13359   // These are really Legal; return the operand so the caller accepts it as
   13360   // Legal.
   13361   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
   13362     return Op;
   13363   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
   13364       Subtarget.is64Bit()) {
   13365     return Op;
   13366   }
   13367 
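            // The remaining cases are handled by spilling the integer to a stack slot
            // and reloading it with an x87 FILD (see BuildFILD below).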
   13368   SDValue ValueToStore = Op.getOperand(0);
   13369   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
   13370       !Subtarget.is64Bit())
   13371     // Bitcasting to f64 here allows us to do a single 64-bit store from
   13372     // an SSE register, avoiding the store forwarding penalty that would come
   13373     // with two 32-bit stores.
   13374     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
   13375 
   13376   unsigned Size = SrcVT.getSizeInBits()/8;
   13377   MachineFunction &MF = DAG.getMachineFunction();
   13378   auto PtrVT = getPointerTy(MF.getDataLayout());
   13379   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
   13380   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   13381   SDValue Chain = DAG.getStore(
   13382       DAG.getEntryNode(), dl, ValueToStore, StackSlot,
   13383       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
   13384       false, 0);
   13385   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
   13386 }
   13387 
   13388 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
   13389                                      SDValue StackSlot,
   13390                                      SelectionDAG &DAG) const {
   13391   // Build the FILD
   13392   SDLoc DL(Op);
   13393   SDVTList Tys;
   13394   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
   13395   if (useSSE)
   13396     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
   13397   else
   13398     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
   13399 
   13400   unsigned ByteSize = SrcVT.getSizeInBits()/8;
   13401 
   13402   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
   13403   MachineMemOperand *MMO;
   13404   if (FI) {
   13405     int SSFI = FI->getIndex();
   13406     MMO = DAG.getMachineFunction().getMachineMemOperand(
   13407         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   13408         MachineMemOperand::MOLoad, ByteSize, ByteSize);
   13409   } else {
   13410     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
   13411     StackSlot = StackSlot.getOperand(1);
   13412   }
   13413   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   13414   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
   13415                                            X86ISD::FILD, DL,
   13416                                            Tys, Ops, SrcVT, MMO);
   13417 
   13418   if (useSSE) {
   13419     Chain = Result.getValue(1);
   13420     SDValue InFlag = Result.getValue(2);
   13421 
   13422     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
   13423     // shouldn't be necessary except that RFP cannot be live across
   13424     // multiple blocks. When stackifier is fixed, they can be uncoupled.
   13425     MachineFunction &MF = DAG.getMachineFunction();
   13426     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
   13427     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
   13428     auto PtrVT = getPointerTy(MF.getDataLayout());
   13429     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   13430     Tys = DAG.getVTList(MVT::Other);
   13431     SDValue Ops[] = {
   13432       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
   13433     };
   13434     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
   13435         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   13436         MachineMemOperand::MOStore, SSFISize, SSFISize);
   13437 
   13438     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
   13439                                     Ops, Op.getValueType(), MMO);
   13440     Result = DAG.getLoad(
   13441         Op.getValueType(), DL, Chain, StackSlot,
   13442         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   13443         false, false, false, 0);
   13444   }
   13445 
   13446   return Result;
   13447 }
   13448 
   13449 /// 64-bit unsigned integer to double expansion.
   13450 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
   13451                                                SelectionDAG &DAG) const {
    13452   // This algorithm is not obvious. Here is what we're trying to output:
   13453   /*
   13454      movq       %rax,  %xmm0
   13455      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
   13456      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
   13457      #ifdef __SSE3__
   13458        haddpd   %xmm0, %xmm0
   13459      #else
   13460        pshufd   $0x4e, %xmm0, %xmm1
   13461        addpd    %xmm1, %xmm0
   13462      #endif
   13463   */
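            // Why this works: after the unpack, the two doubles have 0x43300000 and
            // 0x45300000 in their high 32 bits and the input's low and high 32 bits in
            // their low 32 bits, i.e. the values 2^52 + lo32 and 2^84 + hi32 * 2^32.
            // The subtract removes the 2^52 and 2^84 biases, and the final add yields
            // lo32 + hi32 * 2^32, the original 64-bit value as a double.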
   13464 
   13465   SDLoc dl(Op);
   13466   LLVMContext *Context = DAG.getContext();
   13467 
   13468   // Build some magic constants.
   13469   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
   13470   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   13471   auto PtrVT = getPointerTy(DAG.getDataLayout());
   13472   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
   13473 
   13474   SmallVector<Constant*,2> CV1;
   13475   CV1.push_back(
   13476     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   13477                                       APInt(64, 0x4330000000000000ULL))));
   13478   CV1.push_back(
   13479     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   13480                                       APInt(64, 0x4530000000000000ULL))));
   13481   Constant *C1 = ConstantVector::get(CV1);
   13482   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
   13483 
   13484   // Load the 64-bit value into an XMM register.
   13485   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   13486                             Op.getOperand(0));
   13487   SDValue CLod0 =
   13488       DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
   13489                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   13490                   false, false, false, 16);
   13491   SDValue Unpck1 =
   13492       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
   13493 
   13494   SDValue CLod1 =
   13495       DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
   13496                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   13497                   false, false, false, 16);
   13498   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
   13499   // TODO: Are there any fast-math-flags to propagate here?
   13500   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   13501   SDValue Result;
   13502 
   13503   if (Subtarget.hasSSE3()) {
   13504     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
   13505     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   13506   } else {
   13507     SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
   13508     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
   13509                                            S2F, 0x4E, DAG);
   13510     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
   13511                          DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
   13512   }
   13513 
   13514   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
   13515                      DAG.getIntPtrConstant(0, dl));
   13516 }
   13517 
   13518 /// 32-bit unsigned integer to float expansion.
   13519 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
   13520                                                SelectionDAG &DAG) const {
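            // The trick: OR the zero-extended 32-bit value into the mantissa of the
            // double 2^52 (bit pattern 0x4330000000000000), giving exactly 2^52 + x;
            // subtracting the 2^52 bias then yields x with no rounding, since any
            // 32-bit value fits in a double's 53-bit significand.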
   13521   SDLoc dl(Op);
   13522   // FP constant to bias correct the final result.
   13523   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
   13524                                    MVT::f64);
   13525 
   13526   // Load the 32-bit value into an XMM register.
   13527   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
   13528                              Op.getOperand(0));
   13529 
   13530   // Zero out the upper parts of the register.
   13531   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
   13532 
   13533   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   13534                      DAG.getBitcast(MVT::v2f64, Load),
   13535                      DAG.getIntPtrConstant(0, dl));
   13536 
   13537   // Or the load with the bias.
   13538   SDValue Or = DAG.getNode(
   13539       ISD::OR, dl, MVT::v2i64,
   13540       DAG.getBitcast(MVT::v2i64,
   13541                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
   13542       DAG.getBitcast(MVT::v2i64,
   13543                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
   13544   Or =
   13545       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   13546                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
   13547 
   13548   // Subtract the bias.
   13549   // TODO: Are there any fast-math-flags to propagate here?
   13550   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
   13551 
   13552   // Handle final rounding.
   13553   MVT DestVT = Op.getSimpleValueType();
   13554 
   13555   if (DestVT.bitsLT(MVT::f64))
   13556     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
   13557                        DAG.getIntPtrConstant(0, dl));
   13558   if (DestVT.bitsGT(MVT::f64))
   13559     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
   13560 
   13561   // Handle final rounding.
   13562   return Sub;
   13563 }
   13564 
   13565 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
   13566                                      const X86Subtarget &Subtarget) {
   13567   // The algorithm is the following:
   13568   // #ifdef __SSE4_1__
   13569   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
   13570   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
   13571   //                                 (uint4) 0x53000000, 0xaa);
   13572   // #else
   13573   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
   13574   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
   13575   // #endif
   13576   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   13577   //     return (float4) lo + fhi;
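            // As IEEE singles, lo == 2^23 + (v & 0xffff) and
            // hi == 2^39 + ((v >> 16) << 16), so subtracting (0x1.0p39f + 0x1.0p23f)
            // from hi and adding lo reconstructs v, rounded once to float precision.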
   13578 
   13579   // We shouldn't use it when unsafe-fp-math is enabled though: we might later
   13580   // reassociate the two FADDs, and if we do that, the algorithm fails
   13581   // spectacularly (PR24512).
   13582   // FIXME: If we ever have some kind of Machine FMF, this should be marked
   13583   // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
   13584   // there's also the MachineCombiner reassociations happening on Machine IR.
   13585   if (DAG.getTarget().Options.UnsafeFPMath)
   13586     return SDValue();
   13587 
   13588   SDLoc DL(Op);
   13589   SDValue V = Op->getOperand(0);
   13590   MVT VecIntVT = V.getSimpleValueType();
   13591   bool Is128 = VecIntVT == MVT::v4i32;
   13592   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
    13593   // If we convert to something other than the supported type, e.g., to v4f64,
   13594   // abort early.
   13595   if (VecFloatVT != Op->getSimpleValueType(0))
   13596     return SDValue();
   13597 
   13598   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
   13599          "Unsupported custom type");
   13600 
    13601   // In the #ifdef/#else code, we have in common:
   13602   // - The vector of constants:
   13603   // -- 0x4b000000
   13604   // -- 0x53000000
   13605   // - A shift:
   13606   // -- v >> 16
   13607 
   13608   // Create the splat vector for 0x4b000000.
   13609   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
   13610   // Create the splat vector for 0x53000000.
   13611   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
   13612 
   13613   // Create the right shift.
   13614   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
   13615   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
   13616 
   13617   SDValue Low, High;
   13618   if (Subtarget.hasSSE41()) {
   13619     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
   13620     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
   13621     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
   13622     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
   13623     // Low will be bitcasted right away, so do not bother bitcasting back to its
   13624     // original type.
   13625     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
   13626                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
   13627     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
   13628     //                                 (uint4) 0x53000000, 0xaa);
   13629     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
   13630     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
   13631     // High will be bitcasted right away, so do not bother bitcasting back to
   13632     // its original type.
   13633     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
   13634                        VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
   13635   } else {
   13636     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
   13637     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
   13638     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
   13639     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
   13640 
   13641     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
   13642     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
   13643   }
   13644 
   13645   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
   13646   SDValue VecCstFAdd = DAG.getConstantFP(
   13647       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
   13648 
   13649   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   13650   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
   13651   // TODO: Are there any fast-math-flags to propagate here?
   13652   SDValue FHigh =
   13653       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
   13654   //     return (float4) lo + fhi;
   13655   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
   13656   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
   13657 }
   13658 
   13659 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
   13660                                                SelectionDAG &DAG) const {
   13661   SDValue N0 = Op.getOperand(0);
   13662   MVT SVT = N0.getSimpleValueType();
   13663   SDLoc dl(Op);
   13664 
   13665   switch (SVT.SimpleTy) {
   13666   default:
   13667     llvm_unreachable("Custom UINT_TO_FP is not supported!");
   13668   case MVT::v4i8:
   13669   case MVT::v4i16:
   13670   case MVT::v8i8:
   13671   case MVT::v8i16: {
   13672     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
   13673     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
   13674                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
   13675   }
   13676   case MVT::v4i32:
   13677   case MVT::v8i32:
   13678     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
   13679   case MVT::v16i8:
   13680   case MVT::v16i16:
   13681     assert(Subtarget.hasAVX512());
   13682     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
   13683                        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
   13684   }
   13685 }
   13686 
   13687 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   13688                                            SelectionDAG &DAG) const {
   13689   SDValue N0 = Op.getOperand(0);
   13690   SDLoc dl(Op);
   13691   auto PtrVT = getPointerTy(DAG.getDataLayout());
   13692 
   13693   if (Op.getSimpleValueType().isVector())
   13694     return lowerUINT_TO_FP_vec(Op, DAG);
   13695 
   13696   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
   13697   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
   13698   // the optimization here.
   13699   if (DAG.SignBitIsZero(N0))
   13700     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
   13701 
   13702   MVT SrcVT = N0.getSimpleValueType();
   13703   MVT DstVT = Op.getSimpleValueType();
   13704 
   13705   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
   13706       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
   13707     // Conversions from unsigned i32 to f32/f64 are legal,
   13708     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
   13709     return Op;
   13710   }
   13711 
   13712   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
   13713     return LowerUINT_TO_FP_i64(Op, DAG);
   13714   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
   13715     return LowerUINT_TO_FP_i32(Op, DAG);
   13716   if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
   13717     return SDValue();
   13718 
   13719   // Make a 64-bit buffer, and use it to build an FILD.
   13720   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
   13721   if (SrcVT == MVT::i32) {
   13722     SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
   13723     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   13724                                   StackSlot, MachinePointerInfo(),
   13725                                   false, false, 0);
   13726     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
   13727                                   OffsetSlot, MachinePointerInfo(),
   13728                                   false, false, 0);
   13729     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
   13730     return Fild;
   13731   }
   13732 
   13733   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
   13734   SDValue ValueToStore = Op.getOperand(0);
   13735   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
   13736     // Bitcasting to f64 here allows us to do a single 64-bit store from
   13737     // an SSE register, avoiding the store forwarding penalty that would come
   13738     // with two 32-bit stores.
   13739     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
   13740   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
   13741                                StackSlot, MachinePointerInfo(),
   13742                                false, false, 0);
   13743   // For i64 source, we need to add the appropriate power of 2 if the input
   13744   // was negative.  This is the same as the optimization in
    13745   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
   13746   // we must be careful to do the computation in x87 extended precision, not
   13747   // in SSE. (The generic code can't know it's OK to do this, or how to.)
   13748   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
   13749   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
   13750       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   13751       MachineMemOperand::MOLoad, 8, 8);
   13752 
   13753   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
   13754   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
   13755   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
   13756                                          MVT::i64, MMO);
   13757 
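            // 0x5F800000 is 2^64 as an IEEE single. FILD reads the stored bits as a
            // signed i64, so a value whose top bit is set comes back as x - 2^64;
            // adding the fudge factor restores x.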
   13758   APInt FF(32, 0x5F800000ULL);
   13759 
   13760   // Check whether the sign bit is set.
   13761   SDValue SignSet = DAG.getSetCC(
   13762       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
   13763       Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
   13764 
   13765   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
   13766   SDValue FudgePtr = DAG.getConstantPool(
   13767       ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
   13768 
   13769   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   13770   SDValue Zero = DAG.getIntPtrConstant(0, dl);
   13771   SDValue Four = DAG.getIntPtrConstant(4, dl);
   13772   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
   13773                                Zero, Four);
   13774   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
   13775 
   13776   // Load the value out, extending it from f32 to f80.
   13777   // FIXME: Avoid the extend by constructing the right constant pool?
   13778   SDValue Fudge = DAG.getExtLoad(
   13779       ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
   13780       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
   13781       false, false, false, 4);
   13782   // Extend everything to 80 bits to force it to be done on x87.
   13783   // TODO: Are there any fast-math-flags to propagate here?
   13784   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   13785   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
   13786                      DAG.getIntPtrConstant(0, dl));
   13787 }
   13788 
   13789 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
   13790 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
   13791 // just return an <SDValue(), SDValue()> pair.
   13792 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
   13793 // to i16, i32 or i64, and we lower it to a legal sequence.
   13794 // If lowered to the final integer result we return a <result, SDValue()> pair.
   13795 // Otherwise we lower it to a sequence ending with a FIST, return a
   13796 // <FIST, StackSlot> pair, and the caller is responsible for loading
   13797 // the final integer result from StackSlot.
   13798 std::pair<SDValue,SDValue>
   13799 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   13800                                    bool IsSigned, bool IsReplace) const {
   13801   SDLoc DL(Op);
   13802 
   13803   EVT DstTy = Op.getValueType();
   13804   EVT TheVT = Op.getOperand(0).getValueType();
   13805   auto PtrVT = getPointerTy(DAG.getDataLayout());
   13806 
   13807   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
   13808     // f16 must be promoted before using the lowering in this routine.
   13809     // fp128 does not use this lowering.
   13810     return std::make_pair(SDValue(), SDValue());
   13811   }
   13812 
   13813   // If using FIST to compute an unsigned i64, we'll need some fixup
   13814   // to handle values above the maximum signed i64.  A FIST is always
   13815   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
   13816   bool UnsignedFixup = !IsSigned &&
   13817                        DstTy == MVT::i64 &&
   13818                        (!Subtarget.is64Bit() ||
   13819                         !isScalarFPTypeInSSEReg(TheVT));
   13820 
   13821   if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
   13822     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
   13823     // The low 32 bits of the fist result will have the correct uint32 result.
   13824     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
   13825     DstTy = MVT::i64;
   13826   }
   13827 
   13828   assert(DstTy.getSimpleVT() <= MVT::i64 &&
   13829          DstTy.getSimpleVT() >= MVT::i16 &&
   13830          "Unknown FP_TO_INT to lower!");
   13831 
   13832   // These are really Legal.
   13833   if (DstTy == MVT::i32 &&
   13834       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   13835     return std::make_pair(SDValue(), SDValue());
   13836   if (Subtarget.is64Bit() &&
   13837       DstTy == MVT::i64 &&
   13838       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   13839     return std::make_pair(SDValue(), SDValue());
   13840 
   13841   // We lower FP->int64 into FISTP64 followed by a load from a temporary
   13842   // stack slot.
   13843   MachineFunction &MF = DAG.getMachineFunction();
   13844   unsigned MemSize = DstTy.getSizeInBits()/8;
   13845   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   13846   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   13847 
   13848   unsigned Opc;
   13849   switch (DstTy.getSimpleVT().SimpleTy) {
   13850   default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
   13851   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
   13852   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
   13853   case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
   13854   }
   13855 
   13856   SDValue Chain = DAG.getEntryNode();
   13857   SDValue Value = Op.getOperand(0);
   13858   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
   13859 
   13860   if (UnsignedFixup) {
   13861     //
   13862     // Conversion to unsigned i64 is implemented with a select,
   13863     // depending on whether the source value fits in the range
   13864     // of a signed i64.  Let Thresh be the FP equivalent of
   13865     // 0x8000000000000000ULL.
   13866     //
   13867     //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
   13868     //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
   13869     //  Fist-to-mem64 FistSrc
   13870     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
   13871     //  to XOR'ing the high 32 bits with Adjust.
   13872     //
   13873     // Being a power of 2, Thresh is exactly representable in all FP formats.
   13874     // For X87 we'd like to use the smallest FP type for this constant, but
   13875     // for DAG type consistency we have to match the FP operand type.
   13876 
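              // 0x5f000000 encodes 2^63 as an IEEE single, i.e. the FP value of
              // 0x8000000000000000ULL; it is converted below to match TheVT.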
   13877     APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
   13878     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
   13879     bool LosesInfo = false;
   13880     if (TheVT == MVT::f64)
   13881       // The rounding mode is irrelevant as the conversion should be exact.
   13882       Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
   13883                               &LosesInfo);
   13884     else if (TheVT == MVT::f80)
   13885       Status = Thresh.convert(APFloat::x87DoubleExtended,
   13886                               APFloat::rmNearestTiesToEven, &LosesInfo);
   13887 
   13888     assert(Status == APFloat::opOK && !LosesInfo &&
   13889            "FP conversion should have been exact");
   13890 
   13891     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
   13892 
   13893     SDValue Cmp = DAG.getSetCC(DL,
   13894                                getSetCCResultType(DAG.getDataLayout(),
   13895                                                   *DAG.getContext(), TheVT),
   13896                                Value, ThreshVal, ISD::SETLT);
   13897     Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
   13898                            DAG.getConstant(0, DL, MVT::i32),
   13899                            DAG.getConstant(0x80000000, DL, MVT::i32));
   13900     SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
   13901     Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
   13902                                               *DAG.getContext(), TheVT),
   13903                        Value, ThreshVal, ISD::SETLT);
   13904     Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
   13905   }
   13906 
   13907   // FIXME This causes a redundant load/store if the SSE-class value is already
   13908   // in memory, such as if it is on the callstack.
   13909   if (isScalarFPTypeInSSEReg(TheVT)) {
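              // The FP_TO_INT*_IN_MEM node below is selected to an x87 FIST, which
              // needs its input on the x87 stack, so spill the SSE value and reload
              // it with an FLD first.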
   13910     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
   13911     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
   13912                          MachinePointerInfo::getFixedStack(MF, SSFI), false,
   13913                          false, 0);
   13914     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
   13915     SDValue Ops[] = {
   13916       Chain, StackSlot, DAG.getValueType(TheVT)
   13917     };
   13918 
   13919     MachineMemOperand *MMO =
   13920         MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   13921                                 MachineMemOperand::MOLoad, MemSize, MemSize);
   13922     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
   13923     Chain = Value.getValue(1);
   13924     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   13925     StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   13926   }
   13927 
   13928   MachineMemOperand *MMO =
   13929       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   13930                               MachineMemOperand::MOStore, MemSize, MemSize);
   13931 
   13932   if (UnsignedFixup) {
   13933 
   13934     // Insert the FIST, load its result as two i32's,
   13935     // and XOR the high i32 with Adjust.
   13936 
   13937     SDValue FistOps[] = { Chain, Value, StackSlot };
   13938     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   13939                                            FistOps, DstTy, MMO);
   13940 
   13941     SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
   13942                                 MachinePointerInfo(),
   13943                                 false, false, false, 0);
   13944     SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
   13945 
   13946     SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
   13947                                  MachinePointerInfo(),
   13948                                  false, false, false, 0);
   13949     High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
   13950 
   13951     if (Subtarget.is64Bit()) {
   13952       // Join High32 and Low32 into a 64-bit result.
   13953       // (High32 << 32) | Low32
   13954       Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
   13955       High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
   13956       High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
   13957                            DAG.getConstant(32, DL, MVT::i8));
   13958       SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
   13959       return std::make_pair(Result, SDValue());
   13960     }
   13961 
   13962     SDValue ResultOps[] = { Low32, High32 };
   13963 
   13964     SDValue pair = IsReplace
   13965       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
   13966       : DAG.getMergeValues(ResultOps, DL);
   13967     return std::make_pair(pair, SDValue());
   13968   } else {
   13969     // Build the FP_TO_INT*_IN_MEM
   13970     SDValue Ops[] = { Chain, Value, StackSlot };
   13971     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   13972                                            Ops, DstTy, MMO);
   13973     return std::make_pair(FIST, StackSlot);
   13974   }
   13975 }
   13976 
   13977 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   13978                               const X86Subtarget &Subtarget) {
   13979   MVT VT = Op->getSimpleValueType(0);
   13980   SDValue In = Op->getOperand(0);
   13981   MVT InVT = In.getSimpleValueType();
   13982   SDLoc dl(Op);
   13983 
   13984   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
   13985     return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
   13986 
   13987   // Optimize vectors in AVX mode:
   13988   //
   13989   //   v8i16 -> v8i32
   13990   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
   13991   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
   13992   //   Concat upper and lower parts.
   13993   //
   13994   //   v4i32 -> v4i64
   13995   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
   13996   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
   13997   //   Concat upper and lower parts.
   13998   //
   13999 
   14000   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
   14001       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
   14002       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
   14003     return SDValue();
   14004 
   14005   if (Subtarget.hasInt256())
   14006     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
   14007 
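            // Without AVX2 there are no 256-bit integer unpacks, so build the extend
            // by interleaving In with zeros (ZERO_EXTEND) or undef (ANY_EXTEND) to
            // widen each 128-bit half, then concatenate the two halves.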
   14008   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
   14009   SDValue Undef = DAG.getUNDEF(InVT);
   14010   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
   14011   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   14012   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   14013 
   14014   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
   14015                              VT.getVectorNumElements()/2);
   14016 
   14017   OpLo = DAG.getBitcast(HVT, OpLo);
   14018   OpHi = DAG.getBitcast(HVT, OpHi);
   14019 
   14020   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   14021 }
   14022 
   14023 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   14024                   const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   14025   MVT VT = Op->getSimpleValueType(0);
   14026   SDValue In = Op->getOperand(0);
   14027   MVT InVT = In.getSimpleValueType();
   14028   SDLoc DL(Op);
   14029   unsigned int NumElts = VT.getVectorNumElements();
   14030   if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
   14031     return SDValue();
   14032 
   14033   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
   14034     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
   14035 
   14036   assert(InVT.getVectorElementType() == MVT::i1);
   14037 
    14038   // Widen VT if the result is a 256- or 128-bit vector and VLX is not supported.
   14039   MVT ExtVT = VT;
   14040   if (!VT.is512BitVector() && !Subtarget.hasVLX())
   14041     ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
   14042 
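            // Select 1 or 0 per mask lane in ExtVT; if we had to widen, truncate the
            // result back down to VT afterwards.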
   14043   SDValue One =
   14044    DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
   14045   SDValue Zero =
   14046    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
   14047 
   14048   SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
   14049   if (VT == ExtVT)
   14050     return SelectedVal;
   14051   return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
   14052 }
   14053 
   14054 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   14055                                SelectionDAG &DAG) {
   14056   if (Subtarget.hasFp256())
   14057     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
   14058       return Res;
   14059 
   14060   return SDValue();
   14061 }
   14062 
   14063 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   14064                                 SelectionDAG &DAG) {
   14065   SDLoc DL(Op);
   14066   MVT VT = Op.getSimpleValueType();
   14067   SDValue In = Op.getOperand(0);
   14068   MVT SVT = In.getSimpleValueType();
   14069 
   14070   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
   14071     return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
   14072 
   14073   if (Subtarget.hasFp256())
   14074     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
   14075       return Res;
   14076 
   14077   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
   14078          VT.getVectorNumElements() != SVT.getVectorNumElements());
   14079   return SDValue();
   14080 }
   14081 
   14082 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
   14083                                   const X86Subtarget &Subtarget) {
   14084 
   14085   SDLoc DL(Op);
   14086   MVT VT = Op.getSimpleValueType();
   14087   SDValue In = Op.getOperand(0);
   14088   MVT InVT = In.getSimpleValueType();
   14089 
   14090   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
   14091 
   14092   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
   14093   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
   14094   if (InVT.getScalarSizeInBits() <= 16) {
   14095     if (Subtarget.hasBWI()) {
    14096       // Legal; this will be selected to VPMOVB2M / VPMOVW2M.
    14097       // Packed byte shifts are not supported natively, so bitcast to words first.
   14098       MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
   14099       SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
   14100                                        DAG.getBitcast(ExtVT, In),
   14101                                        DAG.getConstant(ShiftInx, DL, ExtVT));
   14102       ShiftNode = DAG.getBitcast(InVT, ShiftNode);
   14103       return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
   14104     }
   14105     // Use TESTD/Q, extended vector to packed dword/qword.
   14106     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
   14107            "Unexpected vector type.");
   14108     unsigned NumElts = InVT.getVectorNumElements();
   14109     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
   14110     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
   14111     InVT = ExtVT;
   14112     ShiftInx = InVT.getScalarSizeInBits() - 1;
   14113   }
   14114 
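            // Only the original LSB survives the shift into the sign position, so the
            // per-element nonzero test done by TESTM reports exactly that bit.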
   14115   SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
   14116                                    DAG.getConstant(ShiftInx, DL, InVT));
   14117   return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
   14118 }
   14119 
   14120 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   14121   SDLoc DL(Op);
   14122   MVT VT = Op.getSimpleValueType();
   14123   SDValue In = Op.getOperand(0);
   14124   MVT InVT = In.getSimpleValueType();
   14125 
   14126   if (VT == MVT::i1) {
   14127     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
   14128            "Invalid scalar TRUNCATE operation");
   14129     if (InVT.getSizeInBits() >= 32)
   14130       return SDValue();
   14131     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
   14132     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
   14133   }
   14134   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
   14135          "Invalid TRUNCATE operation");
   14136 
   14137   if (VT.getVectorElementType() == MVT::i1)
   14138     return LowerTruncateVecI1(Op, DAG, Subtarget);
   14139 
   14140   // vpmovqb/w/d, vpmovdb/w, vpmovwb
   14141   if (Subtarget.hasAVX512()) {
    14142     // A word-to-byte truncation needs BWI; without it, go through v16i32 first.
   14143     if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
   14144       return DAG.getNode(X86ISD::VTRUNC, DL, VT,
   14145                          DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
   14146     return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
   14147   }
   14148   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
   14149     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
   14150     if (Subtarget.hasInt256()) {
   14151       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
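                // Viewed as v8i32, the low dword of each i64 sits at the even indices
                // (little endian), so elements {0,2,4,6} are the truncated values.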
   14152       In = DAG.getBitcast(MVT::v8i32, In);
   14153       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
   14154                                 ShufMask);
   14155       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
   14156                          DAG.getIntPtrConstant(0, DL));
   14157     }
   14158 
   14159     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   14160                                DAG.getIntPtrConstant(0, DL));
   14161     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   14162                                DAG.getIntPtrConstant(2, DL));
   14163     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
   14164     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
   14165     static const int ShufMask[] = {0, 2, 4, 6};
   14166     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
   14167   }
   14168 
   14169   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    14170     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
   14171     if (Subtarget.hasInt256()) {
   14172       In = DAG.getBitcast(MVT::v32i8, In);
   14173 
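                // Within each 128-bit lane, pick the two low bytes of every dword (the
                // truncated i16 values) into the lane's low 8 bytes; the 0x80 entries
                // zero the remaining bytes.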
   14174       SmallVector<SDValue,32> pshufbMask;
   14175       for (unsigned i = 0; i < 2; ++i) {
   14176         pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
   14177         pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
   14178         pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
   14179         pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
   14180         pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
   14181         pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
   14182         pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
   14183         pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
   14184         for (unsigned j = 0; j < 8; ++j)
   14185           pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
   14186       }
   14187       SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
   14188       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
   14189       In = DAG.getBitcast(MVT::v4i64, In);
   14190 
   14191       static const int ShufMask[] = {0,  2,  -1,  -1};
   14192       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
   14193                                 ShufMask);
   14194       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   14195                        DAG.getIntPtrConstant(0, DL));
   14196       return DAG.getBitcast(VT, In);
   14197     }
   14198 
   14199     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   14200                                DAG.getIntPtrConstant(0, DL));
   14201 
   14202     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   14203                                DAG.getIntPtrConstant(4, DL));
   14204 
   14205     OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
   14206     OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
   14207 
   14208     // The PSHUFB mask:
   14209     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
   14210                                    -1, -1, -1, -1, -1, -1, -1, -1};
   14211 
   14212     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
   14213     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
   14214     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
   14215 
   14216     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
   14217     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
   14218 
   14219     // The MOVLHPS Mask:
   14220     static const int ShufMask2[] = {0, 1, 4, 5};
   14221     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
   14222     return DAG.getBitcast(MVT::v8i16, res);
   14223   }
   14224 
   14225   // Handle truncation of V256 to V128 using shuffles.
   14226   if (!VT.is128BitVector() || !InVT.is256BitVector())
   14227     return SDValue();
   14228 
   14229   assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
   14230 
   14231   unsigned NumElems = VT.getVectorNumElements();
   14232   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
   14233 
   14234   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
   14235   // Prepare truncation shuffle mask
   14236   for (unsigned i = 0; i != NumElems; ++i)
   14237     MaskVec[i] = i * 2;
   14238   SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
   14239                                    DAG.getUNDEF(NVT), MaskVec);
   14240   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
   14241                      DAG.getIntPtrConstant(0, DL));
   14242 }
   14243 
   14244 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
   14245                                            SelectionDAG &DAG) const {
   14246   assert(!Op.getSimpleValueType().isVector());
   14247 
   14248   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   14249     /*IsSigned=*/ true, /*IsReplace=*/ false);
   14250   SDValue FIST = Vals.first, StackSlot = Vals.second;
   14251   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   14252   if (!FIST.getNode())
   14253     return Op;
   14254 
   14255   if (StackSlot.getNode())
   14256     // Load the result.
   14257     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
   14258                        FIST, StackSlot, MachinePointerInfo(),
   14259                        false, false, false, 0);
   14260 
   14261   // The node is the result.
   14262   return FIST;
   14263 }
   14264 
   14265 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
   14266                                            SelectionDAG &DAG) const {
   14267   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   14268     /*IsSigned=*/ false, /*IsReplace=*/ false);
   14269   SDValue FIST = Vals.first, StackSlot = Vals.second;
   14270   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   14271   if (!FIST.getNode())
   14272     return Op;
   14273 
   14274   if (StackSlot.getNode())
   14275     // Load the result.
   14276     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
   14277                        FIST, StackSlot, MachinePointerInfo(),
   14278                        false, false, false, 0);
   14279 
   14280   // The node is the result.
   14281   return FIST;
   14282 }
   14283 
   14284 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
   14285   SDLoc DL(Op);
   14286   MVT VT = Op.getSimpleValueType();
   14287   SDValue In = Op.getOperand(0);
   14288   MVT SVT = In.getSimpleValueType();
   14289 
   14290   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
   14291 
   14292   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
   14293                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
   14294                                  In, DAG.getUNDEF(SVT)));
   14295 }
   14296 
   14297 /// The only differences between FABS and FNEG are the mask and the logic op.
   14298 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
   14299 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
   14300   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
   14301          "Wrong opcode for lowering FABS or FNEG.");
   14302 
   14303   bool IsFABS = (Op.getOpcode() == ISD::FABS);
   14304 
   14305   // If this is a FABS and it has an FNEG user, bail out to fold the combination
   14306   // into an FNABS. We'll lower the FABS after that if it is still in use.
   14307   if (IsFABS)
   14308     for (SDNode *User : Op->uses())
   14309       if (User->getOpcode() == ISD::FNEG)
   14310         return Op;
   14311 
   14312   SDLoc dl(Op);
   14313   MVT VT = Op.getSimpleValueType();
   14314 
   14315   bool IsF128 = (VT == MVT::f128);
   14316 
   14317   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   14318   // decide if we should generate a 16-byte constant mask when we only need 4 or
   14319   // 8 bytes for the scalar case.
   14320 
   14321   MVT LogicVT;
   14322   MVT EltVT;
   14323   unsigned NumElts;
   14324 
   14325   if (VT.isVector()) {
   14326     LogicVT = VT;
   14327     EltVT = VT.getVectorElementType();
   14328     NumElts = VT.getVectorNumElements();
   14329   } else if (IsF128) {
   14330     // SSE instructions are used for optimized f128 logical operations.
   14331     LogicVT = MVT::f128;
   14332     EltVT = VT;
   14333     NumElts = 1;
   14334   } else {
   14335     // There are no scalar bitwise logical SSE/AVX instructions, so we
   14336     // generate a 16-byte vector constant and logic op even for the scalar case.
   14337     // Using a 16-byte mask allows folding the load of the mask with
   14338     // the logic op, so it can save (~4 bytes) on code size.
   14339     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
   14340     EltVT = VT;
   14341     NumElts = (VT == MVT::f64) ? 2 : 4;
   14342   }
   14343 
   14344   unsigned EltBits = EltVT.getSizeInBits();
   14345   LLVMContext *Context = DAG.getContext();
   14346   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
   14347   APInt MaskElt =
   14348     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
   14349   Constant *C = ConstantInt::get(*Context, MaskElt);
   14350   C = ConstantVector::getSplat(NumElts, C);
   14351   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   14352   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
   14353   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   14354   SDValue Mask =
   14355       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
   14356                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   14357                   false, false, false, Alignment);
   14358 
   14359   SDValue Op0 = Op.getOperand(0);
   14360   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
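            // FABS clears the sign bit with AND, FNEG flips it with XOR, and FNABS
            // (an FNEG of an FABS) forces it on with OR.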
   14361   unsigned LogicOp =
   14362     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
   14363   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
   14364 
   14365   if (VT.isVector() || IsF128)
   14366     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
   14367 
   14368   // For the scalar case extend to a 128-bit vector, perform the logic op,
   14369   // and extract the scalar result back out.
   14370   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
   14371   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
   14372   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
   14373                      DAG.getIntPtrConstant(0, dl));
   14374 }
   14375 
   14376 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   14377   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   14378   LLVMContext *Context = DAG.getContext();
   14379   SDValue Op0 = Op.getOperand(0);
   14380   SDValue Op1 = Op.getOperand(1);
   14381   SDLoc dl(Op);
   14382   MVT VT = Op.getSimpleValueType();
   14383   MVT SrcVT = Op1.getSimpleValueType();
   14384   bool IsF128 = (VT == MVT::f128);
   14385 
   14386   // If second operand is smaller, extend it first.
   14387   if (SrcVT.bitsLT(VT)) {
   14388     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
   14389     SrcVT = VT;
   14390   }
   14391   // And if it is bigger, shrink it first.
   14392   if (SrcVT.bitsGT(VT)) {
   14393     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
   14394     SrcVT = VT;
   14395   }
   14396 
   14397   // At this point the operands and the result should have the same
   14398   // type, and that won't be f80 since that is not custom lowered.
   14399   assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
   14400          "Unexpected type in LowerFCOPYSIGN");
   14401 
   14402   const fltSemantics &Sem =
   14403       VT == MVT::f64 ? APFloat::IEEEdouble :
   14404           (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
   14405   const unsigned SizeInBits = VT.getSizeInBits();
   14406 
   14407   SmallVector<Constant *, 4> CV(
   14408       VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
   14409       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
   14410 
   14411   // First, clear all bits but the sign bit from the second operand (sign).
   14412   CV[0] = ConstantFP::get(*Context,
   14413                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
   14414   Constant *C = ConstantVector::get(CV);
   14415   auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   14416   SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
   14417 
   14418   // Perform all logic operations as 16-byte vectors because there are no
   14419   // scalar FP logic instructions in SSE. This allows load folding of the
   14420   // constants into the logic instructions.
   14421   MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
   14422   SDValue Mask1 =
   14423       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
   14424                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   14425                   false, false, false, 16);
   14426   if (!IsF128)
   14427     Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
   14428   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
   14429 
   14430   // Next, clear the sign bit from the first operand (magnitude).
   14431   // If it's a constant, we can clear it here.
   14432   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
   14433     APFloat APF = Op0CN->getValueAPF();
   14434     // If the magnitude is a positive zero, the sign bit alone is enough.
   14435     if (APF.isPosZero())
   14436       return IsF128 ? SignBit :
   14437           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
   14438                       DAG.getIntPtrConstant(0, dl));
   14439     APF.clearSign();
   14440     CV[0] = ConstantFP::get(*Context, APF);
   14441   } else {
   14442     CV[0] = ConstantFP::get(
   14443         *Context,
   14444         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
   14445   }
   14446   C = ConstantVector::get(CV);
   14447   CPIdx = DAG.getConstantPool(C, PtrVT, 16);
   14448   SDValue Val =
   14449       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
   14450                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   14451                   false, false, false, 16);
   14452   // If the magnitude operand wasn't a constant, we need to AND out the sign.
   14453   if (!isa<ConstantFPSDNode>(Op0)) {
   14454     if (!IsF128)
   14455       Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
   14456     Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
   14457   }
   14458   // OR the magnitude value with the sign bit.
   14459   Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
   14460   return IsF128 ? Val :
   14461       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
   14462                   DAG.getIntPtrConstant(0, dl));
   14463 }
   14464 
   14465 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   14466   SDValue N0 = Op.getOperand(0);
   14467   SDLoc dl(Op);
   14468   MVT VT = Op.getSimpleValueType();
   14469 
   14470   MVT OpVT = N0.getSimpleValueType();
   14471   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
   14472          "Unexpected type for FGETSIGN");
   14473 
   14474   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
   14475   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
   14476   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
   14477   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
   14478   Res = DAG.getZExtOrTrunc(Res, dl, VT);
   14479   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
   14480   return Res;
   14481 }
   14482 
   14483 // Check whether an OR'd tree is PTEST-able.
   14484 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
   14485                                       SelectionDAG &DAG) {
   14486   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
   14487 
   14488   if (!Subtarget.hasSSE41())
   14489     return SDValue();
   14490 
   14491   if (!Op->hasOneUse())
   14492     return SDValue();
   14493 
   14494   SDNode *N = Op.getNode();
   14495   SDLoc DL(N);
   14496 
   14497   SmallVector<SDValue, 8> Opnds;
   14498   DenseMap<SDValue, unsigned> VecInMap;
   14499   SmallVector<SDValue, 8> VecIns;
   14500   EVT VT = MVT::Other;
   14501 
    14502   // Recognize a special case where a vector is cast into a wide integer to
   14503   // test all 0s.
   14504   Opnds.push_back(N->getOperand(0));
   14505   Opnds.push_back(N->getOperand(1));
   14506 
   14507   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
   14508     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
   14509     // BFS traverse all OR'd operands.
   14510     if (I->getOpcode() == ISD::OR) {
   14511       Opnds.push_back(I->getOperand(0));
   14512       Opnds.push_back(I->getOperand(1));
   14513       // Re-evaluate the number of nodes to be traversed.
   14514       e += 2; // 2 more nodes (LHS and RHS) are pushed.
   14515       continue;
   14516     }
   14517 
    14518     // Quit if this is not an EXTRACT_VECTOR_ELT.
   14519     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   14520       return SDValue();
   14521 
    14522     // Quit if the index is not a constant.
   14523     SDValue Idx = I->getOperand(1);
   14524     if (!isa<ConstantSDNode>(Idx))
   14525       return SDValue();
   14526 
   14527     SDValue ExtractedFromVec = I->getOperand(0);
   14528     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
   14529     if (M == VecInMap.end()) {
   14530       VT = ExtractedFromVec.getValueType();
   14531       // Quit if not 128/256-bit vector.
   14532       if (!VT.is128BitVector() && !VT.is256BitVector())
   14533         return SDValue();
   14534       // Quit if not the same type.
   14535       if (VecInMap.begin() != VecInMap.end() &&
   14536           VT != VecInMap.begin()->first.getValueType())
   14537         return SDValue();
   14538       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
   14539       VecIns.push_back(ExtractedFromVec);
   14540     }
   14541     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
   14542   }
   14543 
   14544   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   14545          "Not extracted from 128-/256-bit vector.");
   14546 
   14547   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
   14548 
   14549   for (DenseMap<SDValue, unsigned>::const_iterator
   14550         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
   14551     // Quit if not all elements are used.
   14552     if (I->second != FullMask)
   14553       return SDValue();
   14554   }
   14555 
   14556   MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
   14557 
   14558   // Cast all vectors into TestVT for PTEST.
   14559   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
   14560     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
   14561 
    14562   // If more than one full vector is evaluated, OR them together before the PTEST.
   14563   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
   14564     // Each iteration will OR 2 nodes and append the result until there is only
   14565     // 1 node left, i.e. the final OR'd value of all vectors.
   14566     SDValue LHS = VecIns[Slot];
   14567     SDValue RHS = VecIns[Slot + 1];
   14568     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
   14569   }
   14570 
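            // PTEST sets ZF when the AND of its operands is zero, so PTEST X, X tests
            // whether X is all zeros.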
   14571   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
   14572                      VecIns.back(), VecIns.back());
   14573 }
   14574 
   14575 /// \brief return true if \c Op has a use that doesn't just read flags.
   14576 static bool hasNonFlagsUse(SDValue Op) {
   14577   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
   14578        ++UI) {
   14579     SDNode *User = *UI;
   14580     unsigned UOpNo = UI.getOperandNo();
   14581     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
    14582       // Look past the truncate.
   14583       UOpNo = User->use_begin().getOperandNo();
   14584       User = *User->use_begin();
   14585     }
   14586 
   14587     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
   14588         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
   14589       return true;
   14590   }
   14591   return false;
   14592 }
   14593 
   14594 // Emit KTEST instruction for bit vectors on AVX-512
   14595 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
   14596                          const X86Subtarget &Subtarget) {
   14597   if (Op.getOpcode() == ISD::BITCAST) {
   14598     auto hasKTEST = [&](MVT VT) {
   14599       unsigned SizeInBits = VT.getSizeInBits();
   14600       return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
   14601         (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
   14602     };
   14603     SDValue Op0 = Op.getOperand(0);
   14604     MVT Op0VT = Op0.getValueType().getSimpleVT();
   14605     if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
   14606         hasKTEST(Op0VT))
   14607       return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
   14608   }
   14609   return SDValue();
   14610 }
   14611 
   14612 /// Emit nodes that will be selected as "test Op0,Op0", or something
   14613 /// equivalent.
   14614 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
   14615                                     SelectionDAG &DAG) const {
   14616   if (Op.getValueType() == MVT::i1) {
   14617     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
   14618     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
   14619                        DAG.getConstant(0, dl, MVT::i8));
   14620   }
   14621   // CF and OF aren't always set the way we want. Determine which
   14622   // of these we need.
   14623   bool NeedCF = false;
   14624   bool NeedOF = false;
   14625   switch (X86CC) {
   14626   default: break;
   14627   case X86::COND_A: case X86::COND_AE:
   14628   case X86::COND_B: case X86::COND_BE:
   14629     NeedCF = true;
   14630     break;
   14631   case X86::COND_G: case X86::COND_GE:
   14632   case X86::COND_L: case X86::COND_LE:
   14633   case X86::COND_O: case X86::COND_NO: {
    14634     // Check whether we really need to set the overflow flag. If the
    14635     // operation has the NoSignedWrap flag, signed overflow cannot occur,
    14636     // so OF is not actually needed.
   14637     switch (Op->getOpcode()) {
   14638     case ISD::ADD:
   14639     case ISD::SUB:
   14640     case ISD::MUL:
   14641     case ISD::SHL: {
   14642       const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
   14643       if (BinNode->Flags.hasNoSignedWrap())
   14644         break;
   14645     }
   14646     default:
   14647       NeedOF = true;
   14648       break;
   14649     }
   14650     break;
   14651   }
   14652   }
   14653   // See if we can use the EFLAGS value from the operand instead of
   14654   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   14655   // we prove that the arithmetic won't overflow, we can't use OF or CF.
   14656   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
   14657     // Emit KTEST for bit vectors
   14658     if (auto Node = EmitKTEST(Op, DAG, Subtarget))
   14659       return Node;
   14660     // Emit a CMP with 0, which is the TEST pattern.
   14661     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   14662                        DAG.getConstant(0, dl, Op.getValueType()));
   14663   }
   14664   unsigned Opcode = 0;
   14665   unsigned NumOperands = 0;
   14666 
   14667   // Truncate operations may prevent the merge of the SETCC instruction
   14668   // and the arithmetic instruction before it. Attempt to truncate the operands
   14669   // of the arithmetic instruction and use a reduced bit-width instruction.
   14670   bool NeedTruncation = false;
   14671   SDValue ArithOp = Op;
   14672   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
   14673     SDValue Arith = Op->getOperand(0);
   14674     // Both the trunc and the arithmetic op need to have one user each.
   14675     if (Arith->hasOneUse())
   14676       switch (Arith.getOpcode()) {
   14677         default: break;
   14678         case ISD::ADD:
   14679         case ISD::SUB:
   14680         case ISD::AND:
   14681         case ISD::OR:
   14682         case ISD::XOR: {
   14683           NeedTruncation = true;
   14684           ArithOp = Arith;
   14685         }
   14686       }
   14687   }
   14688 
   14689   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
   14690   // which may be the result of a CAST.  We use the variable 'Op', which is the
   14691   // non-casted variable when we check for possible users.
   14692   switch (ArithOp.getOpcode()) {
   14693   case ISD::ADD:
   14694     // Due to an isel shortcoming, be conservative if this add is likely to be
   14695     // selected as part of a load-modify-store instruction. When the root node
   14696     // in a match is a store, isel doesn't know how to remap non-chain non-flag
   14697     // uses of other nodes in the match, such as the ADD in this case. This
   14698     // leads to the ADD being left around and reselected, with the result being
   14699     // two adds in the output.  Alas, even if none our users are stores, that
    14700     // two adds in the output.  Alas, even if none of our users are stores, that
   14701     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
   14702     // climbing the DAG back to the root, and it doesn't seem to be worth the
   14703     // effort.
   14704     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   14705          UE = Op.getNode()->use_end(); UI != UE; ++UI)
   14706       if (UI->getOpcode() != ISD::CopyToReg &&
   14707           UI->getOpcode() != ISD::SETCC &&
   14708           UI->getOpcode() != ISD::STORE)
   14709         goto default_case;
   14710 
   14711     if (ConstantSDNode *C =
   14712         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
   14713       // An add of one will be selected as an INC.
   14714       if (C->isOne() && !Subtarget.slowIncDec()) {
   14715         Opcode = X86ISD::INC;
   14716         NumOperands = 1;
   14717         break;
   14718       }
   14719 
   14720       // An add of negative one (subtract of one) will be selected as a DEC.
   14721       if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
   14722         Opcode = X86ISD::DEC;
   14723         NumOperands = 1;
   14724         break;
   14725       }
   14726     }
   14727 
   14728     // Otherwise use a regular EFLAGS-setting add.
   14729     Opcode = X86ISD::ADD;
   14730     NumOperands = 2;
   14731     break;
   14732   case ISD::SHL:
   14733   case ISD::SRL:
   14734     // If we have a constant logical shift that's only used in a comparison
   14735     // against zero turn it into an equivalent AND. This allows turning it into
   14736     // a TEST instruction later.
   14737     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
   14738         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
   14739       EVT VT = Op.getValueType();
   14740       unsigned BitWidth = VT.getSizeInBits();
   14741       unsigned ShAmt = Op->getConstantOperandVal(1);
   14742       if (ShAmt >= BitWidth) // Avoid undefined shifts.
   14743         break;
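                // (srl X, C) == 0 iff the top BitWidth-C bits of X are zero, and
                // (shl X, C) == 0 iff the low BitWidth-C bits are zero, so test X
                // against the corresponding mask instead.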
   14744       APInt Mask = ArithOp.getOpcode() == ISD::SRL
   14745                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
   14746                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
   14747       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
   14748         break;
   14749       Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
   14750                        DAG.getConstant(Mask, dl, VT));
   14751     }
   14752     break;
   14753 
   14754   case ISD::AND:
   14755     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
   14756     // because a TEST instruction will be better.
   14757     if (!hasNonFlagsUse(Op)) {
   14758       SDValue Op0 = ArithOp->getOperand(0);
   14759       SDValue Op1 = ArithOp->getOperand(1);
   14760       EVT VT = ArithOp.getValueType();
   14761       bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
   14762       bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
   14763 
   14764       // But if we can combine this into an ANDN operation, then create an AND
   14765       // now and allow it to be pattern matched into an ANDN.
   14766       if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
   14767         break;
   14768     }
   14769     // FALL THROUGH
   14770   case ISD::SUB:
   14771   case ISD::OR:
   14772   case ISD::XOR:
   14773     // Due to the ISEL shortcoming noted above, be conservative if this op is
   14774     // likely to be selected as part of a load-modify-store instruction.
   14775     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   14776            UE = Op.getNode()->use_end(); UI != UE; ++UI)
   14777       if (UI->getOpcode() == ISD::STORE)
   14778         goto default_case;
   14779 
   14780     // Otherwise use a regular EFLAGS-setting instruction.
   14781     switch (ArithOp.getOpcode()) {
   14782     default: llvm_unreachable("unexpected operator!");
   14783     case ISD::SUB: Opcode = X86ISD::SUB; break;
   14784     case ISD::XOR: Opcode = X86ISD::XOR; break;
   14785     case ISD::AND: Opcode = X86ISD::AND; break;
   14786     case ISD::OR: {
   14787       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
   14788         if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
   14789           return EFLAGS;
   14790       }
   14791       Opcode = X86ISD::OR;
   14792       break;
   14793     }
   14794     }
   14795 
   14796     NumOperands = 2;
   14797     break;
   14798   case X86ISD::ADD:
   14799   case X86ISD::SUB:
   14800   case X86ISD::INC:
   14801   case X86ISD::DEC:
   14802   case X86ISD::OR:
   14803   case X86ISD::XOR:
   14804   case X86ISD::AND:
   14805     return SDValue(Op.getNode(), 1);
   14806   default:
   14807   default_case:
   14808     break;
   14809   }
   14810 
   14811   // If we found that truncation is beneficial, perform the truncation and
   14812   // update 'Op'.
   14813   if (NeedTruncation) {
   14814     EVT VT = Op.getValueType();
   14815     SDValue WideVal = Op->getOperand(0);
   14816     EVT WideVT = WideVal.getValueType();
   14817     unsigned ConvertedOp = 0;
   14818     // Use a target machine opcode to prevent further DAGCombine
   14819     // optimizations that may separate the arithmetic operations
   14820     // from the setcc node.
   14821     switch (WideVal.getOpcode()) {
   14822       default: break;
   14823       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
   14824       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
   14825       case ISD::AND: ConvertedOp = X86ISD::AND; break;
   14826       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
   14827       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
   14828     }
   14829 
   14830     if (ConvertedOp) {
   14831       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   14832       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
   14833         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
   14834         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
   14835         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
   14836       }
   14837     }
   14838   }
   14839 
   14840   if (Opcode == 0) {
   14841     // Emit KTEST for bit vectors
   14842     if (auto Node = EmitKTEST(Op, DAG, Subtarget))
   14843       return Node;
   14844 
   14845     // Emit a CMP with 0, which is the TEST pattern.
   14846     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   14847                        DAG.getConstant(0, dl, Op.getValueType()));
   14848   }
   14849   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   14850   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
   14851 
   14852   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
   14853   DAG.ReplaceAllUsesWith(Op, New);
   14854   return SDValue(New.getNode(), 1);
   14855 }
   14856 
   14857 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
   14858 /// equivalent.
   14859 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   14860                                    const SDLoc &dl, SelectionDAG &DAG) const {
   14861   if (isNullConstant(Op1))
   14862     return EmitTest(Op0, X86CC, dl, DAG);
   14863 
   14864   assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
   14865          "Unexpected comparison operation for MVT::i1 operands");
   14866 
   14867   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
   14868        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    14869     // Only promote the compare up to i32 if it is a 16-bit operation
    14870     // with an immediate.  16-bit immediates are to be avoided.
   14871     if ((Op0.getValueType() == MVT::i16 &&
   14872          (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
   14873         !DAG.getMachineFunction().getFunction()->optForMinSize() &&
   14874         !Subtarget.isAtom()) {
   14875       unsigned ExtendOp =
   14876           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
   14877       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
   14878       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
   14879     }
   14880     // Use SUB instead of CMP to enable CSE between SUB and CMP.
   14881     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
   14882     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
   14883                               Op0, Op1);
   14884     return SDValue(Sub.getNode(), 1);
   14885   }
   14886   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
   14887 }
   14888 
   14889 /// Convert a comparison if required by the subtarget.
   14890 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
   14891                                                  SelectionDAG &DAG) const {
   14892   // If the subtarget does not support the FUCOMI instruction, floating-point
   14893   // comparisons have to be converted.
   14894   if (Subtarget.hasCMov() ||
   14895       Cmp.getOpcode() != X86ISD::CMP ||
   14896       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
   14897       !Cmp.getOperand(1).getValueType().isFloatingPoint())
   14898     return Cmp;
   14899 
   14900   // The instruction selector will select an FUCOM instruction instead of
   14901   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
   14902   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
   14903   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
   14904   SDLoc dl(Cmp);
   14905   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
   14906   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
   14907   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
   14908                             DAG.getConstant(8, dl, MVT::i8));
   14909   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
   14910 
   14911   // Some 64-bit targets lack SAHF support, but they do support FCOMI.
   14912   assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
   14913   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
   14914 }
   14915 
   14916 /// The minimum architected relative accuracy is 2^-12. We need one
   14917 /// Newton-Raphson step to have a good float result (24 bits of precision).
   14918 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
   14919                                             DAGCombinerInfo &DCI,
   14920                                             unsigned &RefinementSteps,
   14921                                             bool &UseOneConstNR) const {
   14922   EVT VT = Op.getValueType();
   14923   const char *RecipOp;
   14924 
   14925   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
   14926   // TODO: Add support for AVX512 (v16f32).
   14927   // It is likely not profitable to do this for f64 because a double-precision
   14928   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   14929   // instructions: convert to single, rsqrtss, convert back to double, refine
   14930   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   14931   // along with FMA, this could be a throughput win.
   14932   if (VT == MVT::f32 && Subtarget.hasSSE1())
   14933     RecipOp = "sqrtf";
   14934   else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
   14935            (VT == MVT::v8f32 && Subtarget.hasAVX()))
   14936     RecipOp = "vec-sqrtf";
   14937   else
   14938     return SDValue();
   14939 
   14940   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
   14941   if (!Recips.isEnabled(RecipOp))
   14942     return SDValue();
   14943 
   14944   RefinementSteps = Recips.getRefinementSteps(RecipOp);
   14945   UseOneConstNR = false;
   14946   return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
   14947 }
   14948 
   14949 /// The minimum architected relative accuracy is 2^-12. We need one
   14950 /// Newton-Raphson step to have a good float result (24 bits of precision).
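          /// The refinement applied by the DAG combiner is the standard
          /// Newton-Raphson iteration for 1/a:
          ///   est' = est * (2.0 - a * est)
          /// which roughly doubles the number of correct bits per step.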
   14951 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
   14952                                             DAGCombinerInfo &DCI,
   14953                                             unsigned &RefinementSteps) const {
   14954   EVT VT = Op.getValueType();
   14955   const char *RecipOp;
   14956 
   14957   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   14958   // TODO: Add support for AVX512 (v16f32).
   14959   // It is likely not profitable to do this for f64 because a double-precision
   14960   // reciprocal estimate with refinement on x86 prior to FMA requires
   14961   // 15 instructions: convert to single, rcpss, convert back to double, refine
   14962   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   14963   // along with FMA, this could be a throughput win.
   14964   if (VT == MVT::f32 && Subtarget.hasSSE1())
   14965     RecipOp = "divf";
   14966   else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
   14967            (VT == MVT::v8f32 && Subtarget.hasAVX()))
   14968     RecipOp = "vec-divf";
   14969   else
   14970     return SDValue();
   14971 
   14972   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
   14973   if (!Recips.isEnabled(RecipOp))
   14974     return SDValue();
   14975 
   14976   RefinementSteps = Recips.getRefinementSteps(RecipOp);
   14977   return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
   14978 }
   14979 
   14980 /// If we have at least two divisions that use the same divisor, convert to
    14981 /// multiplication by a reciprocal. This may need to be adjusted for a given
   14982 /// CPU if a division's cost is not at least twice the cost of a multiplication.
   14983 /// This is because we still need one division to calculate the reciprocal and
   14984 /// then we need two multiplies by that reciprocal as replacements for the
   14985 /// original divisions.
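          /// For example, 'a/b + c/b' becomes 't = 1.0/b; a*t + c*t', trading two
          /// divisions for one division and two multiplications.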
   14986 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
   14987   return 2;
   14988 }
   14989 
   14990 /// Result of 'and' is compared against zero. Change to a BT node if possible.
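          /// For example, '(X & (1 << N)) == 0' becomes 'BT X, N' followed by a
          /// SETAE/SETB of the carry flag.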
   14991 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
   14992                                      const SDLoc &dl, SelectionDAG &DAG) const {
   14993   SDValue Op0 = And.getOperand(0);
   14994   SDValue Op1 = And.getOperand(1);
   14995   if (Op0.getOpcode() == ISD::TRUNCATE)
   14996     Op0 = Op0.getOperand(0);
   14997   if (Op1.getOpcode() == ISD::TRUNCATE)
   14998     Op1 = Op1.getOperand(0);
   14999 
   15000   SDValue LHS, RHS;
   15001   if (Op1.getOpcode() == ISD::SHL)
   15002     std::swap(Op0, Op1);
   15003   if (Op0.getOpcode() == ISD::SHL) {
   15004     if (isOneConstant(Op0.getOperand(0))) {
   15005       // If we looked past a truncate, check that it's only truncating away
   15006       // known zeros.
   15007       unsigned BitWidth = Op0.getValueSizeInBits();
   15008       unsigned AndBitWidth = And.getValueSizeInBits();
   15009       if (BitWidth > AndBitWidth) {
   15010         APInt Zeros, Ones;
   15011         DAG.computeKnownBits(Op0, Zeros, Ones);
   15012         if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
   15013           return SDValue();
   15014       }
   15015       LHS = Op1;
   15016       RHS = Op0.getOperand(1);
   15017     }
   15018   } else if (Op1.getOpcode() == ISD::Constant) {
   15019     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
   15020     uint64_t AndRHSVal = AndRHS->getZExtValue();
   15021     SDValue AndLHS = Op0;
   15022 
   15023     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
   15024       LHS = AndLHS.getOperand(0);
   15025       RHS = AndLHS.getOperand(1);
   15026     }
   15027 
   15028     // Use BT if the immediate can't be encoded in a TEST instruction.
   15029     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
   15030       LHS = AndLHS;
   15031       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
   15032     }
   15033   }
   15034 
   15035   if (LHS.getNode()) {
   15036     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
   15037     // instruction.  Since the shift amount is in-range-or-undefined, we know
   15038     // that doing a bittest on the i32 value is ok.  We extend to i32 because
   15039     // the encoding for the i16 version is larger than the i32 version.
    15040     // Also promote i16 to i32 for performance / code size reasons.
   15041     if (LHS.getValueType() == MVT::i8 ||
   15042         LHS.getValueType() == MVT::i16)
   15043       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
   15044 
   15045     // If the operand types disagree, extend the shift amount to match.  Since
   15046     // BT ignores high bits (like shifts) we can use anyextend.
   15047     if (LHS.getValueType() != RHS.getValueType())
   15048       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
   15049 
   15050     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
   15051     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
   15052     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   15053                        DAG.getConstant(Cond, dl, MVT::i8), BT);
   15054   }
   15055 
   15056   return SDValue();
   15057 }
   15058 
   15059 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
   15060 /// CMPs.
   15061 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   15062                               SDValue &Op1) {
   15063   unsigned SSECC;
   15064   bool Swap = false;
   15065 
   15066   // SSE Condition code mapping:
   15067   //  0 - EQ
   15068   //  1 - LT
   15069   //  2 - LE
   15070   //  3 - UNORD
   15071   //  4 - NEQ
   15072   //  5 - NLT
   15073   //  6 - NLE
   15074   //  7 - ORD
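            //
            // A return value of 8 is a sentinel for SETUEQ/SETONE, which have no
            // single SSE predicate; callers handle that value specially (e.g. by
            // emitting two compares combined with OR/AND).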
   15075   switch (SetCCOpcode) {
   15076   default: llvm_unreachable("Unexpected SETCC condition");
   15077   case ISD::SETOEQ:
   15078   case ISD::SETEQ:  SSECC = 0; break;
   15079   case ISD::SETOGT:
   15080   case ISD::SETGT:  Swap = true; // Fallthrough
   15081   case ISD::SETLT:
   15082   case ISD::SETOLT: SSECC = 1; break;
   15083   case ISD::SETOGE:
   15084   case ISD::SETGE:  Swap = true; // Fallthrough
   15085   case ISD::SETLE:
   15086   case ISD::SETOLE: SSECC = 2; break;
   15087   case ISD::SETUO:  SSECC = 3; break;
   15088   case ISD::SETUNE:
   15089   case ISD::SETNE:  SSECC = 4; break;
   15090   case ISD::SETULE: Swap = true; // Fallthrough
   15091   case ISD::SETUGE: SSECC = 5; break;
   15092   case ISD::SETULT: Swap = true; // Fallthrough
   15093   case ISD::SETUGT: SSECC = 6; break;
   15094   case ISD::SETO:   SSECC = 7; break;
   15095   case ISD::SETUEQ:
   15096   case ISD::SETONE: SSECC = 8; break;
   15097   }
   15098   if (Swap)
   15099     std::swap(Op0, Op1);
   15100 
   15101   return SSECC;
   15102 }
   15103 
    15104 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
   15105 /// concatenate the result back.
   15106 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
   15107   MVT VT = Op.getSimpleValueType();
   15108 
   15109   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
   15110          "Unsupported value type for operation");
   15111 
   15112   unsigned NumElems = VT.getVectorNumElements();
   15113   SDLoc dl(Op);
   15114   SDValue CC = Op.getOperand(2);
   15115 
   15116   // Extract the LHS vectors
   15117   SDValue LHS = Op.getOperand(0);
   15118   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
   15119   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
   15120 
   15121   // Extract the RHS vectors
   15122   SDValue RHS = Op.getOperand(1);
   15123   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
   15124   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
   15125 
   15126   // Issue the operation on the smaller types and concatenate the result back
   15127   MVT EltVT = VT.getVectorElementType();
   15128   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   15129   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   15130                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
   15131                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
   15132 }
   15133 
   15134 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
   15135   SDValue Op0 = Op.getOperand(0);
   15136   SDValue Op1 = Op.getOperand(1);
   15137   SDValue CC = Op.getOperand(2);
   15138   MVT VT = Op.getSimpleValueType();
   15139   SDLoc dl(Op);
   15140 
   15141   assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
   15142          "Unexpected type for boolean compare operation");
   15143   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   15144   SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
   15145                                DAG.getConstant(-1, dl, VT));
   15146   SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
   15147                                DAG.getConstant(-1, dl, VT));
   15148   switch (SetCCOpcode) {
   15149   default: llvm_unreachable("Unexpected SETCC condition");
   15150   case ISD::SETEQ:
   15151     // (x == y) -> ~(x ^ y)
   15152     return DAG.getNode(ISD::XOR, dl, VT,
   15153                        DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
   15154                        DAG.getConstant(-1, dl, VT));
   15155   case ISD::SETNE:
   15156     // (x != y) -> (x ^ y)
   15157     return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
   15158   case ISD::SETUGT:
   15159   case ISD::SETGT:
   15160     // (x > y) -> (x & ~y)
   15161     return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
   15162   case ISD::SETULT:
   15163   case ISD::SETLT:
   15164     // (x < y) -> (~x & y)
   15165     return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
   15166   case ISD::SETULE:
   15167   case ISD::SETLE:
   15168     // (x <= y) -> (~x | y)
   15169     return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
   15170   case ISD::SETUGE:
   15171   case ISD::SETGE:
    15172     // (x >= y) -> (x | ~y)
   15173     return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
   15174   }
   15175 }
   15176 
   15177 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
   15178 
   15179   SDValue Op0 = Op.getOperand(0);
   15180   SDValue Op1 = Op.getOperand(1);
   15181   SDValue CC = Op.getOperand(2);
   15182   MVT VT = Op.getSimpleValueType();
   15183   SDLoc dl(Op);
   15184 
   15185   assert(VT.getVectorElementType() == MVT::i1 &&
   15186          "Cannot set masked compare for this operation");
   15187 
   15188   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   15189   unsigned  Opc = 0;
   15190   bool Unsigned = false;
   15191   bool Swap = false;
   15192   unsigned SSECC;
   15193   switch (SetCCOpcode) {
   15194   default: llvm_unreachable("Unexpected SETCC condition");
   15195   case ISD::SETNE:  SSECC = 4; break;
   15196   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
   15197   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
   15198   case ISD::SETLT:  Swap = true; //fall-through
   15199   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
   15200   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
   15201   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
   15202   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
   15203   case ISD::SETULE: Unsigned = true; //fall-through
   15204   case ISD::SETLE:  SSECC = 2; break;
   15205   }
   15206 
   15207   if (Swap)
   15208     std::swap(Op0, Op1);
   15209   if (Opc)
   15210     return DAG.getNode(Opc, dl, VT, Op0, Op1);
   15211   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
   15212   return DAG.getNode(Opc, dl, VT, Op0, Op1,
   15213                      DAG.getConstant(SSECC, dl, MVT::i8));
   15214 }
   15215 
   15216 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
   15217 /// operand \p Op1.  If non-trivial (for example because it's not constant)
   15218 /// return an empty value.
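          /// For example, 'x u< 4' becomes 'x u<= 3'; this is only valid when no
          /// element of \p Op1 is zero, which is checked below to avoid underflow.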
   15219 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
   15220                                       SelectionDAG &DAG) {
   15221   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
   15222   if (!BV)
   15223     return SDValue();
   15224 
   15225   MVT VT = Op1.getSimpleValueType();
   15226   MVT EVT = VT.getVectorElementType();
   15227   unsigned n = VT.getVectorNumElements();
   15228   SmallVector<SDValue, 8> ULTOp1;
   15229 
   15230   for (unsigned i = 0; i < n; ++i) {
   15231     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
   15232     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
   15233       return SDValue();
   15234 
   15235     // Avoid underflow.
   15236     APInt Val = Elt->getAPIntValue();
   15237     if (Val == 0)
   15238       return SDValue();
   15239 
   15240     ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
   15241   }
   15242 
   15243   return DAG.getBuildVector(VT, dl, ULTOp1);
   15244 }
   15245 
   15246 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   15247                            SelectionDAG &DAG) {
   15248   SDValue Op0 = Op.getOperand(0);
   15249   SDValue Op1 = Op.getOperand(1);
   15250   SDValue CC = Op.getOperand(2);
   15251   MVT VT = Op.getSimpleValueType();
   15252   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   15253   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
   15254   SDLoc dl(Op);
   15255 
   15256   if (isFP) {
   15257 #ifndef NDEBUG
   15258     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
   15259     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
   15260 #endif
   15261 
   15262     unsigned Opc;
   15263     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
   15264       assert(VT.getVectorNumElements() <= 16);
   15265       Opc = X86ISD::CMPM;
   15266     } else {
   15267       Opc = X86ISD::CMPP;
   15268       // The SSE/AVX packed FP comparison nodes are defined with a
   15269       // floating-point vector result that matches the operand type. This allows
   15270       // them to work with an SSE1 target (integer vector types are not legal).
   15271       VT = Op0.getSimpleValueType();
   15272     }
   15273 
   15274     // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
   15275     // emit two comparisons and a logic op to tie them together.
   15276     // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
   15277     // available.
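              // Concretely:  UEQ(a,b) = UNORD(a,b) | EQ(a,b)
              //              ONE(a,b) = ORD(a,b)  & NEQ(a,b)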
   15278     SDValue Cmp;
   15279     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
   15280     if (SSECC == 8) {
   15281       // LLVM predicate is SETUEQ or SETONE.
   15282       unsigned CC0, CC1;
   15283       unsigned CombineOpc;
   15284       if (SetCCOpcode == ISD::SETUEQ) {
   15285         CC0 = 3; // UNORD
   15286         CC1 = 0; // EQ
   15287         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
   15288                                            static_cast<unsigned>(ISD::OR);
   15289       } else {
   15290         assert(SetCCOpcode == ISD::SETONE);
   15291         CC0 = 7; // ORD
   15292         CC1 = 4; // NEQ
   15293         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
   15294                                            static_cast<unsigned>(ISD::AND);
   15295       }
   15296 
   15297       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   15298                                  DAG.getConstant(CC0, dl, MVT::i8));
   15299       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   15300                                  DAG.getConstant(CC1, dl, MVT::i8));
   15301       Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
   15302     } else {
   15303       // Handle all other FP comparisons here.
   15304       Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
   15305                         DAG.getConstant(SSECC, dl, MVT::i8));
   15306     }
   15307 
   15308     // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
   15309     // result type of SETCC. The bitcast is expected to be optimized away
   15310     // during combining/isel.
   15311     if (Opc == X86ISD::CMPP)
   15312       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
   15313 
   15314     return Cmp;
   15315   }
   15316 
   15317   MVT VTOp0 = Op0.getSimpleValueType();
   15318   assert(VTOp0 == Op1.getSimpleValueType() &&
   15319          "Expected operands with same type!");
   15320   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
   15321          "Invalid number of packed elements for source and destination!");
   15322 
   15323   if (VT.is128BitVector() && VTOp0.is256BitVector()) {
   15324     // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
   15325     // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
    15326     // legalizer first checks whether the first operand of the setcc has a
    15327     // legal type. If so, it promotes the return type to that same type.
   15328     // Otherwise, the return type is promoted to the 'next legal type' which,
   15329     // for a vector of MVT::i1 is always a 128-bit integer vector type.
   15330     //
   15331     // We reach this code only if the following two conditions are met:
   15332     // 1. Both return type and operand type have been promoted to wider types
   15333     //    by the type legalizer.
   15334     // 2. The original operand type has been promoted to a 256-bit vector.
   15335     //
   15336     // Note that condition 2. only applies for AVX targets.
   15337     SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
   15338     return DAG.getZExtOrTrunc(NewOp, dl, VT);
   15339   }
   15340 
   15341   // The non-AVX512 code below works under the assumption that source and
   15342   // destination types are the same.
   15343   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
   15344          "Value types for source and destination must be the same!");
   15345 
   15346   // Break 256-bit integer vector compare into smaller ones.
   15347   if (VT.is256BitVector() && !Subtarget.hasInt256())
   15348     return Lower256IntVSETCC(Op, DAG);
   15349 
   15350   // Operands are boolean (vectors of i1)
   15351   MVT OpVT = Op1.getSimpleValueType();
   15352   if (OpVT.getVectorElementType() == MVT::i1)
   15353     return LowerBoolVSETCC_AVX512(Op, DAG);
   15354 
   15355   // The result is boolean, but operands are int/float
   15356   if (VT.getVectorElementType() == MVT::i1) {
    15357     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
    15358     // but there is no compare instruction for i8 and i16 elements in KNL.
    15359     // In that case, use an SSE compare instead.
   15360     bool UseAVX512Inst =
   15361       (OpVT.is512BitVector() ||
   15362        OpVT.getVectorElementType().getSizeInBits() >= 32 ||
   15363        (Subtarget.hasBWI() && Subtarget.hasVLX()));
   15364 
   15365     if (UseAVX512Inst)
   15366       return LowerIntVSETCC_AVX512(Op, DAG);
   15367 
   15368     return DAG.getNode(ISD::TRUNCATE, dl, VT,
   15369                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
   15370   }
   15371 
   15372   // Lower using XOP integer comparisons.
   15373   if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
   15374        VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
   15375     // Translate compare code to XOP PCOM compare mode.
   15376     unsigned CmpMode = 0;
   15377     switch (SetCCOpcode) {
   15378     default: llvm_unreachable("Unexpected SETCC condition");
   15379     case ISD::SETULT:
   15380     case ISD::SETLT: CmpMode = 0x00; break;
   15381     case ISD::SETULE:
   15382     case ISD::SETLE: CmpMode = 0x01; break;
   15383     case ISD::SETUGT:
   15384     case ISD::SETGT: CmpMode = 0x02; break;
   15385     case ISD::SETUGE:
   15386     case ISD::SETGE: CmpMode = 0x03; break;
   15387     case ISD::SETEQ: CmpMode = 0x04; break;
   15388     case ISD::SETNE: CmpMode = 0x05; break;
   15389     }
   15390 
   15391     // Are we comparing unsigned or signed integers?
   15392     unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
   15393       ? X86ISD::VPCOMU : X86ISD::VPCOM;
   15394 
   15395     return DAG.getNode(Opc, dl, VT, Op0, Op1,
   15396                        DAG.getConstant(CmpMode, dl, MVT::i8));
   15397   }
   15398 
   15399   // We are handling one of the integer comparisons here.  Since SSE only has
    15400   // GT and EQ comparisons for integers, swapping operands and multiple
   15401   // operations may be required for some comparisons.
   15402   unsigned Opc;
   15403   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
   15404   bool Subus = false;
   15405 
   15406   switch (SetCCOpcode) {
   15407   default: llvm_unreachable("Unexpected SETCC condition");
   15408   case ISD::SETNE:  Invert = true;
   15409   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
   15410   case ISD::SETLT:  Swap = true;
   15411   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
   15412   case ISD::SETGE:  Swap = true;
   15413   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
   15414                     Invert = true; break;
   15415   case ISD::SETULT: Swap = true;
   15416   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
   15417                     FlipSigns = true; break;
   15418   case ISD::SETUGE: Swap = true;
   15419   case ISD::SETULE: Opc = X86ISD::PCMPGT;
   15420                     FlipSigns = true; Invert = true; break;
   15421   }
   15422 
   15423   // Special case: Use min/max operations for SETULE/SETUGE
   15424   MVT VET = VT.getVectorElementType();
   15425   bool hasMinMax =
   15426        (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
   15427     || (Subtarget.hasSSE2()  && (VET == MVT::i8));
   15428 
   15429   if (hasMinMax) {
   15430     switch (SetCCOpcode) {
   15431     default: break;
   15432     case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
   15433     case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
   15434     }
   15435 
   15436     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
   15437   }
   15438 
   15439   bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
   15440   if (!MinMax && hasSubus) {
   15441     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
   15442     // Op0 u<= Op1:
   15443     //   t = psubus Op0, Op1
   15444     //   pcmpeq t, <0..0>
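              // psubus saturates at zero, so 't' is zero exactly in the lanes where
              // Op0 u<= Op1 holds.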
   15445     switch (SetCCOpcode) {
   15446     default: break;
   15447     case ISD::SETULT: {
    15448       // If the comparison is against a constant, we can turn this into a
    15449       // setule.  With psubus, setule does not require a swap.  This is
    15450       // beneficial because the constant in the register is no longer
    15451       // clobbered as the destination, so it can be hoisted out of a loop.
    15452       // Only do this pre-AVX, since vpcmp* is no longer destructive.
   15453       if (Subtarget.hasAVX())
   15454         break;
   15455       if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
   15456         Op1 = ULEOp1;
   15457         Subus = true; Invert = false; Swap = false;
   15458       }
   15459       break;
   15460     }
   15461     // Psubus is better than flip-sign because it requires no inversion.
   15462     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
   15463     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
   15464     }
   15465 
   15466     if (Subus) {
   15467       Opc = X86ISD::SUBUS;
   15468       FlipSigns = false;
   15469     }
   15470   }
   15471 
   15472   if (Swap)
   15473     std::swap(Op0, Op1);
   15474 
   15475   // Check that the operation in question is available (most are plain SSE2,
   15476   // but PCMPGTQ and PCMPEQQ have different requirements).
   15477   if (VT == MVT::v2i64) {
   15478     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
   15479       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
   15480 
   15481       // First cast everything to the right type.
   15482       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
   15483       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
   15484 
   15485       // Since SSE has no unsigned integer comparisons, we need to flip the sign
   15486       // bits of the inputs before performing those operations. The lower
   15487       // compare is always unsigned.
   15488       SDValue SB;
   15489       if (FlipSigns) {
   15490         SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
   15491       } else {
   15492         SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
   15493         SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
   15494         SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
   15495       }
   15496       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
   15497       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
   15498 
   15499       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
   15500       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
   15501       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
   15502 
    15503       // Create masks for only the low parts/high parts of the 64-bit integers.
   15504       static const int MaskHi[] = { 1, 1, 3, 3 };
   15505       static const int MaskLo[] = { 0, 0, 2, 2 };
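                // MaskHi replicates the high dword of each 64-bit element into both
                // dword lanes, and MaskLo does the same with the low dword, so the
                // 32-bit compare results line up per 64-bit element before combining.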
   15506       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
   15507       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
   15508       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
   15509 
   15510       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
   15511       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
   15512 
   15513       if (Invert)
   15514         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   15515 
   15516       return DAG.getBitcast(VT, Result);
   15517     }
   15518 
   15519     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
   15520       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
   15521       // pcmpeqd + pshufd + pand.
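                // A 64-bit element is equal iff both of its 32-bit halves are equal,
                // so AND the dword-wise compare with a copy of itself that has the
                // halves of each element swapped (shuffle mask { 1, 0, 3, 2 }).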
   15522       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
   15523 
   15524       // First cast everything to the right type.
   15525       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
   15526       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
   15527 
   15528       // Do the compare.
   15529       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
   15530 
   15531       // Make sure the lower and upper halves are both all-ones.
   15532       static const int Mask[] = { 1, 0, 3, 2 };
   15533       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
   15534       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
   15535 
   15536       if (Invert)
   15537         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   15538 
   15539       return DAG.getBitcast(VT, Result);
   15540     }
   15541   }
   15542 
   15543   // Since SSE has no unsigned integer comparisons, we need to flip the sign
   15544   // bits of the inputs before performing those operations.
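            // This uses the identity:  a u< b  <=>  (a ^ SIGN_BIT) s< (b ^ SIGN_BIT).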
   15545   if (FlipSigns) {
   15546     MVT EltVT = VT.getVectorElementType();
   15547     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
   15548                                  VT);
   15549     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
   15550     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
   15551   }
   15552 
   15553   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
   15554 
   15555   // If the logical-not of the result is required, perform that now.
   15556   if (Invert)
   15557     Result = DAG.getNOT(dl, Result, VT);
   15558 
   15559   if (MinMax)
   15560     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
   15561 
   15562   if (Subus)
   15563     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
   15564                          getZeroVector(VT, Subtarget, DAG, dl));
   15565 
   15566   return Result;
   15567 }
   15568 
   15569 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   15570 
   15571   MVT VT = Op.getSimpleValueType();
   15572 
   15573   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
   15574 
   15575   assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
   15576          && "SetCC type must be 8-bit or 1-bit integer");
   15577   SDValue Op0 = Op.getOperand(0);
   15578   SDValue Op1 = Op.getOperand(1);
   15579   SDLoc dl(Op);
   15580   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   15581 
   15582   // Optimize to BT if possible.
   15583   // Lower (X & (1 << N)) == 0 to BT(X, N).
   15584   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
   15585   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
   15586   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
   15587       isNullConstant(Op1) &&
   15588       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   15589     if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
   15590       if (VT == MVT::i1) {
   15591         NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
   15592                                DAG.getValueType(MVT::i1));
   15593         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
   15594       }
   15595       return NewSetCC;
   15596     }
   15597   }
   15598 
   15599   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
   15600   // these.
   15601   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
   15602       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   15603 
   15604     // If the input is a setcc, then reuse the input setcc or use a new one with
   15605     // the inverted condition.
   15606     if (Op0.getOpcode() == X86ISD::SETCC) {
   15607       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
   15608       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
   15609       if (!Invert)
   15610         return Op0;
   15611 
   15612       CCode = X86::GetOppositeBranchCondition(CCode);
   15613       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   15614                                   DAG.getConstant(CCode, dl, MVT::i8),
   15615                                   Op0.getOperand(1));
   15616       if (VT == MVT::i1) {
   15617         SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
   15618                             DAG.getValueType(MVT::i1));
   15619         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
   15620       }
   15621       return SetCC;
   15622     }
   15623   }
   15624   if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   15625     if (isOneConstant(Op1)) {
   15626       ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
   15627       return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
   15628     }
   15629     if (!isNullConstant(Op1)) {
   15630       SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
   15631       return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
   15632     }
   15633   }
   15634 
   15635   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
   15636   unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
   15637   if (X86CC == X86::COND_INVALID)
   15638     return SDValue();
   15639 
   15640   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
   15641   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
   15642   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   15643                               DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
   15644   if (VT == MVT::i1) {
   15645     SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
   15646                         DAG.getValueType(MVT::i1));
   15647     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
   15648   }
   15649   return SetCC;
   15650 }
   15651 
   15652 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
   15653   SDValue LHS = Op.getOperand(0);
   15654   SDValue RHS = Op.getOperand(1);
   15655   SDValue Carry = Op.getOperand(2);
   15656   SDValue Cond = Op.getOperand(3);
   15657   SDLoc DL(Op);
   15658 
   15659   assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
   15660   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
   15661 
   15662   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
   15663   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   15664   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
   15665   SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   15666                               DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
   15667   if (Op.getSimpleValueType() == MVT::i1) {
   15668     SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
   15669                         DAG.getValueType(MVT::i1));
   15670     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
   15671   }
   15672   return SetCC;
   15673 }
   15674 
   15675 /// Return true if opcode is a X86 logical comparison.
   15676 static bool isX86LogicalCmp(SDValue Op) {
   15677   unsigned Opc = Op.getNode()->getOpcode();
   15678   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
   15679       Opc == X86ISD::SAHF)
   15680     return true;
   15681   if (Op.getResNo() == 1 &&
   15682       (Opc == X86ISD::ADD ||
   15683        Opc == X86ISD::SUB ||
   15684        Opc == X86ISD::ADC ||
   15685        Opc == X86ISD::SBB ||
   15686        Opc == X86ISD::SMUL ||
   15687        Opc == X86ISD::UMUL ||
   15688        Opc == X86ISD::INC ||
   15689        Opc == X86ISD::DEC ||
   15690        Opc == X86ISD::OR ||
   15691        Opc == X86ISD::XOR ||
   15692        Opc == X86ISD::AND))
   15693     return true;
   15694 
   15695   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
   15696     return true;
   15697 
   15698   return false;
   15699 }
   15700 
    15701 /// Returns the "condition" node, which may be wrapped with a "truncate",
    15702 /// like this: (i1 (trunc (i8 X86ISD::SETCC))).
   15703 static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   15704   if (V.getOpcode() != ISD::TRUNCATE)
   15705     return V;
   15706 
   15707   SDValue VOp0 = V.getOperand(0);
   15708   if (VOp0.getOpcode() == ISD::AssertZext &&
   15709       V.getValueSizeInBits() ==
   15710       cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
   15711     return VOp0.getOperand(0);
   15712 
   15713   unsigned InBits = VOp0.getValueSizeInBits();
   15714   unsigned Bits = V.getValueSizeInBits();
   15715   if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
   15716     return V.getOperand(0);
   15717   return V;
   15718 }
   15719 
   15720 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   15721   bool addTest = true;
   15722   SDValue Cond  = Op.getOperand(0);
   15723   SDValue Op1 = Op.getOperand(1);
   15724   SDValue Op2 = Op.getOperand(2);
   15725   SDLoc DL(Op);
   15726   MVT VT = Op1.getSimpleValueType();
   15727   SDValue CC;
   15728 
   15729   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
   15730   // are available or VBLENDV if AVX is available.
   15731   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
   15732   if (Cond.getOpcode() == ISD::SETCC &&
   15733       ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
   15734        (Subtarget.hasSSE1() && VT == MVT::f32)) &&
   15735       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
   15736     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
   15737     int SSECC = translateX86FSETCC(
   15738         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
   15739 
   15740     if (SSECC != 8) {
   15741       if (Subtarget.hasAVX512()) {
   15742         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
   15743                                   DAG.getConstant(SSECC, DL, MVT::i8));
   15744         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
   15745       }
   15746 
   15747       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
   15748                                 DAG.getConstant(SSECC, DL, MVT::i8));
   15749 
   15750       // If we have AVX, we can use a variable vector select (VBLENDV) instead
   15751       // of 3 logic instructions for size savings and potentially speed.
   15752       // Unfortunately, there is no scalar form of VBLENDV.
   15753 
   15754       // If either operand is a constant, don't try this. We can expect to
   15755       // optimize away at least one of the logic instructions later in that
   15756       // case, so that sequence would be faster than a variable blend.
   15757 
   15758       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
   15759       // uses XMM0 as the selection register. That may need just as many
   15760       // instructions as the AND/ANDN/OR sequence due to register moves, so
   15761       // don't bother.
   15762 
   15763       if (Subtarget.hasAVX() &&
   15764           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
   15765 
   15766         // Convert to vectors, do a VSELECT, and convert back to scalar.
   15767         // All of the conversions should be optimized away.
   15768 
   15769         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
   15770         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
   15771         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
   15772         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
   15773 
   15774         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
   15775         VCmp = DAG.getBitcast(VCmpVT, VCmp);
   15776 
   15777         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
   15778 
   15779         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
   15780                            VSel, DAG.getIntPtrConstant(0, DL));
   15781       }
   15782       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
   15783       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
   15784       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
   15785     }
   15786   }
   15787 
   15788   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
   15789     SDValue Op1Scalar;
   15790     if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
   15791       Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
   15792     else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
   15793       Op1Scalar = Op1.getOperand(0);
   15794     SDValue Op2Scalar;
   15795     if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
   15796       Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
   15797     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
   15798       Op2Scalar = Op2.getOperand(0);
   15799     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
   15800       SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
   15801                                       Op1Scalar.getValueType(),
   15802                                       Cond, Op1Scalar, Op2Scalar);
   15803       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
   15804         return DAG.getBitcast(VT, newSelect);
   15805       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
   15806       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
   15807                          DAG.getIntPtrConstant(0, DL));
   15808     }
   15809   }
   15810 
   15811   if (VT == MVT::v4i1 || VT == MVT::v2i1) {
   15812     SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
   15813     Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
   15814                       DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
   15815     Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
   15816                       DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
   15817     SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
   15818                                     Cond, Op1, Op2);
   15819     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
   15820   }
   15821 
   15822   if (Cond.getOpcode() == ISD::SETCC) {
   15823     if (SDValue NewCond = LowerSETCC(Cond, DAG))
   15824       Cond = NewCond;
   15825   }
   15826 
   15827   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
   15828   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
   15829   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
   15830   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
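            // Below, 'CMP x, 1' sets the carry flag exactly when x == 0 (x u< 1), and
            // X86ISD::SETCC_CARRY then materializes CF as 0 or all-ones.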
   15831   if (Cond.getOpcode() == X86ISD::SETCC &&
   15832       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
   15833       isNullConstant(Cond.getOperand(1).getOperand(1))) {
   15834     SDValue Cmp = Cond.getOperand(1);
   15835 
   15836     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
   15837 
   15838     if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
   15839         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
   15840       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
   15841 
   15842       SDValue CmpOp0 = Cmp.getOperand(0);
   15843       // Apply further optimizations for special cases
   15844       // (select (x != 0), -1, 0) -> neg & sbb
   15845       // (select (x == 0), 0, -1) -> neg & sbb
   15846       if (isNullConstant(Y) &&
   15847             (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
   15848           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
   15849           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
   15850                                     DAG.getConstant(0, DL,
   15851                                                     CmpOp0.getValueType()),
   15852                                     CmpOp0);
   15853           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   15854                                     DAG.getConstant(X86::COND_B, DL, MVT::i8),
   15855                                     SDValue(Neg.getNode(), 1));
   15856           return Res;
   15857         }
   15858 
   15859       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
   15860                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
   15861       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   15862 
   15863       SDValue Res =   // Res = 0 or -1.
   15864         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   15865                     DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
   15866 
   15867       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
   15868         Res = DAG.getNOT(DL, Res, Res.getValueType());
   15869 
   15870       if (!isNullConstant(Op2))
   15871         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
   15872       return Res;
   15873     }
   15874   }
   15875 
   15876   // Look past (and (setcc_carry (cmp ...)), 1).
   15877   if (Cond.getOpcode() == ISD::AND &&
   15878       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
   15879       isOneConstant(Cond.getOperand(1)))
   15880     Cond = Cond.getOperand(0);
   15881 
   15882   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   15883   // setting operand in place of the X86ISD::SETCC.
   15884   unsigned CondOpcode = Cond.getOpcode();
   15885   if (CondOpcode == X86ISD::SETCC ||
   15886       CondOpcode == X86ISD::SETCC_CARRY) {
   15887     CC = Cond.getOperand(0);
   15888 
   15889     SDValue Cmp = Cond.getOperand(1);
   15890     unsigned Opc = Cmp.getOpcode();
   15891     MVT VT = Op.getSimpleValueType();
   15892 
   15893     bool IllegalFPCMov = false;
   15894     if (VT.isFloatingPoint() && !VT.isVector() &&
   15895         !isScalarFPTypeInSSEReg(VT))  // FPStack?
   15896       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
   15897 
   15898     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
   15899         Opc == X86ISD::BT) { // FIXME
   15900       Cond = Cmp;
   15901       addTest = false;
   15902     }
   15903   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   15904              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   15905              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   15906               Cond.getOperand(0).getValueType() != MVT::i8)) {
   15907     SDValue LHS = Cond.getOperand(0);
   15908     SDValue RHS = Cond.getOperand(1);
   15909     unsigned X86Opcode;
   15910     unsigned X86Cond;
   15911     SDVTList VTs;
   15912     switch (CondOpcode) {
   15913     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   15914     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   15915     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   15916     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   15917     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   15918     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   15919     default: llvm_unreachable("unexpected overflowing operator");
   15920     }
   15921     if (CondOpcode == ISD::UMULO)
   15922       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   15923                           MVT::i32);
   15924     else
   15925       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   15926 
   15927     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
   15928 
   15929     if (CondOpcode == ISD::UMULO)
   15930       Cond = X86Op.getValue(2);
   15931     else
   15932       Cond = X86Op.getValue(1);
   15933 
   15934     CC = DAG.getConstant(X86Cond, DL, MVT::i8);
   15935     addTest = false;
   15936   }
   15937 
   15938   if (addTest) {
   15939     // Look past the truncate if the high bits are known zero.
   15940     Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
   15941 
   15942     // We know the result of AND is compared against zero. Try to match
   15943     // it to BT.
   15944     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   15945       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
   15946         CC = NewSetCC.getOperand(0);
   15947         Cond = NewSetCC.getOperand(1);
   15948         addTest = false;
   15949       }
   15950     }
   15951   }
   15952 
   15953   if (addTest) {
   15954     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
   15955     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
   15956   }
   15957 
   15958   // a <  b ? -1 :  0 -> RES = ~setcc_carry
   15959   // a <  b ?  0 : -1 -> RES = setcc_carry
   15960   // a >= b ? -1 :  0 -> RES = setcc_carry
   15961   // a >= b ?  0 : -1 -> RES = ~setcc_carry
   15962   if (Cond.getOpcode() == X86ISD::SUB) {
   15963     Cond = ConvertCmpIfNecessary(Cond, DAG);
   15964     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
   15965 
   15966     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
   15967         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
   15968         (isNullConstant(Op1) || isNullConstant(Op2))) {
   15969       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   15970                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
   15971                                 Cond);
   15972       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
   15973         return DAG.getNOT(DL, Res, Res.getValueType());
   15974       return Res;
   15975     }
   15976   }
   15977 
   15978   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
   15979   // widen the cmov and push the truncate through. This avoids introducing a new
   15980   // branch during isel and doesn't add any extensions.
   15981   if (Op.getValueType() == MVT::i8 &&
   15982       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
   15983     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
   15984     if (T1.getValueType() == T2.getValueType() &&
   15985         // Blacklist CopyFromReg to avoid partial register stalls.
   15986         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
   15987       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
   15988       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
   15989       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   15990     }
   15991   }
   15992 
   15993   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   15994   // condition is true.
   15995   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   15996   SDValue Ops[] = { Op2, Op1, CC, Cond };
   15997   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
   15998 }
   15999 
   16000 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
   16001                                        const X86Subtarget &Subtarget,
   16002                                        SelectionDAG &DAG) {
   16003   MVT VT = Op->getSimpleValueType(0);
   16004   SDValue In = Op->getOperand(0);
   16005   MVT InVT = In.getSimpleValueType();
   16006   MVT VTElt = VT.getVectorElementType();
   16007   MVT InVTElt = InVT.getVectorElementType();
   16008   SDLoc dl(Op);
   16009 
   16010   // SKX processor
   16011   if ((InVTElt == MVT::i1) &&
   16012       (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
   16013         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
   16014 
   16015        ((Subtarget.hasBWI() && VT.is512BitVector() &&
   16016         VTElt.getSizeInBits() <= 16)) ||
   16017 
   16018        ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
   16019         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
   16020 
   16021        ((Subtarget.hasDQI() && VT.is512BitVector() &&
   16022         VTElt.getSizeInBits() >= 32))))
   16023     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   16024 
   16025   unsigned int NumElts = VT.getVectorNumElements();
   16026 
   16027   if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
   16028     return SDValue();
   16029 
   16030   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
   16031     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
   16032       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
   16033     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   16034   }
   16035 
   16036   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
   16037   MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
   16038   SDValue NegOne =
   16039    DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
   16040                    ExtVT);
   16041   SDValue Zero =
   16042    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
   16043 
   16044   SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
   16045   if (VT.is512BitVector())
   16046     return V;
   16047   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
   16048 }
   16049 
   16050 static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
   16051                                              const X86Subtarget &Subtarget,
   16052                                              SelectionDAG &DAG) {
   16053   SDValue In = Op->getOperand(0);
   16054   MVT VT = Op->getSimpleValueType(0);
   16055   MVT InVT = In.getSimpleValueType();
   16056   assert(VT.getSizeInBits() == InVT.getSizeInBits());
   16057 
   16058   MVT SVT = VT.getVectorElementType();
   16059   MVT InSVT = InVT.getVectorElementType();
   16060   assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
   16061 
   16062   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
   16063     return SDValue();
   16064   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
   16065     return SDValue();
   16066   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
   16067       !(VT.is256BitVector() && Subtarget.hasInt256()))
   16068     return SDValue();
   16069 
   16070   SDLoc dl(Op);
   16071 
   16072   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
   16073   if (VT.is256BitVector())
   16074     In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
   16075                      MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
   16076                      In, DAG.getIntPtrConstant(0, dl));
   16077 
   16078   // SSE41 targets can use the pmovsx* instructions directly.
   16079   if (Subtarget.hasSSE41())
   16080     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   16081 
   16082   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
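            // For example, for v16i8 -> v8i16 each byte is unpacked into the high half
            // of a 16-bit lane and then shifted right arithmetically by 8, which
            // replicates its sign bit into the low half.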
   16083   SDValue Curr = In;
   16084   MVT CurrVT = InVT;
   16085 
   16086   // As SRAI is only available on i16/i32 types, we expand only up to i32
   16087   // and handle i64 separately.
   16088   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
   16089     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
   16090     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
   16091     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
   16092     Curr = DAG.getBitcast(CurrVT, Curr);
   16093   }
   16094 
   16095   SDValue SignExt = Curr;
   16096   if (CurrVT != InVT) {
   16097     unsigned SignExtShift =
   16098         CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
   16099     SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
   16100                           DAG.getConstant(SignExtShift, dl, MVT::i8));
   16101   }
   16102 
   16103   if (CurrVT == VT)
   16104     return SignExt;
   16105 
   16106   if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
   16107     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
   16108                                DAG.getConstant(31, dl, MVT::i8));
   16109     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
   16110     return DAG.getBitcast(VT, Ext);
   16111   }
   16112 
   16113   return SDValue();
   16114 }
   16115 
   16116 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   16117                                 SelectionDAG &DAG) {
   16118   MVT VT = Op->getSimpleValueType(0);
   16119   SDValue In = Op->getOperand(0);
   16120   MVT InVT = In.getSimpleValueType();
   16121   SDLoc dl(Op);
   16122 
   16123   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
   16124     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
   16125 
   16126   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
   16127       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
   16128       (VT != MVT::v16i16 || InVT != MVT::v16i8))
   16129     return SDValue();
   16130 
   16131   if (Subtarget.hasInt256())
   16132     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   16133 
   16134   // Optimize vectors in AVX mode:
   16135   // sign extend v8i16 to v8i32 and
   16136   //             v4i32 to v4i64.
   16137   //
   16138   // Divide the input vector into two halves (for v4i32 the shuffle masks are
   16139   // { 0, 1, -1, -1 } and { 2, 3, -1, -1 }), use the vpmovsx instructions to
   16140   // extend each half (v4i32 -> v2i64; v8i16 -> v4i32), and concatenate the
   16141   // results back into the original VT.
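  // For example (a sketch of the v8i16 -> v8i32 case): OpLo keeps elements
  // 0..3, OpHi keeps elements 4..7, each half is sign-extended with vpmovsxwd
  // to v4i32, and the two halves are concatenated into the final v8i32.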
   16142 
   16143   unsigned NumElems = InVT.getVectorNumElements();
   16144   SDValue Undef = DAG.getUNDEF(InVT);
   16145 
   16146   SmallVector<int,8> ShufMask1(NumElems, -1);
   16147   for (unsigned i = 0; i != NumElems/2; ++i)
   16148     ShufMask1[i] = i;
   16149 
   16150   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
   16151 
   16152   SmallVector<int,8> ShufMask2(NumElems, -1);
   16153   for (unsigned i = 0; i != NumElems/2; ++i)
   16154     ShufMask2[i] = i + NumElems/2;
   16155 
   16156   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
   16157 
   16158   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
   16159                                 VT.getVectorNumElements()/2);
   16160 
   16161   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
   16162   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
   16163 
   16164   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   16165 }
   16166 
   16167 // Lower a truncating store. We need special lowering for vXi1 vectors.
   16168 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
   16169                                     SelectionDAG &DAG) {
   16170   StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
   16171   SDLoc dl(St);
   16172   EVT MemVT = St->getMemoryVT();
   16173   assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
   16174   assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
   16175          "Expected truncstore of i1 vector");
   16176 
   16177   SDValue Op = St->getValue();
   16178   MVT OpVT = Op.getValueType().getSimpleVT();
   16179   unsigned NumElts = OpVT.getVectorNumElements();
   16180   if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
   16181       NumElts == 16) {
   16182     // Truncate and store - everything is legal
   16183     Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
   16184     if (MemVT.getSizeInBits() < 8)
   16185       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
   16186                        DAG.getUNDEF(MVT::v8i1), Op,
   16187                        DAG.getIntPtrConstant(0, dl));
   16188     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
   16189                         St->getMemOperand());
   16190   }
   16191 
   16192   // Otherwise only a subset is legal; assume that we have only AVX-512F
   16193   if (NumElts <= 8) {
   16194     if (NumElts < 8) {
   16195       // Extend to an 8-element vector
   16196       MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
   16197       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
   16198                         DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
   16199     }
   16200     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
   16201     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
   16202                         St->getMemOperand());
   16203   }
   16204   // v32i8
   16205   assert(OpVT == MVT::v32i8 && "Unexpected operand type");
   16206   // Divide the vector into 2 parts and store each part separately
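  // Each v16i1 half occupies 16 bits, i.e. 2 bytes, which is why the second
  // half is stored at BasePtr + 2 below.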
   16207   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
   16208                             DAG.getIntPtrConstant(0, dl));
   16209   Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
   16210   SDValue BasePtr = St->getBasePtr();
   16211   SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
   16212                               St->getMemOperand());
   16213   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
   16214                             DAG.getIntPtrConstant(16, dl));
   16215   Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
   16216 
   16217   SDValue BasePtrHi =
   16218     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
   16219                 DAG.getConstant(2, dl, BasePtr.getValueType()));
   16220 
   16221   SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
   16222                               BasePtrHi, St->getMemOperand());
   16223   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
   16224 }
   16225 
   16226 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
   16227                                            const X86Subtarget &Subtarget,
   16228                                            SelectionDAG &DAG) {
   16229 
   16230   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   16231   SDLoc dl(Ld);
   16232   EVT MemVT = Ld->getMemoryVT();
   16233   assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
   16234          "Expected i1 vector load");
   16235   unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
   16236     ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
   16237   MVT VT = Op.getValueType().getSimpleVT();
   16238   unsigned NumElts = VT.getVectorNumElements();
   16239 
   16240   if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
   16241       NumElts == 16) {
   16242     // Load and extend - everything is legal
   16243     if (NumElts < 8) {
   16244       SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
   16245                                  Ld->getBasePtr(),
   16246                                  Ld->getMemOperand());
   16247       // Replace chain users with the new chain.
   16248       assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
   16249       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
   16250       MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
   16251       SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
   16252 
   16253       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
   16254                                    DAG.getIntPtrConstant(0, dl));
   16255     }
   16256     SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
   16257                                Ld->getBasePtr(),
   16258                                Ld->getMemOperand());
   16259     // Replace chain users with the new chain.
   16260     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
   16261     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
   16262 
   16263     // Finally, do a normal sign-extend to the desired register.
   16264     return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
   16265   }
   16266 
   16267   if (NumElts <= 8) {
   16268     // Otherwise only a subset is legal; assume that we have only AVX-512F
   16269     unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
   16270     MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
   16271     SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
   16272                               Ld->getBasePtr(),
   16273                               Ld->getMemOperand());
   16274     // Replace chain users with the new chain.
   16275     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
   16276     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
   16277 
   16278     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
   16279     SDValue BitVec = DAG.getBitcast(MaskVT, Load);
   16280 
   16281     if (NumElts == 8)
   16282       return DAG.getNode(ExtOpcode, dl, VT, BitVec);
   16283 
   16284     // Take care of the remaining v4i1 and v2i1 cases: extend to 8 elements
   16284     // and extract the low part.
   16285 
   16286     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
   16287     SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
   16288     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
   16289                         DAG.getIntPtrConstant(0, dl));
   16290   }
   16291 
   16292   assert(VT == MVT::v32i8 && "Unexpected extload type");
   16293 
   16294   SmallVector<SDValue, 2> Chains;
   16295 
   16296   SDValue BasePtr = Ld->getBasePtr();
   16297   SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
   16298                                Ld->getBasePtr(),
   16299                                Ld->getMemOperand());
   16300   Chains.push_back(LoadLo.getValue(1));
   16301 
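  // The second v16i1 half lives 16 bits (2 bytes) past the base pointer.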
   16302   SDValue BasePtrHi =
   16303     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
   16304                 DAG.getConstant(2, dl, BasePtr.getValueType()));
   16305 
   16306   SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
   16307                                BasePtrHi,
   16308                                Ld->getMemOperand());
   16309   Chains.push_back(LoadHi.getValue(1));
   16310   SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   16311   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
   16312 
   16313   SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
   16314   SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
   16315   return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
   16316 }
   16317 
   16318 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
   16319 // may emit an illegal shuffle but the expansion is still better than scalar
   16320 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
   16321 // we'll emit a shuffle and an arithmetic shift.
   16322 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
   16323 // TODO: It is possible to support ZExt by zeroing the undef values during
   16324 // the shuffle phase or after the shuffle.
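// As a rough example (an anyext load of <8 x i8> into <8 x i16> on a 64-bit
// target): the 8 bytes are loaded as a single i64 scalar, placed into a vector,
// bitcast to <16 x i8>, and shuffled so that byte i lands in element 2*i, i.e.
// the low byte of each i16 lane; the high bytes remain undef.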
   16325 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
   16326                                  SelectionDAG &DAG) {
   16327   MVT RegVT = Op.getSimpleValueType();
   16328   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
   16329   assert(RegVT.isInteger() &&
   16330          "We only custom lower integer vector sext loads.");
   16331 
   16332   // Nothing useful we can do without SSE2 shuffles.
   16333   assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
   16334 
   16335   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   16336   SDLoc dl(Ld);
   16337   EVT MemVT = Ld->getMemoryVT();
   16338   if (MemVT.getScalarType() == MVT::i1)
   16339     return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
   16340 
   16341   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   16342   unsigned RegSz = RegVT.getSizeInBits();
   16343 
   16344   ISD::LoadExtType Ext = Ld->getExtensionType();
   16345 
   16346   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
   16347          && "Only anyext and sext are currently implemented.");
   16348   assert(MemVT != RegVT && "Cannot extend to the same type");
   16349   assert(MemVT.isVector() && "Must load a vector from memory");
   16350 
   16351   unsigned NumElems = RegVT.getVectorNumElements();
   16352   unsigned MemSz = MemVT.getSizeInBits();
   16353   assert(RegSz > MemSz && "Register size must be greater than the mem size");
   16354 
   16355   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
   16356     // The only way in which we have a legal 256-bit vector result but not the
   16357     // integer 256-bit operations needed to directly lower a sextload is if we
   16358     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
   16359     // a 128-bit vector and a normal sign_extend to 256-bits that should get
   16360     // correctly legalized. We do this late to allow the canonical form of
   16361     // sextload to persist throughout the rest of the DAG combiner -- it wants
   16362     // to fold together any extensions it can, and so will fuse a sign_extend
   16363     // of an sextload into a sextload targeting a wider value.
   16364     SDValue Load;
   16365     if (MemSz == 128) {
   16366       // Just switch this to a normal load.
   16367       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
   16368                                        "it must be a legal 128-bit vector "
   16369                                        "type!");
   16370       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
   16371                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
   16372                   Ld->isInvariant(), Ld->getAlignment());
   16373     } else {
   16374       assert(MemSz < 128 &&
   16375              "Can't extend a type wider than 128 bits to a 256 bit vector!");
   16376       // Do an sext load to a 128-bit vector type. We want to use the same
   16377       // number of elements, but elements half as wide. This will end up being
   16378       // recursively lowered by this routine, but will succeed as we definitely
   16379       // have all the necessary features if we're using AVX1.
   16380       EVT HalfEltVT =
   16381           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
   16382       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
   16383       Load =
   16384           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
   16385                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
   16386                          Ld->isNonTemporal(), Ld->isInvariant(),
   16387                          Ld->getAlignment());
   16388     }
   16389 
   16390     // Replace chain users with the new chain.
   16391     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
   16392     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
   16393 
   16394     // Finally, do a normal sign-extend to the desired register.
   16395     return DAG.getSExtOrTrunc(Load, dl, RegVT);
   16396   }
   16397 
   16398   // All sizes must be a power of two.
   16399   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
   16400          "Non-power-of-two elements are not custom lowered!");
   16401 
   16402   // Attempt to load the original value using scalar loads.
   16403   // Find the largest scalar type that divides the total loaded size.
   16404   MVT SclrLoadTy = MVT::i8;
   16405   for (MVT Tp : MVT::integer_valuetypes()) {
   16406     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
   16407       SclrLoadTy = Tp;
   16408     }
   16409   }
   16410 
   16411   // On 32-bit systems we can't use 64-bit integer loads directly. Try f64 instead.
   16412   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
   16413       (64 <= MemSz))
   16414     SclrLoadTy = MVT::f64;
   16415 
   16416   // Calculate the number of scalar loads that we need to perform
   16417   // in order to load our vector from memory.
   16418   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
   16419 
   16420   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
   16421          "Can only lower sext loads with a single scalar load!");
   16422 
   16423   unsigned loadRegSize = RegSz;
   16424   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
   16425     loadRegSize = 128;
   16426 
   16427   // Represent our vector as a sequence of elements which are the
   16428   // largest scalar that we can load.
   16429   EVT LoadUnitVecVT = EVT::getVectorVT(
   16430       *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
   16431 
   16432   // Represent the data using the same element type that is stored in
   16433   // memory. In practice, we "widen" MemVT.
   16434   EVT WideVecVT =
   16435       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   16436                        loadRegSize / MemVT.getScalarSizeInBits());
   16437 
   16438   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
   16439          "Invalid vector type");
   16440 
   16441   // We can't shuffle using an illegal type.
   16442   assert(TLI.isTypeLegal(WideVecVT) &&
   16443          "We only lower types that form legal widened vector types");
   16444 
   16445   SmallVector<SDValue, 8> Chains;
   16446   SDValue Ptr = Ld->getBasePtr();
   16447   SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
   16448                                       TLI.getPointerTy(DAG.getDataLayout()));
   16449   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
   16450 
   16451   for (unsigned i = 0; i < NumLoads; ++i) {
   16452     // Perform a single load.
   16453     SDValue ScalarLoad =
   16454         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
   16455                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
   16456                     Ld->getAlignment());
   16457     Chains.push_back(ScalarLoad.getValue(1));
   16458     // Create the vector for the first element using SCALAR_TO_VECTOR in order
   16459     // to avoid another round of DAG combining.
   16460     if (i == 0)
   16461       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
   16462     else
   16463       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
   16464                         ScalarLoad, DAG.getIntPtrConstant(i, dl));
   16465 
   16466     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   16467   }
   16468 
   16469   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   16470 
   16471   // Bitcast the loaded value to a vector of the original element type, in
   16472   // the size of the target vector type.
   16473   SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
   16474   unsigned SizeRatio = RegSz / MemSz;
   16475 
   16476   if (Ext == ISD::SEXTLOAD) {
   16477     // If we have SSE4.1, we can directly emit a VSEXT node.
   16478     if (Subtarget.hasSSE41()) {
   16479       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
   16480       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   16481       return Sext;
   16482     }
   16483 
   16484     // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
   16485     // lanes.
   16486     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
   16487            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
   16488 
   16489     SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
   16490     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   16491     return Shuff;
   16492   }
   16493 
   16494   // Redistribute the loaded elements into the different locations.
   16495   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   16496   for (unsigned i = 0; i != NumElems; ++i)
   16497     ShuffleVec[i * SizeRatio] = i;
   16498 
   16499   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
   16500                                        DAG.getUNDEF(WideVecVT), ShuffleVec);
   16501 
   16502   // Bitcast to the requested type.
   16503   Shuff = DAG.getBitcast(RegVT, Shuff);
   16504   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   16505   return Shuff;
   16506 }
   16507 
   16508 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
   16509 /// each of which has no other use apart from the AND / OR.
   16510 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
   16511   Opc = Op.getOpcode();
   16512   if (Opc != ISD::OR && Opc != ISD::AND)
   16513     return false;
   16514   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   16515           Op.getOperand(0).hasOneUse() &&
   16516           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
   16517           Op.getOperand(1).hasOneUse());
   16518 }
   16519 
   16520 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
   16521 /// SETCC node has a single use.
   16522 static bool isXor1OfSetCC(SDValue Op) {
   16523   if (Op.getOpcode() != ISD::XOR)
   16524     return false;
   16525   if (isOneConstant(Op.getOperand(1)))
   16526     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   16527            Op.getOperand(0).hasOneUse();
   16528   return false;
   16529 }
   16530 
   16531 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   16532   bool addTest = true;
   16533   SDValue Chain = Op.getOperand(0);
   16534   SDValue Cond  = Op.getOperand(1);
   16535   SDValue Dest  = Op.getOperand(2);
   16536   SDLoc dl(Op);
   16537   SDValue CC;
   16538   bool Inverted = false;
   16539 
   16540   if (Cond.getOpcode() == ISD::SETCC) {
   16541     // Check for setcc([su]{add,sub,mul}o == 0).
   16542     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
   16543         isNullConstant(Cond.getOperand(1)) &&
   16544         Cond.getOperand(0).getResNo() == 1 &&
   16545         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
   16546          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
   16547          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
   16548          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
   16549          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
   16550          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
   16551       Inverted = true;
   16552       Cond = Cond.getOperand(0);
   16553     } else {
   16554       if (SDValue NewCond = LowerSETCC(Cond, DAG))
   16555         Cond = NewCond;
   16556     }
   16557   }
   16558 #if 0
   16559   // FIXME: LowerXALUO doesn't handle these!!
   16560   else if (Cond.getOpcode() == X86ISD::ADD  ||
   16561            Cond.getOpcode() == X86ISD::SUB  ||
   16562            Cond.getOpcode() == X86ISD::SMUL ||
   16563            Cond.getOpcode() == X86ISD::UMUL)
   16564     Cond = LowerXALUO(Cond, DAG);
   16565 #endif
   16566 
   16567   // Look past (and (setcc_carry (cmp ...)), 1).
   16568   if (Cond.getOpcode() == ISD::AND &&
   16569       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
   16570       isOneConstant(Cond.getOperand(1)))
   16571     Cond = Cond.getOperand(0);
   16572 
   16573   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   16574   // setting operand in place of the X86ISD::SETCC.
   16575   unsigned CondOpcode = Cond.getOpcode();
   16576   if (CondOpcode == X86ISD::SETCC ||
   16577       CondOpcode == X86ISD::SETCC_CARRY) {
   16578     CC = Cond.getOperand(0);
   16579 
   16580     SDValue Cmp = Cond.getOperand(1);
   16581     unsigned Opc = Cmp.getOpcode();
   16582     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
   16583     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
   16584       Cond = Cmp;
   16585       addTest = false;
   16586     } else {
   16587       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
   16588       default: break;
   16589       case X86::COND_O:
   16590       case X86::COND_B:
   16591         // These can only come from an arithmetic instruction with overflow,
   16592         // e.g. SADDO, UADDO.
   16593         Cond = Cond.getNode()->getOperand(1);
   16594         addTest = false;
   16595         break;
   16596       }
   16597     }
   16598   }
   16599   CondOpcode = Cond.getOpcode();
   16600   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   16601       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   16602       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   16603        Cond.getOperand(0).getValueType() != MVT::i8)) {
   16604     SDValue LHS = Cond.getOperand(0);
   16605     SDValue RHS = Cond.getOperand(1);
   16606     unsigned X86Opcode;
   16607     unsigned X86Cond;
   16608     SDVTList VTs;
   16609     // Keep this in sync with LowerXALUO, otherwise we might create redundant
   16610     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
   16611     // X86ISD::INC).
   16612     switch (CondOpcode) {
   16613     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   16614     case ISD::SADDO:
   16615       if (isOneConstant(RHS)) {
   16616         X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
   16617         break;
   16618       }
   16619       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   16620     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   16621     case ISD::SSUBO:
   16622       if (isOneConstant(RHS)) {
   16623         X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
   16624         break;
   16625       }
   16626       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   16627     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   16628     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   16629     default: llvm_unreachable("unexpected overflowing operator");
   16630     }
   16631     if (Inverted)
   16632       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
   16633     if (CondOpcode == ISD::UMULO)
   16634       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   16635                           MVT::i32);
   16636     else
   16637       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   16638 
   16639     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
   16640 
   16641     if (CondOpcode == ISD::UMULO)
   16642       Cond = X86Op.getValue(2);
   16643     else
   16644       Cond = X86Op.getValue(1);
   16645 
   16646     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
   16647     addTest = false;
   16648   } else {
   16649     unsigned CondOpc;
   16650     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
   16651       SDValue Cmp = Cond.getOperand(0).getOperand(1);
   16652       if (CondOpc == ISD::OR) {
   16653         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
   16654         // two branches instead of an explicit OR instruction with a
   16655         // separate test.
   16656         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   16657             isX86LogicalCmp(Cmp)) {
   16658           CC = Cond.getOperand(0).getOperand(0);
   16659           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   16660                               Chain, Dest, CC, Cmp);
   16661           CC = Cond.getOperand(1).getOperand(0);
   16662           Cond = Cmp;
   16663           addTest = false;
   16664         }
   16665       } else { // ISD::AND
   16666         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
   16667         // two branches instead of an explicit AND instruction with a
   16668         // separate test. However, we only do this if this block doesn't
   16669         // have a fall-through edge, because this requires an explicit
   16670         // jmp when the condition is false.
   16671         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   16672             isX86LogicalCmp(Cmp) &&
   16673             Op.getNode()->hasOneUse()) {
   16674           X86::CondCode CCode =
   16675             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   16676           CCode = X86::GetOppositeBranchCondition(CCode);
   16677           CC = DAG.getConstant(CCode, dl, MVT::i8);
   16678           SDNode *User = *Op.getNode()->use_begin();
   16679           // Look for an unconditional branch following this conditional branch.
   16680           // We need this because we need to reverse the successors in order
   16681           // to implement FCMP_OEQ.
   16682           if (User->getOpcode() == ISD::BR) {
   16683             SDValue FalseBB = User->getOperand(1);
   16684             SDNode *NewBR =
   16685               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   16686             assert(NewBR == User);
   16687             (void)NewBR;
   16688             Dest = FalseBB;
   16689 
   16690             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   16691                                 Chain, Dest, CC, Cmp);
   16692             X86::CondCode CCode =
   16693               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
   16694             CCode = X86::GetOppositeBranchCondition(CCode);
   16695             CC = DAG.getConstant(CCode, dl, MVT::i8);
   16696             Cond = Cmp;
   16697             addTest = false;
   16698           }
   16699         }
   16700       }
   16701     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
   16702       // Recognize 'xorb (setcc), 1' patterns. The xor inverts the condition.
   16703       // It should be transformed by the DAG combiner except when the condition
   16704       // is set by an arithmetic-with-overflow node.
   16705       X86::CondCode CCode =
   16706         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   16707       CCode = X86::GetOppositeBranchCondition(CCode);
   16708       CC = DAG.getConstant(CCode, dl, MVT::i8);
   16709       Cond = Cond.getOperand(0).getOperand(1);
   16710       addTest = false;
   16711     } else if (Cond.getOpcode() == ISD::SETCC &&
   16712                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
   16713       // For FCMP_OEQ, we can emit
   16714       // two branches instead of an explicit AND instruction with a
   16715       // separate test. However, we only do this if this block doesn't
   16716       // have a fall-through edge, because this requires an explicit
   16717       // jmp when the condition is false.
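      // In effect this emits (sketch): compare, then "jne false; jp false;
      // <true path>", since OEQ holds exactly when ZF is set and PF is clear.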
   16718       if (Op.getNode()->hasOneUse()) {
   16719         SDNode *User = *Op.getNode()->use_begin();
   16720         // Look for an unconditional branch following this conditional branch.
   16721         // We need this because we need to reverse the successors in order
   16722         // to implement FCMP_OEQ.
   16723         if (User->getOpcode() == ISD::BR) {
   16724           SDValue FalseBB = User->getOperand(1);
   16725           SDNode *NewBR =
   16726             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   16727           assert(NewBR == User);
   16728           (void)NewBR;
   16729           Dest = FalseBB;
   16730 
   16731           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   16732                                     Cond.getOperand(0), Cond.getOperand(1));
   16733           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   16734           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
   16735           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   16736                               Chain, Dest, CC, Cmp);
   16737           CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
   16738           Cond = Cmp;
   16739           addTest = false;
   16740         }
   16741       }
   16742     } else if (Cond.getOpcode() == ISD::SETCC &&
   16743                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
   16744       // For FCMP_UNE, we can emit
   16745       // two branches instead of an explicit AND instruction with a
   16746       // separate test. However, we only do this if this block doesn't
   16747       // have a fall-through edge, because this requires an explicit
   16748       // jmp when the condition is false.
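      // In effect this emits (sketch): compare, then "jne true; jnp false;
      // jmp true", since UNE holds when ZF is clear or PF is set (unordered).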
   16749       if (Op.getNode()->hasOneUse()) {
   16750         SDNode *User = *Op.getNode()->use_begin();
   16751         // Look for an unconditional branch following this conditional branch.
   16752         // We need this because we need to reverse the successors in order
   16753         // to implement FCMP_UNE.
   16754         if (User->getOpcode() == ISD::BR) {
   16755           SDValue FalseBB = User->getOperand(1);
   16756           SDNode *NewBR =
   16757             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   16758           assert(NewBR == User);
   16759           (void)NewBR;
   16760 
   16761           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   16762                                     Cond.getOperand(0), Cond.getOperand(1));
   16763           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   16764           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
   16765           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   16766                               Chain, Dest, CC, Cmp);
   16767           CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
   16768           Cond = Cmp;
   16769           addTest = false;
   16770           Dest = FalseBB;
   16771         }
   16772       }
   16773     }
   16774   }
   16775 
   16776   if (addTest) {
   16777     // Look past the truncate if the high bits are known zero.
   16778     Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
   16779 
   16780     // We know the result of AND is compared against zero. Try to match
   16781     // it to BT.
   16782     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   16783       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
   16784         CC = NewSetCC.getOperand(0);
   16785         Cond = NewSetCC.getOperand(1);
   16786         addTest = false;
   16787       }
   16788     }
   16789   }
   16790 
   16791   if (addTest) {
   16792     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
   16793     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
   16794     Cond = EmitTest(Cond, X86Cond, dl, DAG);
   16795   }
   16796   Cond = ConvertCmpIfNecessary(Cond, DAG);
   16797   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   16798                      Chain, Dest, CC, Cond);
   16799 }
   16800 
   16801 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
   16802 // Calls to _alloca are needed to probe the stack when allocating more than 4K
   16803 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
   16804 // that the guard pages used by the OS virtual memory manager are allocated in
   16805 // correct sequence.
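// When no stack probing or segmented-stack handling is needed, the lowering
// below reduces to roughly:
//   Result = SP - Size;
//   if (Align > StackAlign) Result &= -(uint64_t)Align;
//   SP = Result;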
   16806 SDValue
   16807 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   16808                                            SelectionDAG &DAG) const {
   16809   MachineFunction &MF = DAG.getMachineFunction();
   16810   bool SplitStack = MF.shouldSplitStack();
   16811   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
   16812                SplitStack;
   16813   SDLoc dl(Op);
   16814 
   16815   // Get the inputs.
   16816   SDNode *Node = Op.getNode();
   16817   SDValue Chain = Op.getOperand(0);
   16818   SDValue Size  = Op.getOperand(1);
   16819   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   16820   EVT VT = Node->getValueType(0);
   16821 
   16822   // Chain the dynamic stack allocation so that it doesn't modify the stack
   16823   // pointer when other instructions are using the stack.
   16824   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
   16825 
   16826   bool Is64Bit = Subtarget.is64Bit();
   16827   MVT SPTy = getPointerTy(DAG.getDataLayout());
   16828 
   16829   SDValue Result;
   16830   if (!Lower) {
   16831     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   16832     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
   16833     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
   16834                     " not tell us which reg is the stack pointer!");
   16835 
   16836     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   16837     Chain = SP.getValue(1);
   16838     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   16839     unsigned StackAlign = TFI.getStackAlignment();
   16840     Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
   16841     if (Align > StackAlign)
   16842       Result = DAG.getNode(ISD::AND, dl, VT, Result,
   16843                          DAG.getConstant(-(uint64_t)Align, dl, VT));
   16844     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
   16845   } else if (SplitStack) {
   16846     MachineRegisterInfo &MRI = MF.getRegInfo();
   16847 
   16848     if (Is64Bit) {
   16849       // The 64-bit implementation of segmented stacks needs to clobber both r10
   16850       // and r11. This makes it impossible to use it along with nested parameters.
   16851       const Function *F = MF.getFunction();
   16852       for (const auto &A : F->args()) {
   16853         if (A.hasNestAttr())
   16854           report_fatal_error("Cannot use segmented stacks with functions that "
   16855                              "have nested arguments.");
   16856       }
   16857     }
   16858 
   16859     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
   16860     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
   16861     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
   16862     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
   16863                                 DAG.getRegister(Vreg, SPTy));
   16864   } else {
   16865     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   16866     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
   16867     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
   16868 
   16869     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   16870     unsigned SPReg = RegInfo->getStackRegister();
   16871     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
   16872     Chain = SP.getValue(1);
   16873 
   16874     if (Align) {
   16875       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
   16876                        DAG.getConstant(-(uint64_t)Align, dl, VT));
   16877       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
   16878     }
   16879 
   16880     Result = SP;
   16881   }
   16882 
   16883   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
   16884                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
   16885 
   16886   SDValue Ops[2] = {Result, Chain};
   16887   return DAG.getMergeValues(Ops, dl);
   16888 }
   16889 
   16890 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   16891   MachineFunction &MF = DAG.getMachineFunction();
   16892   auto PtrVT = getPointerTy(MF.getDataLayout());
   16893   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   16894 
   16895   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   16896   SDLoc DL(Op);
   16897 
   16898   if (!Subtarget.is64Bit() ||
   16899       Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
   16900     // vastart just stores the address of the VarArgsFrameIndex slot into the
   16901     // memory location argument.
   16902     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   16903     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
   16904                         MachinePointerInfo(SV), false, false, 0);
   16905   }
   16906 
   16907   // __va_list_tag:
   16908   //   gp_offset         (0 - 6 * 8)
   16909   //   fp_offset         (48 - 48 + 8 * 16)
   16910   //   overflow_arg_area (points to parameters passed in memory).
   16911   //   reg_save_area
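  //
  // A sketch of the corresponding C layout (offsets as stored below):
  //   struct __va_list_tag {
  //     unsigned gp_offset;       // at offset 0
  //     unsigned fp_offset;       // at offset 4
  //     void *overflow_arg_area;  // at offset 8
  //     void *reg_save_area;      // at offset 16 (12 when pointers are 32-bit)
  //   };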
   16912   SmallVector<SDValue, 8> MemOps;
   16913   SDValue FIN = Op.getOperand(1);
   16914   // Store gp_offset
   16915   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
   16916                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
   16917                                                DL, MVT::i32),
   16918                                FIN, MachinePointerInfo(SV), false, false, 0);
   16919   MemOps.push_back(Store);
   16920 
   16921   // Store fp_offset
   16922   FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
   16923   Store = DAG.getStore(Op.getOperand(0), DL,
   16924                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
   16925                                        MVT::i32),
   16926                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
   16927   MemOps.push_back(Store);
   16928 
   16929   // Store ptr to overflow_arg_area
   16930   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
   16931   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   16932   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
   16933                        MachinePointerInfo(SV, 8),
   16934                        false, false, 0);
   16935   MemOps.push_back(Store);
   16936 
   16937   // Store ptr to reg_save_area.
   16938   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
   16939       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
   16940   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
   16941   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
   16942       SV, Subtarget.isTarget64BitLP64() ? 16 : 12), false, false, 0);
   16943   MemOps.push_back(Store);
   16944   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
   16945 }
   16946 
   16947 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   16948   assert(Subtarget.is64Bit() &&
   16949          "LowerVAARG only handles 64-bit va_arg!");
   16950   assert(Op.getNode()->getNumOperands() == 4);
   16951 
   16952   MachineFunction &MF = DAG.getMachineFunction();
   16953   if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
   16954     // The Win64 ABI uses char* instead of a structure.
   16955     return DAG.expandVAArg(Op.getNode());
   16956 
   16957   SDValue Chain = Op.getOperand(0);
   16958   SDValue SrcPtr = Op.getOperand(1);
   16959   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   16960   unsigned Align = Op.getConstantOperandVal(3);
   16961   SDLoc dl(Op);
   16962 
   16963   EVT ArgVT = Op.getNode()->getValueType(0);
   16964   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   16965   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
   16966   uint8_t ArgMode;
   16967 
   16968   // Decide which area this value should be read from.
   16969   // TODO: Implement the AMD64 ABI in its entirety. This simple
   16970   // selection mechanism works only for the basic types.
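  // For example, a double (floating point, ArgSize <= 16) is read via the
  // fp_offset / XMM save area (ArgMode == 2), while an i32 or i64 is read via
  // gp_offset (ArgMode == 1).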
   16971   if (ArgVT == MVT::f80) {
   16972     llvm_unreachable("va_arg for f80 not yet implemented");
   16973   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
   16974     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
   16975   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
   16976     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
   16977   } else {
   16978     llvm_unreachable("Unhandled argument type in LowerVAARG");
   16979   }
   16980 
   16981   if (ArgMode == 2) {
   16982     // Sanity Check: Make sure using fp_offset makes sense.
   16983     assert(!Subtarget.useSoftFloat() &&
   16984            !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
   16985            Subtarget.hasSSE1());
   16986   }
   16987 
   16988   // Insert a VAARG_64 node into the DAG.
   16989   // VAARG_64 returns two values: the variable argument address and the chain.
   16990   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
   16991                        DAG.getConstant(ArgMode, dl, MVT::i8),
   16992                        DAG.getConstant(Align, dl, MVT::i32)};
   16993   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
   16994   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
   16995                                           VTs, InstOps, MVT::i64,
   16996                                           MachinePointerInfo(SV),
   16997                                           /*Align=*/0,
   16998                                           /*Volatile=*/false,
   16999                                           /*ReadMem=*/true,
   17000                                           /*WriteMem=*/true);
   17001   Chain = VAARG.getValue(1);
   17002 
   17003   // Load the next argument and return it
   17004   return DAG.getLoad(ArgVT, dl,
   17005                      Chain,
   17006                      VAARG,
   17007                      MachinePointerInfo(),
   17008                      false, false, false, 0);
   17009 }
   17010 
   17011 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
   17012                            SelectionDAG &DAG) {
   17013   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
   17014   // where a va_list is still an i8*.
   17015   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
   17016   if (Subtarget.isCallingConvWin64(
   17017         DAG.getMachineFunction().getFunction()->getCallingConv()))
   17018     // Probably a Win64 va_copy.
   17019     return DAG.expandVACopy(Op.getNode());
   17020 
   17021   SDValue Chain = Op.getOperand(0);
   17022   SDValue DstPtr = Op.getOperand(1);
   17023   SDValue SrcPtr = Op.getOperand(2);
   17024   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   17025   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   17026   SDLoc DL(Op);
   17027 
   17028   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
   17029                        DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
   17030                        false, false,
   17031                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
   17032 }
   17033 
   17034 /// Handle vector element shifts where the shift amount is a constant.
   17035 /// Takes immediate version of shift as input.
   17036 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
   17037                                           SDValue SrcOp, uint64_t ShiftAmt,
   17038                                           SelectionDAG &DAG) {
   17039   MVT ElementType = VT.getVectorElementType();
   17040 
   17041   // Fold this packed shift into its first operand if ShiftAmt is 0.
   17042   if (ShiftAmt == 0)
   17043     return SrcOp;
   17044 
   17045   // Check for ShiftAmt >= element width
   17046   if (ShiftAmt >= ElementType.getSizeInBits()) {
   17047     if (Opc == X86ISD::VSRAI)
   17048       ShiftAmt = ElementType.getSizeInBits() - 1;
   17049     else
   17050       return DAG.getConstant(0, dl, VT);
   17051   }
   17052 
   17053   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
   17054          && "Unknown target vector shift-by-constant node");
   17055 
   17056   // Fold this packed vector shift into a build vector if SrcOp is a
   17057   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
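  // For instance, VSHLI on the constant vector <1, 2, 3, 4> with ShiftAmt == 2
  // folds to the build vector <4, 8, 12, 16>.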
   17058   if (VT == SrcOp.getSimpleValueType() &&
   17059       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
   17060     SmallVector<SDValue, 8> Elts;
   17061     unsigned NumElts = SrcOp->getNumOperands();
   17062     ConstantSDNode *ND;
   17063 
   17064     switch(Opc) {
   17065     default: llvm_unreachable("Unknown opcode!");
   17066     case X86ISD::VSHLI:
   17067       for (unsigned i=0; i!=NumElts; ++i) {
   17068         SDValue CurrentOp = SrcOp->getOperand(i);
   17069         if (CurrentOp->isUndef()) {
   17070           Elts.push_back(CurrentOp);
   17071           continue;
   17072         }
   17073         ND = cast<ConstantSDNode>(CurrentOp);
   17074         const APInt &C = ND->getAPIntValue();
   17075         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
   17076       }
   17077       break;
   17078     case X86ISD::VSRLI:
   17079       for (unsigned i=0; i!=NumElts; ++i) {
   17080         SDValue CurrentOp = SrcOp->getOperand(i);
   17081         if (CurrentOp->isUndef()) {
   17082           Elts.push_back(CurrentOp);
   17083           continue;
   17084         }
   17085         ND = cast<ConstantSDNode>(CurrentOp);
   17086         const APInt &C = ND->getAPIntValue();
   17087         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
   17088       }
   17089       break;
   17090     case X86ISD::VSRAI:
   17091       for (unsigned i=0; i!=NumElts; ++i) {
   17092         SDValue CurrentOp = SrcOp->getOperand(i);
   17093         if (CurrentOp->isUndef()) {
   17094           Elts.push_back(CurrentOp);
   17095           continue;
   17096         }
   17097         ND = cast<ConstantSDNode>(CurrentOp);
   17098         const APInt &C = ND->getAPIntValue();
   17099         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
   17100       }
   17101       break;
   17102     }
   17103 
   17104     return DAG.getBuildVector(VT, dl, Elts);
   17105   }
   17106 
   17107   return DAG.getNode(Opc, dl, VT, SrcOp,
   17108                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
   17109 }
   17110 
   17111 /// Handle vector element shifts where the shift amount may or may not be a
   17112 /// constant. Takes immediate version of shift as input.
   17113 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
   17114                                    SDValue SrcOp, SDValue ShAmt,
   17115                                    SelectionDAG &DAG) {
   17116   MVT SVT = ShAmt.getSimpleValueType();
   17117   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
   17118 
   17119   // Catch shift-by-constant.
   17120   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
   17121     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
   17122                                       CShAmt->getZExtValue(), DAG);
   17123 
   17124   // Change opcode to non-immediate version
   17125   switch (Opc) {
   17126     default: llvm_unreachable("Unknown target vector shift node");
   17127     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
   17128     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
   17129     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   17130   }
   17131 
   17132   const X86Subtarget &Subtarget =
   17133       static_cast<const X86Subtarget &>(DAG.getSubtarget());
   17134   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
   17135       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
   17136     // Let the shuffle legalizer expand this shift amount node.
   17137     SDValue Op0 = ShAmt.getOperand(0);
   17138     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
   17139     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
   17140   } else {
   17141     // Need to build a vector containing shift amount.
   17142     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
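    // For example, an i32 amount A becomes the v4i32 vector <A, 0, undef,
    // undef>, whose low 64 bits are A zero-extended.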
   17143     SmallVector<SDValue, 4> ShOps;
   17144     ShOps.push_back(ShAmt);
   17145     if (SVT == MVT::i32) {
   17146       ShOps.push_back(DAG.getConstant(0, dl, SVT));
   17147       ShOps.push_back(DAG.getUNDEF(SVT));
   17148     }
   17149     ShOps.push_back(DAG.getUNDEF(SVT));
   17150 
   17151     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
   17152     ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
   17153   }
   17154 
   17155   // The return type has to be a 128-bit type with the same element
   17156   // type as the input type.
   17157   MVT EltVT = VT.getVectorElementType();
   17158   MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
   17159 
   17160   ShAmt = DAG.getBitcast(ShVT, ShAmt);
   17161   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   17162 }
   17163 
   17164 /// \brief Return Mask with the necessary casting or extending
   17165 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
   17166 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
   17167                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
   17168                            const SDLoc &dl) {
   17169 
   17170   if (isAllOnesConstant(Mask))
   17171     return DAG.getTargetConstant(1, dl, MaskVT);
   17172   if (X86::isZeroNode(Mask))
   17173     return DAG.getTargetConstant(0, dl, MaskVT);
   17174 
   17175   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
   17176     // Mask should be extended
   17177     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
   17178                        MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
   17179   }
   17180 
   17181   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
   17182     if (MaskVT == MVT::v64i1) {
   17183       assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
   17184       // In 32-bit mode a bitcast of i64 is illegal; split it into two i32 halves.
   17185       SDValue Lo, Hi;
   17186       Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
   17187                           DAG.getConstant(0, dl, MVT::i32));
   17188       Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
   17189                           DAG.getConstant(1, dl, MVT::i32));
   17190 
   17191       Lo = DAG.getBitcast(MVT::v32i1, Lo);
   17192       Hi = DAG.getBitcast(MVT::v32i1, Hi);
   17193 
   17194       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
   17195     } else {
   17196       // MaskVT requires fewer than 64 bits. Truncate the mask (this should
   17197       // always succeed) and bitcast.
   17198       MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
   17199       return DAG.getBitcast(MaskVT,
   17200                             DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
   17201     }
   17202 
   17203   } else {
   17204     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   17205                                      Mask.getSimpleValueType().getSizeInBits());
   17206     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are extracted
   17207     // by EXTRACT_SUBVECTOR.
   17208     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   17209                        DAG.getBitcast(BitcastVT, Mask),
   17210                        DAG.getIntPtrConstant(0, dl));
   17211   }
   17212 }
   17213 
   17214 /// \brief Return (and \p Op, \p Mask) for compare instructions or
   17215 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
   17216 /// necessary casting or extending for \p Mask when lowering masking intrinsics
   17217 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   17218                   SDValue PreservedSrc,
   17219                   const X86Subtarget &Subtarget,
   17220                   SelectionDAG &DAG) {
   17221   MVT VT = Op.getSimpleValueType();
   17222   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   17223   unsigned OpcodeSelect = ISD::VSELECT;
   17224   SDLoc dl(Op);
   17225 
   17226   if (isAllOnesConstant(Mask))
   17227     return Op;
   17228 
   17229   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   17230 
   17231   switch (Op.getOpcode()) {
   17232   default: break;
   17233   case X86ISD::PCMPEQM:
   17234   case X86ISD::PCMPGTM:
   17235   case X86ISD::CMPM:
   17236   case X86ISD::CMPMU:
   17237     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
   17238   case X86ISD::VFPCLASS:
   17239   case X86ISD::VFPCLASSS:
   17240     return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
   17241   case X86ISD::VTRUNC:
   17242   case X86ISD::VTRUNCS:
   17243   case X86ISD::VTRUNCUS:
   17244   case ISD::FP_TO_FP16:
   17245     // We can't use ISD::VSELECT here because it is not always "Legal"
   17246     // for the destination type. For example, vpmovqb requires only AVX-512,
   17247     // while a vselect that operates on byte elements requires BWI.
   17248     OpcodeSelect = X86ISD::SELECT;
   17249     break;
   17250   }
   17251   if (PreservedSrc.isUndef())
   17252     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   17253   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
   17254 }
   17255 
   17256 /// \brief Creates an SDNode for a predicated scalar operation.
   17257 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
   17258 /// The mask comes in as MVT::i8 and should be truncated
   17259 /// to MVT::i1 while lowering masking intrinsics.
   17260 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
   17261 /// "X86select" instead of "vselect". We just can't create the "vselect" node
   17262 /// for a scalar instruction.
   17263 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
   17264                                     SDValue PreservedSrc,
   17265                                     const X86Subtarget &Subtarget,
   17266                                     SelectionDAG &DAG) {
   17267   if (isAllOnesConstant(Mask))
   17268     return Op;
   17269 
   17270   MVT VT = Op.getSimpleValueType();
   17271   SDLoc dl(Op);
   17272   // The mask should be of type MVT::i1
   17273   SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
   17274 
   17275   if (Op.getOpcode() == X86ISD::FSETCC)
   17276     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
   17277   if (Op.getOpcode() == X86ISD::VFPCLASS ||
   17278       Op.getOpcode() == X86ISD::VFPCLASSS)
   17279     return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
   17280 
   17281   if (PreservedSrc.isUndef())
   17282     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   17283   return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
   17284 }
   17285 
   17286 static int getSEHRegistrationNodeSize(const Function *Fn) {
   17287   if (!Fn->hasPersonalityFn())
   17288     report_fatal_error(
   17289         "querying registration node size for function without personality");
    17290   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH, i.e. 24 and
    17291   // 16 bytes respectively. See WinEHStatePass for the full struct definition.
   17292   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
   17293   case EHPersonality::MSVC_X86SEH: return 24;
   17294   case EHPersonality::MSVC_CXX: return 16;
   17295   default: break;
   17296   }
   17297   report_fatal_error(
   17298       "can only recover FP for 32-bit MSVC EH personality functions");
   17299 }
   17300 
   17301 /// When the MSVC runtime transfers control to us, either to an outlined
   17302 /// function or when returning to a parent frame after catching an exception, we
   17303 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
   17304 /// Here's the math:
   17305 ///   RegNodeBase = EntryEBP - RegNodeSize
   17306 ///   ParentFP = RegNodeBase - ParentFrameOffset
   17307 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
   17308 /// subtracting the offset (negative on x86) takes us back to the parent FP.
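          /// For illustration only (made-up numbers): with EntryEBP = 0x1000 and a C++
          /// EH registration node (RegNodeSize = 16), RegNodeBase = 0x1000 - 16 = 0xFF0;
          /// if ParentFrameOffset = -0x20, then ParentFP = 0xFF0 - (-0x20) = 0x1010.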
   17309 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
   17310                                    SDValue EntryEBP) {
   17311   MachineFunction &MF = DAG.getMachineFunction();
   17312   SDLoc dl;
   17313 
   17314   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   17315   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   17316 
   17317   // It's possible that the parent function no longer has a personality function
   17318   // if the exceptional code was optimized away, in which case we just return
   17319   // the incoming EBP.
   17320   if (!Fn->hasPersonalityFn())
   17321     return EntryEBP;
   17322 
   17323   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
   17324   // registration, or the .set_setframe offset.
   17325   MCSymbol *OffsetSym =
   17326       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
   17327           GlobalValue::getRealLinkageName(Fn->getName()));
   17328   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
   17329   SDValue ParentFrameOffset =
   17330       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
   17331 
   17332   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
   17333   // prologue to RBP in the parent function.
   17334   const X86Subtarget &Subtarget =
   17335       static_cast<const X86Subtarget &>(DAG.getSubtarget());
   17336   if (Subtarget.is64Bit())
   17337     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
   17338 
   17339   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
   17340   // RegNodeBase = EntryEBP - RegNodeSize
   17341   // ParentFP = RegNodeBase - ParentFrameOffset
   17342   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
   17343                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
   17344   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
   17345 }
   17346 
   17347 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   17348                                        SelectionDAG &DAG) {
   17349   SDLoc dl(Op);
   17350   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   17351   MVT VT = Op.getSimpleValueType();
   17352   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
   17353   if (IntrData) {
   17354     switch(IntrData->Type) {
   17355     case INTR_TYPE_1OP:
   17356       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
   17357     case INTR_TYPE_2OP:
   17358       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   17359         Op.getOperand(2));
   17360     case INTR_TYPE_2OP_IMM8:
   17361       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   17362                          DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
   17363     case INTR_TYPE_3OP:
   17364       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   17365         Op.getOperand(2), Op.getOperand(3));
   17366     case INTR_TYPE_4OP:
   17367       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   17368         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
   17369     case INTR_TYPE_1OP_MASK_RM: {
   17370       SDValue Src = Op.getOperand(1);
   17371       SDValue PassThru = Op.getOperand(2);
   17372       SDValue Mask = Op.getOperand(3);
   17373       SDValue RoundingMode;
    17374       // We always add a rounding mode to the node.
    17375       // If the rounding mode is not specified, we add the
    17376       // "current direction" mode.
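                // For example (operand 0 is the intrinsic id; illustrative only):
                //   (id, src, passthru, mask)           -> append CUR_DIRECTION
                //   (id, src, passthru, mask, rounding) -> use operand 4 as given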
   17377       if (Op.getNumOperands() == 4)
   17378         RoundingMode =
   17379           DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   17380       else
   17381         RoundingMode = Op.getOperand(4);
   17382       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   17383       if (IntrWithRoundingModeOpcode != 0)
   17384         if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
   17385             X86::STATIC_ROUNDING::CUR_DIRECTION)
   17386           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   17387                                       dl, Op.getValueType(), Src, RoundingMode),
   17388                                       Mask, PassThru, Subtarget, DAG);
   17389       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
   17390                                               RoundingMode),
   17391                                   Mask, PassThru, Subtarget, DAG);
   17392     }
   17393     case INTR_TYPE_1OP_MASK: {
   17394       SDValue Src = Op.getOperand(1);
   17395       SDValue PassThru = Op.getOperand(2);
   17396       SDValue Mask = Op.getOperand(3);
    17397       // We add the rounding mode to the node when
    17398       //   - the RM opcode is specified, and
    17399       //   - RM is not "current direction".
   17400       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   17401       if (IntrWithRoundingModeOpcode != 0) {
   17402         SDValue Rnd = Op.getOperand(4);
   17403         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
   17404         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
   17405           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   17406                                       dl, Op.getValueType(),
   17407                                       Src, Rnd),
   17408                                       Mask, PassThru, Subtarget, DAG);
   17409         }
   17410       }
   17411       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
   17412                                   Mask, PassThru, Subtarget, DAG);
   17413     }
   17414     case INTR_TYPE_SCALAR_MASK: {
   17415       SDValue Src1 = Op.getOperand(1);
   17416       SDValue Src2 = Op.getOperand(2);
   17417       SDValue passThru = Op.getOperand(3);
   17418       SDValue Mask = Op.getOperand(4);
   17419       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
   17420                                   Mask, passThru, Subtarget, DAG);
   17421     }
   17422     case INTR_TYPE_SCALAR_MASK_RM: {
   17423       SDValue Src1 = Op.getOperand(1);
   17424       SDValue Src2 = Op.getOperand(2);
   17425       SDValue Src0 = Op.getOperand(3);
   17426       SDValue Mask = Op.getOperand(4);
    17427       // There are 2 kinds of intrinsics in this group (operand 0 is the id):
    17428       // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands.
    17429       // (2) With both rounding mode and sae - 7 operands.
   17430       if (Op.getNumOperands() == 6) {
   17431         SDValue Sae  = Op.getOperand(5);
   17432         unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
   17433         return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
   17434                                                 Sae),
   17435                                     Mask, Src0, Subtarget, DAG);
   17436       }
   17437       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
   17438       SDValue RoundingMode  = Op.getOperand(5);
   17439       SDValue Sae  = Op.getOperand(6);
   17440       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
   17441                                               RoundingMode, Sae),
   17442                                   Mask, Src0, Subtarget, DAG);
   17443     }
   17444     case INTR_TYPE_2OP_MASK:
   17445     case INTR_TYPE_2OP_IMM8_MASK: {
   17446       SDValue Src1 = Op.getOperand(1);
   17447       SDValue Src2 = Op.getOperand(2);
   17448       SDValue PassThru = Op.getOperand(3);
   17449       SDValue Mask = Op.getOperand(4);
   17450 
   17451       if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
   17452         Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
   17453 
   17454       // We specify 2 possible opcodes for intrinsics with rounding modes.
    17455       // First, we check whether the intrinsic may have a non-default rounding
    17456       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
   17457       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   17458       if (IntrWithRoundingModeOpcode != 0) {
   17459         SDValue Rnd = Op.getOperand(5);
   17460         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
   17461         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
   17462           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   17463                                       dl, Op.getValueType(),
   17464                                       Src1, Src2, Rnd),
   17465                                       Mask, PassThru, Subtarget, DAG);
   17466         }
   17467       }
   17468       // TODO: Intrinsics should have fast-math-flags to propagate.
   17469       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
   17470                                   Mask, PassThru, Subtarget, DAG);
   17471     }
   17472     case INTR_TYPE_2OP_MASK_RM: {
   17473       SDValue Src1 = Op.getOperand(1);
   17474       SDValue Src2 = Op.getOperand(2);
   17475       SDValue PassThru = Op.getOperand(3);
   17476       SDValue Mask = Op.getOperand(4);
    17477       // We specify 2 possible modes for these intrinsics: with and without a
    17478       // rounding mode.
    17479       // First, we check whether the intrinsic has a rounding mode (6 operands);
    17480       // if not, we set the rounding mode to "current".
   17481       SDValue Rnd;
   17482       if (Op.getNumOperands() == 6)
   17483         Rnd = Op.getOperand(5);
   17484       else
   17485         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   17486       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   17487                                               Src1, Src2, Rnd),
   17488                                   Mask, PassThru, Subtarget, DAG);
   17489     }
   17490     case INTR_TYPE_3OP_SCALAR_MASK_RM: {
   17491       SDValue Src1 = Op.getOperand(1);
   17492       SDValue Src2 = Op.getOperand(2);
   17493       SDValue Src3 = Op.getOperand(3);
   17494       SDValue PassThru = Op.getOperand(4);
   17495       SDValue Mask = Op.getOperand(5);
   17496       SDValue Sae  = Op.getOperand(6);
   17497 
   17498       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
   17499                                               Src2, Src3, Sae),
   17500                                   Mask, PassThru, Subtarget, DAG);
   17501     }
   17502     case INTR_TYPE_3OP_MASK_RM: {
   17503       SDValue Src1 = Op.getOperand(1);
   17504       SDValue Src2 = Op.getOperand(2);
   17505       SDValue Imm = Op.getOperand(3);
   17506       SDValue PassThru = Op.getOperand(4);
   17507       SDValue Mask = Op.getOperand(5);
    17508       // We specify 2 possible modes for these intrinsics: with and without a
    17509       // rounding mode.
    17510       // First, we check whether the intrinsic has a rounding mode (7 operands);
    17511       // if not, we set the rounding mode to "current".
   17512       SDValue Rnd;
   17513       if (Op.getNumOperands() == 7)
   17514         Rnd = Op.getOperand(6);
   17515       else
   17516         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   17517       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   17518         Src1, Src2, Imm, Rnd),
   17519         Mask, PassThru, Subtarget, DAG);
   17520     }
   17521     case INTR_TYPE_3OP_IMM8_MASK:
   17522     case INTR_TYPE_3OP_MASK:
   17523     case INSERT_SUBVEC: {
   17524       SDValue Src1 = Op.getOperand(1);
   17525       SDValue Src2 = Op.getOperand(2);
   17526       SDValue Src3 = Op.getOperand(3);
   17527       SDValue PassThru = Op.getOperand(4);
   17528       SDValue Mask = Op.getOperand(5);
   17529 
   17530       if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
   17531         Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
   17532       else if (IntrData->Type == INSERT_SUBVEC) {
   17533         // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
   17534         assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
   17535         unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
   17536         Imm *= Src2.getSimpleValueType().getVectorNumElements();
   17537         Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
   17538       }
   17539 
   17540       // We specify 2 possible opcodes for intrinsics with rounding modes.
    17541       // First, we check whether the intrinsic may have a non-default rounding
    17542       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
   17543       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   17544       if (IntrWithRoundingModeOpcode != 0) {
   17545         SDValue Rnd = Op.getOperand(6);
   17546         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
   17547         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
   17548           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   17549                                       dl, Op.getValueType(),
   17550                                       Src1, Src2, Src3, Rnd),
   17551                                       Mask, PassThru, Subtarget, DAG);
   17552         }
   17553       }
   17554       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   17555                                               Src1, Src2, Src3),
   17556                                   Mask, PassThru, Subtarget, DAG);
   17557     }
   17558     case VPERM_2OP_MASK : {
   17559       SDValue Src1 = Op.getOperand(1);
   17560       SDValue Src2 = Op.getOperand(2);
   17561       SDValue PassThru = Op.getOperand(3);
   17562       SDValue Mask = Op.getOperand(4);
   17563 
   17564       // Swap Src1 and Src2 in the node creation
   17565       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
   17566                                   Mask, PassThru, Subtarget, DAG);
   17567     }
   17568     case VPERM_3OP_MASKZ:
   17569     case VPERM_3OP_MASK:{
   17570       // Src2 is the PassThru
   17571       SDValue Src1 = Op.getOperand(1);
   17572       SDValue Src2 = Op.getOperand(2);
   17573       SDValue Src3 = Op.getOperand(3);
   17574       SDValue Mask = Op.getOperand(4);
   17575       MVT VT = Op.getSimpleValueType();
   17576       SDValue PassThru = SDValue();
   17577 
    17578       // Set the PassThru element.
   17579       if (IntrData->Type == VPERM_3OP_MASKZ)
   17580         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
   17581       else
   17582         PassThru = DAG.getBitcast(VT, Src2);
   17583 
   17584       // Swap Src1 and Src2 in the node creation
   17585       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
   17586                                               dl, Op.getValueType(),
   17587                                               Src2, Src1, Src3),
   17588                                   Mask, PassThru, Subtarget, DAG);
   17589     }
   17590     case FMA_OP_MASK3:
   17591     case FMA_OP_MASKZ:
   17592     case FMA_OP_MASK: {
   17593       SDValue Src1 = Op.getOperand(1);
   17594       SDValue Src2 = Op.getOperand(2);
   17595       SDValue Src3 = Op.getOperand(3);
   17596       SDValue Mask = Op.getOperand(4);
   17597       MVT VT = Op.getSimpleValueType();
   17598       SDValue PassThru = SDValue();
   17599 
    17600       // Set the PassThru element.
   17601       if (IntrData->Type == FMA_OP_MASKZ)
   17602         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
   17603       else if (IntrData->Type == FMA_OP_MASK3)
   17604         PassThru = Src3;
   17605       else
   17606         PassThru = Src1;
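                // Roughly speaking (illustrative, following the AVX-512 masked FMA
                // intrinsic semantics): the maskz forms zero the masked-off lanes,
                // the mask3 forms preserve the third source (the accumulator of the
                // "231"-style instruction form), and the plain mask forms preserve Src1.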
   17607 
   17608       // We specify 2 possible opcodes for intrinsics with rounding modes.
    17609       // First, we check whether the intrinsic may have a non-default rounding
    17610       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
   17611       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   17612       if (IntrWithRoundingModeOpcode != 0) {
   17613         SDValue Rnd = Op.getOperand(5);
   17614         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
   17615             X86::STATIC_ROUNDING::CUR_DIRECTION)
   17616           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   17617                                                   dl, Op.getValueType(),
   17618                                                   Src1, Src2, Src3, Rnd),
   17619                                       Mask, PassThru, Subtarget, DAG);
   17620       }
   17621       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
   17622                                               dl, Op.getValueType(),
   17623                                               Src1, Src2, Src3),
   17624                                   Mask, PassThru, Subtarget, DAG);
   17625     }
   17626     case FMA_OP_SCALAR_MASK:
   17627     case FMA_OP_SCALAR_MASK3:
   17628     case FMA_OP_SCALAR_MASKZ: {
   17629       SDValue Src1 = Op.getOperand(1);
   17630       SDValue Src2 = Op.getOperand(2);
   17631       SDValue Src3 = Op.getOperand(3);
   17632       SDValue Mask = Op.getOperand(4);
   17633       MVT VT = Op.getSimpleValueType();
   17634       SDValue PassThru = SDValue();
   17635 
    17636       // Set the PassThru element.
   17637       if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
   17638         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
   17639       else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
   17640         PassThru = Src3;
   17641       else
   17642         PassThru = Src1;
   17643 
   17644       SDValue Rnd = Op.getOperand(5);
   17645       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
   17646                                               Op.getValueType(), Src1, Src2,
   17647                                               Src3, Rnd),
   17648                                   Mask, PassThru, Subtarget, DAG);
   17649     }
   17650     case TERLOG_OP_MASK:
   17651     case TERLOG_OP_MASKZ: {
   17652       SDValue Src1 = Op.getOperand(1);
   17653       SDValue Src2 = Op.getOperand(2);
   17654       SDValue Src3 = Op.getOperand(3);
   17655       SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
   17656       SDValue Mask = Op.getOperand(5);
   17657       MVT VT = Op.getSimpleValueType();
   17658       SDValue PassThru = Src1;
   17659       // Set PassThru element.
   17660       if (IntrData->Type == TERLOG_OP_MASKZ)
   17661         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
   17662 
   17663       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   17664                                               Src1, Src2, Src3, Src4),
   17665                                   Mask, PassThru, Subtarget, DAG);
   17666     }
   17667     case FPCLASS: {
   17668       // FPclass intrinsics with mask
   17669        SDValue Src1 = Op.getOperand(1);
    17670       SDValue Src1 = Op.getOperand(1);
    17671       MVT VT = Src1.getSimpleValueType();
    17672       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    17673       SDValue Imm = Op.getOperand(2);
    17674       SDValue Mask = Op.getOperand(3);
    17675       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
    17676                                        Mask.getSimpleValueType().getSizeInBits());
    17677       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
    17678       SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
    17679                                         DAG.getTargetConstant(0, dl, MaskVT),
    17680                                         Subtarget, DAG);
    17681       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
    17682                                 DAG.getUNDEF(BitcastVT), FPclassMask,
    17683                                 DAG.getIntPtrConstant(0, dl));
    17684       return DAG.getBitcast(Op.getValueType(), Res);
   17685     case FPCLASSS: {
   17686       SDValue Src1 = Op.getOperand(1);
   17687       SDValue Imm = Op.getOperand(2);
   17688       SDValue Mask = Op.getOperand(3);
   17689       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
   17690       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
   17691         DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
   17692       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
   17693     }
   17694     case CMP_MASK:
   17695     case CMP_MASK_CC: {
   17696       // Comparison intrinsics with masks.
   17697       // Example of transformation:
   17698       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
   17699       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
   17700       // (i8 (bitcast
   17701       //   (v8i1 (insert_subvector undef,
   17702       //           (v2i1 (and (PCMPEQM %a, %b),
   17703       //                      (extract_subvector
   17704       //                         (v8i1 (bitcast %mask)), 0))), 0))))
   17705       MVT VT = Op.getOperand(1).getSimpleValueType();
   17706       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   17707       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
   17708       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   17709                                        Mask.getSimpleValueType().getSizeInBits());
   17710       SDValue Cmp;
   17711       if (IntrData->Type == CMP_MASK_CC) {
   17712         SDValue CC = Op.getOperand(3);
   17713         CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
   17714         // We specify 2 possible opcodes for intrinsics with rounding modes.
    17715         // First, we check whether the intrinsic may have a non-default rounding
    17716         // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
   17717         if (IntrData->Opc1 != 0) {
   17718           SDValue Rnd = Op.getOperand(5);
   17719           if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
   17720               X86::STATIC_ROUNDING::CUR_DIRECTION)
   17721             Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
   17722                               Op.getOperand(2), CC, Rnd);
   17723         }
    17724         // Default rounding mode.
    17725         if (!Cmp.getNode())
    17726           Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
    17727                             Op.getOperand(2), CC);
   17728 
   17729       } else {
   17730         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
   17731         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
   17732                           Op.getOperand(2));
   17733       }
   17734       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
   17735                                              DAG.getTargetConstant(0, dl,
   17736                                                                    MaskVT),
   17737                                              Subtarget, DAG);
   17738       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
   17739                                 DAG.getUNDEF(BitcastVT), CmpMask,
   17740                                 DAG.getIntPtrConstant(0, dl));
   17741       return DAG.getBitcast(Op.getValueType(), Res);
   17742     }
   17743     case CMP_MASK_SCALAR_CC: {
   17744       SDValue Src1 = Op.getOperand(1);
   17745       SDValue Src2 = Op.getOperand(2);
   17746       SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
   17747       SDValue Mask = Op.getOperand(4);
   17748 
   17749       SDValue Cmp;
   17750       if (IntrData->Opc1 != 0) {
   17751         SDValue Rnd = Op.getOperand(5);
   17752         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
   17753             X86::STATIC_ROUNDING::CUR_DIRECTION)
   17754           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
   17755       }
    17756       // Default rounding mode.
    17757       if (!Cmp.getNode())
   17758         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
   17759 
   17760       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
   17761                                              DAG.getTargetConstant(0, dl,
   17762                                                                    MVT::i1),
   17763                                              Subtarget, DAG);
   17764 
   17765       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
   17766     }
   17767     case COMI: { // Comparison intrinsics
   17768       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
   17769       SDValue LHS = Op.getOperand(1);
   17770       SDValue RHS = Op.getOperand(2);
   17771       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
   17772       SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
   17773       SDValue SetCC;
   17774       switch (CC) {
   17775       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
   17776         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17777                             DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
   17778         SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17779                                     DAG.getConstant(X86::COND_NP, dl, MVT::i8),
   17780                                     Comi);
   17781         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
   17782         break;
   17783       }
   17784       case ISD::SETNE: { // (ZF = 1 or PF = 1)
   17785         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17786                             DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
   17787         SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17788                                    DAG.getConstant(X86::COND_P, dl, MVT::i8),
   17789                                    Comi);
   17790         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
   17791         break;
   17792       }
   17793       case ISD::SETGT: // (CF = 0 and ZF = 0)
   17794         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17795                             DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
   17796         break;
   17797       case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
   17798         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17799                             DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
   17800         break;
   17801       }
   17802       case ISD::SETGE: // CF = 0
   17803         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17804                             DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
   17805         break;
   17806       case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
   17807         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17808                             DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
   17809         break;
   17810       default:
   17811         llvm_unreachable("Unexpected illegal condition!");
   17812       }
   17813       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   17814     }
   17815     case COMI_RM: { // Comparison intrinsics with Sae
   17816       SDValue LHS = Op.getOperand(1);
   17817       SDValue RHS = Op.getOperand(2);
   17818       unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
   17819       SDValue Sae = Op.getOperand(4);
   17820 
   17821       SDValue FCmp;
   17822       if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
   17823           X86::STATIC_ROUNDING::CUR_DIRECTION)
   17824         FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
   17825                                   DAG.getConstant(CondVal, dl, MVT::i8));
   17826       else
   17827         FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
   17828                                   DAG.getConstant(CondVal, dl, MVT::i8), Sae);
   17829       // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
   17830       return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
   17831     }
   17832     case VSHIFT:
   17833       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
   17834                                  Op.getOperand(1), Op.getOperand(2), DAG);
   17835     case COMPRESS_EXPAND_IN_REG: {
   17836       SDValue Mask = Op.getOperand(3);
   17837       SDValue DataToCompress = Op.getOperand(1);
   17838       SDValue PassThru = Op.getOperand(2);
   17839       if (isAllOnesConstant(Mask)) // return data as is
   17840         return Op.getOperand(1);
   17841 
   17842       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   17843                                               DataToCompress),
   17844                                   Mask, PassThru, Subtarget, DAG);
   17845     }
   17846     case BROADCASTM: {
   17847       SDValue Mask = Op.getOperand(1);
   17848       MVT MaskVT = MVT::getVectorVT(MVT::i1,
   17849                                     Mask.getSimpleValueType().getSizeInBits());
   17850       Mask = DAG.getBitcast(MaskVT, Mask);
   17851       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
   17852     }
   17853     case KUNPCK: {
   17854       MVT VT = Op.getSimpleValueType();
   17855       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
   17856 
   17857       SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
   17858       SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
   17859       // Arguments should be swapped.
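                // For illustration (assuming the intrinsic follows the Intel kunpck
                // semantics): kunpckbw(a, b) produces a 16-bit mask whose low 8 bits
                // come from b and whose high 8 bits come from a, so the concat-style
                // node is built as (Opc0 Src2, Src1).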
   17860       SDValue Res = DAG.getNode(IntrData->Opc0, dl,
   17861                                 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
   17862                                 Src2, Src1);
   17863       return DAG.getBitcast(VT, Res);
   17864     }
   17865     case FIXUPIMMS:
   17866     case FIXUPIMMS_MASKZ:
   17867     case FIXUPIMM:
   17868     case FIXUPIMM_MASKZ:{
   17869       SDValue Src1 = Op.getOperand(1);
   17870       SDValue Src2 = Op.getOperand(2);
   17871       SDValue Src3 = Op.getOperand(3);
   17872       SDValue Imm = Op.getOperand(4);
   17873       SDValue Mask = Op.getOperand(5);
   17874       SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
   17875                                          Src1 : getZeroVector(VT, Subtarget, DAG, dl);
    17876       // We specify 2 possible modes for these intrinsics: with and without a
    17877       // rounding mode.
    17878       // First, we check whether the intrinsic has a rounding mode (7 operands);
    17879       // if not, we set the rounding mode to "current".
   17880       SDValue Rnd;
   17881       if (Op.getNumOperands() == 7)
   17882         Rnd = Op.getOperand(6);
   17883       else
   17884         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   17885       if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
   17886         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   17887                                                 Src1, Src2, Src3, Imm, Rnd),
   17888                                     Mask, Passthru, Subtarget, DAG);
   17889       else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
   17890         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   17891                                        Src1, Src2, Src3, Imm, Rnd),
   17892                                     Mask, Passthru, Subtarget, DAG);
   17893     }
   17894     case CONVERT_TO_MASK: {
   17895       MVT SrcVT = Op.getOperand(1).getSimpleValueType();
   17896       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
   17897       MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
   17898 
   17899       SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
   17900                                     Op.getOperand(1));
   17901       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
   17902                                 DAG.getUNDEF(BitcastVT), CvtMask,
   17903                                 DAG.getIntPtrConstant(0, dl));
   17904       return DAG.getBitcast(Op.getValueType(), Res);
   17905     }
   17906     case CONVERT_MASK_TO_VEC: {
   17907       SDValue Mask = Op.getOperand(1);
   17908       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   17909       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   17910       return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
   17911     }
   17912     case BRCST_SUBVEC_TO_VEC: {
   17913       SDValue Src = Op.getOperand(1);
   17914       SDValue Passthru = Op.getOperand(2);
   17915       SDValue Mask = Op.getOperand(3);
   17916       EVT resVT = Passthru.getValueType();
   17917       SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
   17918                                        DAG.getUNDEF(resVT), Src,
   17919                                        DAG.getIntPtrConstant(0, dl));
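                // Illustrative reading of the immediate below (assuming Opc0 is a
                // 128-bit-lane shuffle such as SHUF{F,I}64x2): 0x44 == 0b01'00'01'00
                // selects lanes {0,1,0,1}, i.e. it repeats the inserted 256-bit
                // subvector across the 512-bit result, while 0 repeats lane 0 for
                // the 128-bit source case.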
   17920       SDValue immVal;
   17921       if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
   17922         immVal = DAG.getConstant(0x44, dl, MVT::i8);
   17923       else
   17924         immVal = DAG.getConstant(0, dl, MVT::i8);
   17925       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   17926                                               subVec, subVec, immVal),
   17927                                   Mask, Passthru, Subtarget, DAG);
   17928     }
   17929     case BRCST32x2_TO_VEC: {
   17930       SDValue Src = Op.getOperand(1);
   17931       SDValue PassThru = Op.getOperand(2);
   17932       SDValue Mask = Op.getOperand(3);
   17933 
   17934       assert((VT.getScalarType() == MVT::i32 ||
   17935               VT.getScalarType() == MVT::f32) && "Unexpected type!");
    17936       // Bitcast Src to a vector of packed 64-bit elements.
   17937       MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
   17938       MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
   17939       Src = DAG.getBitcast(BitcastVT, Src);
   17940 
   17941       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
   17942                                   Mask, PassThru, Subtarget, DAG);
   17943     }
   17944     default:
   17945       break;
   17946     }
   17947   }
   17948 
   17949   switch (IntNo) {
   17950   default: return SDValue();    // Don't custom lower most intrinsics.
   17951 
   17952   case Intrinsic::x86_avx2_permd:
   17953   case Intrinsic::x86_avx2_permps:
   17954     // Operands intentionally swapped. Mask is last operand to intrinsic,
   17955     // but second operand for node/instruction.
   17956     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
   17957                        Op.getOperand(2), Op.getOperand(1));
   17958 
    17959   // ptest and testp intrinsics. The intrinsics these come from are designed to
    17960   // return an integer value, not just an instruction, so lower them to the
    17961   // ptest or testp pattern and a setcc for the result.
   17962   case Intrinsic::x86_sse41_ptestz:
   17963   case Intrinsic::x86_sse41_ptestc:
   17964   case Intrinsic::x86_sse41_ptestnzc:
   17965   case Intrinsic::x86_avx_ptestz_256:
   17966   case Intrinsic::x86_avx_ptestc_256:
   17967   case Intrinsic::x86_avx_ptestnzc_256:
   17968   case Intrinsic::x86_avx_vtestz_ps:
   17969   case Intrinsic::x86_avx_vtestc_ps:
   17970   case Intrinsic::x86_avx_vtestnzc_ps:
   17971   case Intrinsic::x86_avx_vtestz_pd:
   17972   case Intrinsic::x86_avx_vtestc_pd:
   17973   case Intrinsic::x86_avx_vtestnzc_pd:
   17974   case Intrinsic::x86_avx_vtestz_ps_256:
   17975   case Intrinsic::x86_avx_vtestc_ps_256:
   17976   case Intrinsic::x86_avx_vtestnzc_ps_256:
   17977   case Intrinsic::x86_avx_vtestz_pd_256:
   17978   case Intrinsic::x86_avx_vtestc_pd_256:
   17979   case Intrinsic::x86_avx_vtestnzc_pd_256: {
   17980     bool IsTestPacked = false;
   17981     unsigned X86CC;
   17982     switch (IntNo) {
   17983     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
   17984     case Intrinsic::x86_avx_vtestz_ps:
   17985     case Intrinsic::x86_avx_vtestz_pd:
   17986     case Intrinsic::x86_avx_vtestz_ps_256:
   17987     case Intrinsic::x86_avx_vtestz_pd_256:
   17988       IsTestPacked = true; // Fallthrough
   17989     case Intrinsic::x86_sse41_ptestz:
   17990     case Intrinsic::x86_avx_ptestz_256:
   17991       // ZF = 1
   17992       X86CC = X86::COND_E;
   17993       break;
   17994     case Intrinsic::x86_avx_vtestc_ps:
   17995     case Intrinsic::x86_avx_vtestc_pd:
   17996     case Intrinsic::x86_avx_vtestc_ps_256:
   17997     case Intrinsic::x86_avx_vtestc_pd_256:
   17998       IsTestPacked = true; // Fallthrough
   17999     case Intrinsic::x86_sse41_ptestc:
   18000     case Intrinsic::x86_avx_ptestc_256:
   18001       // CF = 1
   18002       X86CC = X86::COND_B;
   18003       break;
   18004     case Intrinsic::x86_avx_vtestnzc_ps:
   18005     case Intrinsic::x86_avx_vtestnzc_pd:
   18006     case Intrinsic::x86_avx_vtestnzc_ps_256:
   18007     case Intrinsic::x86_avx_vtestnzc_pd_256:
   18008       IsTestPacked = true; // Fallthrough
   18009     case Intrinsic::x86_sse41_ptestnzc:
   18010     case Intrinsic::x86_avx_ptestnzc_256:
   18011       // ZF and CF = 0
   18012       X86CC = X86::COND_A;
   18013       break;
   18014     }
   18015 
   18016     SDValue LHS = Op.getOperand(1);
   18017     SDValue RHS = Op.getOperand(2);
   18018     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
   18019     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
   18020     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
   18021     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
   18022     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   18023   }
   18024   case Intrinsic::x86_avx512_kortestz_w:
   18025   case Intrinsic::x86_avx512_kortestc_w: {
    18026     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
   18027     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
   18028     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
   18029     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
   18030     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
   18031     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
   18032     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   18033   }
   18034 
   18035   case Intrinsic::x86_sse42_pcmpistria128:
   18036   case Intrinsic::x86_sse42_pcmpestria128:
   18037   case Intrinsic::x86_sse42_pcmpistric128:
   18038   case Intrinsic::x86_sse42_pcmpestric128:
   18039   case Intrinsic::x86_sse42_pcmpistrio128:
   18040   case Intrinsic::x86_sse42_pcmpestrio128:
   18041   case Intrinsic::x86_sse42_pcmpistris128:
   18042   case Intrinsic::x86_sse42_pcmpestris128:
   18043   case Intrinsic::x86_sse42_pcmpistriz128:
   18044   case Intrinsic::x86_sse42_pcmpestriz128: {
   18045     unsigned Opcode;
   18046     unsigned X86CC;
   18047     switch (IntNo) {
   18048     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   18049     case Intrinsic::x86_sse42_pcmpistria128:
   18050       Opcode = X86ISD::PCMPISTRI;
   18051       X86CC = X86::COND_A;
   18052       break;
   18053     case Intrinsic::x86_sse42_pcmpestria128:
   18054       Opcode = X86ISD::PCMPESTRI;
   18055       X86CC = X86::COND_A;
   18056       break;
   18057     case Intrinsic::x86_sse42_pcmpistric128:
   18058       Opcode = X86ISD::PCMPISTRI;
   18059       X86CC = X86::COND_B;
   18060       break;
   18061     case Intrinsic::x86_sse42_pcmpestric128:
   18062       Opcode = X86ISD::PCMPESTRI;
   18063       X86CC = X86::COND_B;
   18064       break;
   18065     case Intrinsic::x86_sse42_pcmpistrio128:
   18066       Opcode = X86ISD::PCMPISTRI;
   18067       X86CC = X86::COND_O;
   18068       break;
   18069     case Intrinsic::x86_sse42_pcmpestrio128:
   18070       Opcode = X86ISD::PCMPESTRI;
   18071       X86CC = X86::COND_O;
   18072       break;
   18073     case Intrinsic::x86_sse42_pcmpistris128:
   18074       Opcode = X86ISD::PCMPISTRI;
   18075       X86CC = X86::COND_S;
   18076       break;
   18077     case Intrinsic::x86_sse42_pcmpestris128:
   18078       Opcode = X86ISD::PCMPESTRI;
   18079       X86CC = X86::COND_S;
   18080       break;
   18081     case Intrinsic::x86_sse42_pcmpistriz128:
   18082       Opcode = X86ISD::PCMPISTRI;
   18083       X86CC = X86::COND_E;
   18084       break;
   18085     case Intrinsic::x86_sse42_pcmpestriz128:
   18086       Opcode = X86ISD::PCMPESTRI;
   18087       X86CC = X86::COND_E;
   18088       break;
   18089     }
   18090     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   18091     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   18092     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
   18093     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   18094                                 DAG.getConstant(X86CC, dl, MVT::i8),
   18095                                 SDValue(PCMP.getNode(), 1));
   18096     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   18097   }
   18098 
   18099   case Intrinsic::x86_sse42_pcmpistri128:
   18100   case Intrinsic::x86_sse42_pcmpestri128: {
   18101     unsigned Opcode;
   18102     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
   18103       Opcode = X86ISD::PCMPISTRI;
   18104     else
   18105       Opcode = X86ISD::PCMPESTRI;
   18106 
   18107     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   18108     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   18109     return DAG.getNode(Opcode, dl, VTs, NewOps);
   18110   }
   18111 
   18112   case Intrinsic::eh_sjlj_lsda: {
   18113     MachineFunction &MF = DAG.getMachineFunction();
   18114     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   18115     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   18116     auto &Context = MF.getMMI().getContext();
   18117     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
   18118                                             Twine(MF.getFunctionNumber()));
   18119     return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
   18120   }
   18121 
   18122   case Intrinsic::x86_seh_lsda: {
   18123     // Compute the symbol for the LSDA. We know it'll get emitted later.
   18124     MachineFunction &MF = DAG.getMachineFunction();
   18125     SDValue Op1 = Op.getOperand(1);
   18126     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
   18127     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
   18128         GlobalValue::getRealLinkageName(Fn->getName()));
   18129 
   18130     // Generate a simple absolute symbol reference. This intrinsic is only
   18131     // supported on 32-bit Windows, which isn't PIC.
   18132     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
   18133     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
   18134   }
   18135 
   18136   case Intrinsic::x86_seh_recoverfp: {
   18137     SDValue FnOp = Op.getOperand(1);
   18138     SDValue IncomingFPOp = Op.getOperand(2);
   18139     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
   18140     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
   18141     if (!Fn)
   18142       report_fatal_error(
   18143           "llvm.x86.seh.recoverfp must take a function as the first argument");
   18144     return recoverFramePointer(DAG, Fn, IncomingFPOp);
   18145   }
   18146 
   18147   case Intrinsic::localaddress: {
   18148     // Returns one of the stack, base, or frame pointer registers, depending on
   18149     // which is used to reference local variables.
   18150     MachineFunction &MF = DAG.getMachineFunction();
   18151     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   18152     unsigned Reg;
   18153     if (RegInfo->hasBasePointer(MF))
   18154       Reg = RegInfo->getBaseRegister();
   18155     else // This function handles the SP or FP case.
   18156       Reg = RegInfo->getPtrSizedFrameRegister(MF);
   18157     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
   18158   }
   18159   }
   18160 }
   18161 
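          /// Lower a masked gather intrinsic to a target machine node (a descriptive
          /// summary of the helper below, not a specification): the memory operands
          /// use the usual x86 {Base, Scale, Index, Disp, Segment} form, an undef
          /// pass-through source is replaced by a zero vector, and both the gathered
          /// value and the output chain are returned via getMergeValues.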
   18162 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   18163                               SDValue Src, SDValue Mask, SDValue Base,
   18164                               SDValue Index, SDValue ScaleOp, SDValue Chain,
   18165                               const X86Subtarget &Subtarget) {
   18166   SDLoc dl(Op);
   18167   auto *C = cast<ConstantSDNode>(ScaleOp);
   18168   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   18169   MVT MaskVT = MVT::getVectorVT(MVT::i1,
   18170                              Index.getSimpleValueType().getVectorNumElements());
   18171 
   18172   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   18173   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   18174   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   18175   SDValue Segment = DAG.getRegister(0, MVT::i32);
   18176   if (Src.isUndef())
   18177     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
   18178   SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
   18179   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   18180   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
   18181   return DAG.getMergeValues(RetOps, dl);
   18182 }
   18183 
   18184 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   18185                                SDValue Src, SDValue Mask, SDValue Base,
   18186                                SDValue Index, SDValue ScaleOp, SDValue Chain,
   18187                                const X86Subtarget &Subtarget) {
   18188   SDLoc dl(Op);
   18189   auto *C = cast<ConstantSDNode>(ScaleOp);
   18190   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   18191   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   18192   SDValue Segment = DAG.getRegister(0, MVT::i32);
   18193   MVT MaskVT = MVT::getVectorVT(MVT::i1,
   18194                              Index.getSimpleValueType().getVectorNumElements());
   18195 
   18196   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   18197   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   18198   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
   18199   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   18200   return SDValue(Res, 1);
   18201 }
   18202 
   18203 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   18204                                SDValue Mask, SDValue Base, SDValue Index,
   18205                                SDValue ScaleOp, SDValue Chain,
   18206                                const X86Subtarget &Subtarget) {
   18207   SDLoc dl(Op);
   18208   auto *C = cast<ConstantSDNode>(ScaleOp);
   18209   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   18210   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   18211   SDValue Segment = DAG.getRegister(0, MVT::i32);
   18212   MVT MaskVT =
   18213     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
   18214   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   18216   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
   18217   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
   18218   return SDValue(Res, 0);
   18219 }
   18220 
   18221 /// Handles the lowering of builtin intrinsics that read performance monitor
   18222 /// counters (x86_rdpmc).
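          /// The counter value is returned in EDX:EAX; the 64-bit result is rebuilt
          /// as, roughly, ((uint64_t)EDX << 32) | EAX (via SHL/OR on 64-bit targets
          /// or a BUILD_PAIR on 32-bit targets).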
   18223 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
   18224                                       SelectionDAG &DAG,
   18225                                       const X86Subtarget &Subtarget,
   18226                                       SmallVectorImpl<SDValue> &Results) {
   18227   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   18228   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   18229   SDValue LO, HI;
   18230 
   18231   // The ECX register is used to select the index of the performance counter
   18232   // to read.
   18233   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
   18234                                    N->getOperand(2));
   18235   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
   18236 
   18237   // Reads the content of a 64-bit performance counter and returns it in the
   18238   // registers EDX:EAX.
   18239   if (Subtarget.is64Bit()) {
   18240     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   18241     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   18242                             LO.getValue(2));
   18243   } else {
   18244     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   18245     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   18246                             LO.getValue(2));
   18247   }
   18248   Chain = HI.getValue(1);
   18249 
   18250   if (Subtarget.is64Bit()) {
   18251     // The EAX register is loaded with the low-order 32 bits. The EDX register
   18252     // is loaded with the supported high-order bits of the counter.
   18253     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   18254                               DAG.getConstant(32, DL, MVT::i8));
   18255     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   18256     Results.push_back(Chain);
   18257     return;
   18258   }
   18259 
   18260   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   18261   SDValue Ops[] = { LO, HI };
   18262   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   18263   Results.push_back(Pair);
   18264   Results.push_back(Chain);
   18265 }
   18266 
   18267 /// Handles the lowering of builtin intrinsics that read the time stamp counter
   18268 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
   18269 /// READCYCLECOUNTER nodes.
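          /// For rdtscp, the IA32_TSC_AUX value that the instruction leaves in ECX is
          /// additionally stored to the pointer operand of the intrinsic (see the
          /// RDTSCP_DAG handling below).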
   18270 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
   18271                                     SelectionDAG &DAG,
   18272                                     const X86Subtarget &Subtarget,
   18273                                     SmallVectorImpl<SDValue> &Results) {
   18274   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   18275   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
   18276   SDValue LO, HI;
   18277 
   18278   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
   18279   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
   18280   // and the EAX register is loaded with the low-order 32 bits.
   18281   if (Subtarget.is64Bit()) {
   18282     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   18283     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   18284                             LO.getValue(2));
   18285   } else {
   18286     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   18287     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   18288                             LO.getValue(2));
   18289   }
   18290   SDValue Chain = HI.getValue(1);
   18291 
   18292   if (Opcode == X86ISD::RDTSCP_DAG) {
   18293     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   18294 
   18295     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
   18296     // the ECX register. Add 'ecx' explicitly to the chain.
   18297     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
   18298                                      HI.getValue(2));
   18299     // Explicitly store the content of ECX at the location passed in input
   18300     // to the 'rdtscp' intrinsic.
   18301     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
   18302                          MachinePointerInfo(), false, false, 0);
   18303   }
   18304 
   18305   if (Subtarget.is64Bit()) {
   18306     // The EDX register is loaded with the high-order 32 bits of the MSR, and
   18307     // the EAX register is loaded with the low-order 32 bits.
   18308     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   18309                               DAG.getConstant(32, DL, MVT::i8));
   18310     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   18311     Results.push_back(Chain);
   18312     return;
   18313   }
   18314 
   18315   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   18316   SDValue Ops[] = { LO, HI };
   18317   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   18318   Results.push_back(Pair);
   18319   Results.push_back(Chain);
   18320 }
   18321 
   18322 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
   18323                                      SelectionDAG &DAG) {
   18324   SmallVector<SDValue, 2> Results;
   18325   SDLoc DL(Op);
   18326   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
   18327                           Results);
   18328   return DAG.getMergeValues(Results, DL);
   18329 }
   18330 
   18331 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
   18332   MachineFunction &MF = DAG.getMachineFunction();
   18333   SDValue Chain = Op.getOperand(0);
   18334   SDValue RegNode = Op.getOperand(2);
   18335   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
   18336   if (!EHInfo)
   18337     report_fatal_error("EH registrations only live in functions using WinEH");
   18338 
   18339   // Cast the operand to an alloca, and remember the frame index.
   18340   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
   18341   if (!FINode)
   18342     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
   18343   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
   18344 
   18345   // Return the chain operand without making any DAG nodes.
   18346   return Chain;
   18347 }
   18348 
   18349 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
   18350   MachineFunction &MF = DAG.getMachineFunction();
   18351   SDValue Chain = Op.getOperand(0);
   18352   SDValue EHGuard = Op.getOperand(2);
   18353   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
   18354   if (!EHInfo)
   18355     report_fatal_error("EHGuard only live in functions using WinEH");
   18356 
   18357   // Cast the operand to an alloca, and remember the frame index.
   18358   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
   18359   if (!FINode)
   18360     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
   18361   EHInfo->EHGuardFrameIndex = FINode->getIndex();
   18362 
   18363   // Return the chain operand without making any DAG nodes.
   18364   return Chain;
   18365 }
   18366 
   18367 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   18368                                       SelectionDAG &DAG) {
   18369   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   18370 
   18371   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
   18372   if (!IntrData) {
   18373     if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
   18374       return MarkEHRegistrationNode(Op, DAG);
   18375     if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
   18376       return MarkEHGuard(Op, DAG);
   18377     if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
   18378         IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
   18379         IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
   18380         IntNo == llvm::Intrinsic::x86_flags_write_u64) {
   18381       // We need a frame pointer because this will get lowered to a PUSH/POP
   18382       // sequence.
   18383       MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   18384       MFI->setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here; these intrinsics are expanded later, during
      // ExpandISelPseudos in EmitInstrWithCustomInserter.
   18387       return SDValue();
   18388     }
   18389     return SDValue();
   18390   }
   18391 
   18392   SDLoc dl(Op);
   18393   switch(IntrData->Type) {
   18394   default: llvm_unreachable("Unknown Intrinsic Type");
   18395   case RDSEED:
   18396   case RDRAND: {
   18397     // Emit the node with the right value type.
   18398     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
   18399     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
   18400 
    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0 in that case,
    // cast to i32.
   18403     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
   18404                       DAG.getConstant(1, dl, Op->getValueType(1)),
   18405                       DAG.getConstant(X86::COND_B, dl, MVT::i32),
   18406                       SDValue(Result.getNode(), 1) };
   18407     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
   18408                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
   18409                                   Ops);
   18410 
   18411     // Return { result, isValid, chain }.
   18412     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
   18413                        SDValue(Result.getNode(), 2));
   18414   }
   18415   case GATHER: {
    // gather(v1, mask, index, base, scale);
   18417     SDValue Chain = Op.getOperand(0);
   18418     SDValue Src   = Op.getOperand(2);
   18419     SDValue Base  = Op.getOperand(3);
   18420     SDValue Index = Op.getOperand(4);
   18421     SDValue Mask  = Op.getOperand(5);
   18422     SDValue Scale = Op.getOperand(6);
   18423     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
   18424                          Chain, Subtarget);
   18425   }
   18426   case SCATTER: {
    // scatter(base, mask, index, v1, scale);
   18428     SDValue Chain = Op.getOperand(0);
   18429     SDValue Base  = Op.getOperand(2);
   18430     SDValue Mask  = Op.getOperand(3);
   18431     SDValue Index = Op.getOperand(4);
   18432     SDValue Src   = Op.getOperand(5);
   18433     SDValue Scale = Op.getOperand(6);
   18434     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
   18435                           Scale, Chain, Subtarget);
   18436   }
   18437   case PREFETCH: {
   18438     SDValue Hint = Op.getOperand(6);
   18439     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
   18440     assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
   18441     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
   18442     SDValue Chain = Op.getOperand(0);
   18443     SDValue Mask  = Op.getOperand(2);
   18444     SDValue Index = Op.getOperand(3);
   18445     SDValue Base  = Op.getOperand(4);
   18446     SDValue Scale = Op.getOperand(5);
   18447     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
   18448                            Subtarget);
   18449   }
   18450   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
   18451   case RDTSC: {
   18452     SmallVector<SDValue, 2> Results;
   18453     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
   18454                             Results);
   18455     return DAG.getMergeValues(Results, dl);
   18456   }
   18457   // Read Performance Monitoring Counters.
   18458   case RDPMC: {
   18459     SmallVector<SDValue, 2> Results;
   18460     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
   18461     return DAG.getMergeValues(Results, dl);
   18462   }
   18463   // XTEST intrinsics.
   18464   case XTEST: {
   18465     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
   18466     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
   18467     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   18468                                 DAG.getConstant(X86::COND_NE, dl, MVT::i8),
   18469                                 InTrans);
   18470     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
   18471     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
   18472                        Ret, SDValue(InTrans.getNode(), 1));
   18473   }
   18474   // ADC/ADCX/SBB
   18475   case ADX: {
   18476     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
   18477     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
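    // Re-materialize the carry flag from the intrinsic's i8 carry-in operand:
    // adding 0xFF to it produces a hardware carry (CF = 1) exactly when the
    // operand is nonzero, which the ADC/SBB node below then consumes.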
   18478     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
   18479                                 DAG.getConstant(-1, dl, MVT::i8));
   18480     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
   18481                               Op.getOperand(4), GenCF.getValue(1));
   18482     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
   18483                                  Op.getOperand(5), MachinePointerInfo(),
   18484                                  false, false, 0);
   18485     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   18486                                 DAG.getConstant(X86::COND_B, dl, MVT::i8),
   18487                                 Res.getValue(1));
   18488     SDValue Results[] = { SetCC, Store };
   18489     return DAG.getMergeValues(Results, dl);
   18490   }
   18491   case COMPRESS_TO_MEM: {
   18492     SDValue Mask = Op.getOperand(4);
   18493     SDValue DataToCompress = Op.getOperand(3);
   18494     SDValue Addr = Op.getOperand(2);
   18495     SDValue Chain = Op.getOperand(0);
   18496     MVT VT = DataToCompress.getSimpleValueType();
   18497 
   18498     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
   18499     assert(MemIntr && "Expected MemIntrinsicSDNode!");
   18500 
   18501     if (isAllOnesConstant(Mask)) // return just a store
   18502       return DAG.getStore(Chain, dl, DataToCompress, Addr,
   18503                           MemIntr->getMemOperand());
   18504 
   18505     SDValue Compressed =
   18506       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
   18507                            Mask, DAG.getUNDEF(VT), Subtarget, DAG);
   18508     return DAG.getStore(Chain, dl, Compressed, Addr,
   18509                         MemIntr->getMemOperand());
   18510   }
   18511   case TRUNCATE_TO_MEM_VI8:
   18512   case TRUNCATE_TO_MEM_VI16:
   18513   case TRUNCATE_TO_MEM_VI32: {
   18514     SDValue Mask = Op.getOperand(4);
   18515     SDValue DataToTruncate = Op.getOperand(3);
   18516     SDValue Addr = Op.getOperand(2);
   18517     SDValue Chain = Op.getOperand(0);
   18518 
   18519     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
   18520     assert(MemIntr && "Expected MemIntrinsicSDNode!");
   18521 
   18522     EVT VT  = MemIntr->getMemoryVT();
   18523 
   18524     if (isAllOnesConstant(Mask)) // return just a truncate store
   18525       return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
   18526                                MemIntr->getMemOperand());
   18527 
   18528     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   18529     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   18530 
   18531     return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
   18532                               MemIntr->getMemOperand(), true);
   18533   }
   18534   case EXPAND_FROM_MEM: {
   18535     SDValue Mask = Op.getOperand(4);
   18536     SDValue PassThru = Op.getOperand(3);
   18537     SDValue Addr = Op.getOperand(2);
   18538     SDValue Chain = Op.getOperand(0);
   18539     MVT VT = Op.getSimpleValueType();
   18540 
   18541     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
   18542     assert(MemIntr && "Expected MemIntrinsicSDNode!");
   18543 
   18544     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
   18545                                        MemIntr->getMemOperand());
   18546 
   18547     if (isAllOnesConstant(Mask)) // return just a load
   18548       return DataToExpand;
   18549 
   18550     SDValue Results[] = {
   18551       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
   18552                            Mask, PassThru, Subtarget, DAG), Chain};
   18553     return DAG.getMergeValues(Results, dl);
   18554   }
   18555   }
   18556 }
   18557 
   18558 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
   18559                                            SelectionDAG &DAG) const {
   18560   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   18561   MFI->setReturnAddressIsTaken(true);
   18562 
   18563   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
   18564     return SDValue();
   18565 
   18566   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   18567   SDLoc dl(Op);
   18568   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   18569 
   18570   if (Depth > 0) {
   18571     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
   18572     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   18573     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
   18574     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   18575                        DAG.getNode(ISD::ADD, dl, PtrVT,
   18576                                    FrameAddr, Offset),
   18577                        MachinePointerInfo(), false, false, false, 0);
   18578   }
   18579 
   18580   // Just load the return address.
   18581   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
   18582   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   18583                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
   18584 }
   18585 
   18586 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   18587   MachineFunction &MF = DAG.getMachineFunction();
   18588   MachineFrameInfo *MFI = MF.getFrameInfo();
   18589   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   18590   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   18591   EVT VT = Op.getValueType();
   18592 
   18593   MFI->setFrameAddressIsTaken(true);
   18594 
   18595   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
   18596     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
   18597     // is not possible to crawl up the stack without looking at the unwind codes
   18598     // simultaneously.
   18599     int FrameAddrIndex = FuncInfo->getFAIndex();
   18600     if (!FrameAddrIndex) {
   18601       // Set up a frame object for the return address.
   18602       unsigned SlotSize = RegInfo->getSlotSize();
   18603       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
   18604           SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
   18605       FuncInfo->setFAIndex(FrameAddrIndex);
   18606     }
   18607     return DAG.getFrameIndex(FrameAddrIndex, VT);
   18608   }
   18609 
   18610   unsigned FrameReg =
   18611       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   18612   SDLoc dl(Op);  // FIXME probably not meaningful
   18613   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   18614   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
   18615           (FrameReg == X86::EBP && VT == MVT::i32)) &&
   18616          "Invalid Frame Register!");
   18617   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
   18618   while (Depth--)
   18619     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
   18620                             MachinePointerInfo(),
   18621                             false, false, false, 0);
   18622   return FrameAddr;
   18623 }
   18624 
   18625 // FIXME? Maybe this could be a TableGen attribute on some registers and
   18626 // this table could be generated automatically from RegInfo.
   18627 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
   18628                                               SelectionDAG &DAG) const {
   18629   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   18630   const MachineFunction &MF = DAG.getMachineFunction();
   18631 
   18632   unsigned Reg = StringSwitch<unsigned>(RegName)
   18633                        .Case("esp", X86::ESP)
   18634                        .Case("rsp", X86::RSP)
   18635                        .Case("ebp", X86::EBP)
   18636                        .Case("rbp", X86::RBP)
   18637                        .Default(0);
   18638 
   18639   if (Reg == X86::EBP || Reg == X86::RBP) {
   18640     if (!TFI.hasFP(MF))
   18641       report_fatal_error("register " + StringRef(RegName) +
   18642                          " is allocatable: function has no frame pointer");
   18643 #ifndef NDEBUG
   18644     else {
   18645       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   18646       unsigned FrameReg =
   18647           RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   18648       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
   18649              "Invalid Frame Register!");
   18650     }
   18651 #endif
   18652   }
   18653 
   18654   if (Reg)
   18655     return Reg;
   18656 
   18657   report_fatal_error("Invalid register name global variable");
   18658 }
   18659 
   18660 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
   18661                                                      SelectionDAG &DAG) const {
   18662   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   18663   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
   18664 }
   18665 
   18666 unsigned X86TargetLowering::getExceptionPointerRegister(
   18667     const Constant *PersonalityFn) const {
   18668   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
   18669     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
   18670 
   18671   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
   18672 }
   18673 
   18674 unsigned X86TargetLowering::getExceptionSelectorRegister(
   18675     const Constant *PersonalityFn) const {
   18676   // Funclet personalities don't use selectors (the runtime does the selection).
   18677   assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
   18678   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
   18679 }
   18680 
   18681 bool X86TargetLowering::needsFixedCatchObjects() const {
   18682   return Subtarget.isTargetWin64();
   18683 }
   18684 
   18685 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   18686   SDValue Chain     = Op.getOperand(0);
   18687   SDValue Offset    = Op.getOperand(1);
   18688   SDValue Handler   = Op.getOperand(2);
   18689   SDLoc dl      (Op);
   18690 
   18691   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   18692   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   18693   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   18694   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
   18695           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
   18696          "Invalid Frame Register!");
   18697   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
   18698   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
   18699 
   18700   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
   18701                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
   18702                                                        dl));
   18703   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
   18704   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
   18705                        false, false, 0);
   18706   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
   18707 
   18708   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
   18709                      DAG.getRegister(StoreAddrReg, PtrVT));
   18710 }
   18711 
   18712 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
   18713                                                SelectionDAG &DAG) const {
   18714   SDLoc DL(Op);
  // If the subtarget is not 64-bit, we may need the global base reg
  // after isel expands the pseudo, i.e., after the CGBR pass has run.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we would end up referencing a virtual register
  // that is never defined!
   18721   if (!Subtarget.is64Bit()) {
   18722     const X86InstrInfo *TII = Subtarget.getInstrInfo();
   18723     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
   18724   }
   18725   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
   18726                      DAG.getVTList(MVT::i32, MVT::Other),
   18727                      Op.getOperand(0), Op.getOperand(1));
   18728 }
   18729 
   18730 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
   18731                                                 SelectionDAG &DAG) const {
   18732   SDLoc DL(Op);
   18733   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
   18734                      Op.getOperand(0), Op.getOperand(1));
   18735 }
   18736 
   18737 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
   18738                                                        SelectionDAG &DAG) const {
   18739   SDLoc DL(Op);
   18740   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
   18741                      Op.getOperand(0));
   18742 }
   18743 
   18744 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
   18745   return Op.getOperand(0);
   18746 }
   18747 
   18748 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   18749                                                 SelectionDAG &DAG) const {
   18750   SDValue Root = Op.getOperand(0);
   18751   SDValue Trmp = Op.getOperand(1); // trampoline
   18752   SDValue FPtr = Op.getOperand(2); // nested function
   18753   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
   18754   SDLoc dl (Op);
   18755 
   18756   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   18757   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   18758 
   18759   if (Subtarget.is64Bit()) {
   18760     SDValue OutChains[6];
   18761 
   18762     // Large code-model.
   18763     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
   18764     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
   18765 
   18766     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
   18767     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
   18768 
   18769     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
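    // Taken together, the stores below assemble a 23-byte trampoline:
    //   offset  0: 49 BB <FPtr>   movabsq $fptr, %r11
    //   offset 10: 49 BA <Nest>   movabsq $nest, %r10
    //   offset 20: 49 FF E3       jmpq   *%r11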
   18770 
   18771     // Load the pointer to the nested function into R11.
   18772     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
   18773     SDValue Addr = Trmp;
   18774     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   18775                                 Addr, MachinePointerInfo(TrmpAddr),
   18776                                 false, false, 0);
   18777 
   18778     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   18779                        DAG.getConstant(2, dl, MVT::i64));
   18780     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
   18781                                 MachinePointerInfo(TrmpAddr, 2),
   18782                                 false, false, 2);
   18783 
   18784     // Load the 'nest' parameter value into R10.
   18785     // R10 is specified in X86CallingConv.td
   18786     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
   18787     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   18788                        DAG.getConstant(10, dl, MVT::i64));
   18789     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   18790                                 Addr, MachinePointerInfo(TrmpAddr, 10),
   18791                                 false, false, 0);
   18792 
   18793     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   18794                        DAG.getConstant(12, dl, MVT::i64));
   18795     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
   18796                                 MachinePointerInfo(TrmpAddr, 12),
   18797                                 false, false, 2);
   18798 
   18799     // Jump to the nested function.
   18800     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
   18801     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   18802                        DAG.getConstant(20, dl, MVT::i64));
   18803     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   18804                                 Addr, MachinePointerInfo(TrmpAddr, 20),
   18805                                 false, false, 0);
   18806 
   18807     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
   18808     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   18809                        DAG.getConstant(22, dl, MVT::i64));
   18810     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
   18811                                 Addr, MachinePointerInfo(TrmpAddr, 22),
   18812                                 false, false, 0);
   18813 
   18814     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   18815   } else {
   18816     const Function *Func =
   18817       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
   18818     CallingConv::ID CC = Func->getCallingConv();
   18819     unsigned NestReg;
   18820 
   18821     switch (CC) {
   18822     default:
   18823       llvm_unreachable("Unsupported calling convention");
   18824     case CallingConv::C:
   18825     case CallingConv::X86_StdCall: {
   18826       // Pass 'nest' parameter in ECX.
   18827       // Must be kept in sync with X86CallingConv.td
   18828       NestReg = X86::ECX;
   18829 
   18830       // Check that ECX wasn't needed by an 'inreg' parameter.
   18831       FunctionType *FTy = Func->getFunctionType();
   18832       const AttributeSet &Attrs = Func->getAttributes();
   18833 
   18834       if (!Attrs.isEmpty() && !Func->isVarArg()) {
   18835         unsigned InRegCount = 0;
   18836         unsigned Idx = 1;
   18837 
   18838         for (FunctionType::param_iterator I = FTy->param_begin(),
   18839              E = FTy->param_end(); I != E; ++I, ++Idx)
   18840           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
   18841             auto &DL = DAG.getDataLayout();
   18842             // FIXME: should only count parameters that are lowered to integers.
   18843             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
   18844           }
   18845 
   18846         if (InRegCount > 2) {
   18847           report_fatal_error("Nest register in use - reduce number of inreg"
   18848                              " parameters!");
   18849         }
   18850       }
   18851       break;
   18852     }
   18853     case CallingConv::X86_FastCall:
   18854     case CallingConv::X86_ThisCall:
   18855     case CallingConv::Fast:
   18856       // Pass 'nest' parameter in EAX.
   18857       // Must be kept in sync with X86CallingConv.td
   18858       NestReg = X86::EAX;
   18859       break;
   18860     }
   18861 
   18862     SDValue OutChains[4];
   18863     SDValue Addr, Disp;
   18864 
   18865     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   18866                        DAG.getConstant(10, dl, MVT::i32));
   18867     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
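    // The 32-bit trampoline is 10 bytes: B8+reg <Nest> (mov $nest, NestReg)
    // followed by E9 <rel32> (jmp fptr). The jmp displacement is relative to
    // the end of the trampoline, i.e. Trmp + 10, hence the Disp above.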
   18868 
   18869     // This is storing the opcode for MOV32ri.
   18870     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
   18871     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
   18872     OutChains[0] = DAG.getStore(Root, dl,
   18873                                 DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
   18874                                 Trmp, MachinePointerInfo(TrmpAddr),
   18875                                 false, false, 0);
   18876 
   18877     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   18878                        DAG.getConstant(1, dl, MVT::i32));
   18879     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
   18880                                 MachinePointerInfo(TrmpAddr, 1),
   18881                                 false, false, 1);
   18882 
   18883     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
   18884     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   18885                        DAG.getConstant(5, dl, MVT::i32));
   18886     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
   18887                                 Addr, MachinePointerInfo(TrmpAddr, 5),
   18888                                 false, false, 1);
   18889 
   18890     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   18891                        DAG.getConstant(6, dl, MVT::i32));
   18892     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
   18893                                 MachinePointerInfo(TrmpAddr, 6),
   18894                                 false, false, 1);
   18895 
   18896     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   18897   }
   18898 }
   18899 
   18900 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   18901                                             SelectionDAG &DAG) const {
   18902   /*
   The rounding mode is in bits 11:10 of the x87 FP control word (FPCW),
   and has the following settings:
   18905      00 Round to nearest
   18906      01 Round to -inf
   18907      10 Round to +inf
   18908      11 Round to 0
   18909 
   18910   FLT_ROUNDS, on the other hand, expects the following:
   18911     -1 Undefined
   18912      0 Round to 0
   18913      1 Round to nearest
   18914      2 Round to +inf
   18915      3 Round to -inf
   18916 
   18917   To perform the conversion, we do:
    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
   18919   */
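  // For example, with rounding control 01b (round to -inf) bit 10 is set and
  // bit 11 is clear, so the expression evaluates to ((0) | (0x400 >> 9)) + 1
  // = 3, which is FLT_ROUNDS' encoding of "round to -inf".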
   18920 
   18921   MachineFunction &MF = DAG.getMachineFunction();
   18922   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   18923   unsigned StackAlignment = TFI.getStackAlignment();
   18924   MVT VT = Op.getSimpleValueType();
   18925   SDLoc DL(Op);
   18926 
   18927   // Save FP Control Word to stack slot
   18928   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
   18929   SDValue StackSlot =
   18930       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
   18931 
   18932   MachineMemOperand *MMO =
   18933       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   18934                               MachineMemOperand::MOStore, 2, 2);
   18935 
   18936   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
   18937   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
   18938                                           DAG.getVTList(MVT::Other),
   18939                                           Ops, MVT::i16, MMO);
   18940 
   18941   // Load FP Control Word from stack slot
   18942   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
   18943                             MachinePointerInfo(), false, false, false, 0);
   18944 
   18945   // Transform as necessary
   18946   SDValue CWD1 =
   18947     DAG.getNode(ISD::SRL, DL, MVT::i16,
   18948                 DAG.getNode(ISD::AND, DL, MVT::i16,
   18949                             CWD, DAG.getConstant(0x800, DL, MVT::i16)),
   18950                 DAG.getConstant(11, DL, MVT::i8));
   18951   SDValue CWD2 =
   18952     DAG.getNode(ISD::SRL, DL, MVT::i16,
   18953                 DAG.getNode(ISD::AND, DL, MVT::i16,
   18954                             CWD, DAG.getConstant(0x400, DL, MVT::i16)),
   18955                 DAG.getConstant(9, DL, MVT::i8));
   18956 
   18957   SDValue RetVal =
   18958     DAG.getNode(ISD::AND, DL, MVT::i16,
   18959                 DAG.getNode(ISD::ADD, DL, MVT::i16,
   18960                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
   18961                             DAG.getConstant(1, DL, MVT::i16)),
   18962                 DAG.getConstant(3, DL, MVT::i16));
   18963 
   18964   return DAG.getNode((VT.getSizeInBits() < 16 ?
   18965                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
   18966 }
   18967 
/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
/// instruction.
//
// 1. i32/i64 128/256-bit vectors (native support requires VLX) are widened
//    to 512-bit vectors.
// 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
//    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
//    split the vector, perform the operation on its Lo and Hi parts and
//    concatenate the results.
   18976 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
   18977   assert(Op.getOpcode() == ISD::CTLZ);
   18978   SDLoc dl(Op);
   18979   MVT VT = Op.getSimpleValueType();
   18980   MVT EltVT = VT.getVectorElementType();
   18981   unsigned NumElems = VT.getVectorNumElements();
   18982 
   18983   if (EltVT == MVT::i64 || EltVT == MVT::i32) {
   18984     // Extend to 512 bit vector.
   18985     assert((VT.is256BitVector() || VT.is128BitVector()) &&
   18986               "Unsupported value type for operation");
   18987 
   18988     MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
   18989     SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
   18990                                  DAG.getUNDEF(NewVT),
   18991                                  Op.getOperand(0),
   18992                                  DAG.getIntPtrConstant(0, dl));
   18993     SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
   18994 
   18995     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
   18996                        DAG.getIntPtrConstant(0, dl));
   18997   }
   18998 
   18999   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
   19000           "Unsupported element type");
   19001 
   19002   if (16 < NumElems) {
    // Split the vector; its Lo and Hi parts will be handled in the next
    // iteration.
   19004     SDValue Lo, Hi;
   19005     std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
   19006     MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
   19007 
   19008     Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
   19009     Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
   19010 
   19011     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
   19012   }
   19013 
   19014   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
   19015 
   19016   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
   19017           "Unsupported value type for operation");
   19018 
  // Use the natively supported vector instruction vplzcntd.
   19020   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
   19021   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
   19022   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
   19023   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
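  // For example, for v16i8 the zero-extension to i32 introduces 32 - 8 = 24
  // extra leading zeros per element, which the subtraction of Delta removes.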
   19024 
   19025   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
   19026 }
   19027 
   19028 // Lower CTLZ using a PSHUFB lookup table implementation.
   19029 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
   19030                                        const X86Subtarget &Subtarget,
   19031                                        SelectionDAG &DAG) {
   19032   MVT VT = Op.getSimpleValueType();
   19033   int NumElts = VT.getVectorNumElements();
   19034   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
   19035   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
   19036 
   19037   // Per-nibble leading zero PSHUFB lookup table.
   19038   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
   19039                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
   19040                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
   19041                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
   19042 
   19043   SmallVector<SDValue, 64> LUTVec;
   19044   for (int i = 0; i < NumBytes; ++i)
   19045     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
   19046   SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
   19047 
  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
   19050   // If the hi input nibble is zero then we add both results together, otherwise
   19051   // we just take the hi result (by masking the lo result to zero before the
   19052   // add).
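  // For example, ctlz of the byte 0x1A: the hi nibble is 0x1, so the LUT
  // yields 3 and the lo contribution is masked away. For 0x05 the hi nibble
  // is zero, so we add LUT[0x0] = 4 and LUT[0x5] = 1 to get 5 leading zeros.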
   19053   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
   19054   SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
   19055 
   19056   SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
   19057   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
   19058   SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
   19059   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
   19060   SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
   19061 
   19062   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
   19063   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
   19064   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
   19065   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
   19066 
   19067   // Merge result back from vXi8 back to VT, working on the lo/hi halves
   19068   // of the current vector width in the same way we did for the nibbles.
   19069   // If the upper half of the input element is zero then add the halves'
   19070   // leading zero counts together, otherwise just use the upper half's.
   19071   // Double the width of the result until we are at target width.
   19072   while (CurrVT != VT) {
   19073     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
   19074     int CurrNumElts = CurrVT.getVectorNumElements();
   19075     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
   19076     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
   19077     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
   19078 
   19079     // Check if the upper half of the input element is zero.
   19080     SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
   19081                                DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
   19082     HiZ = DAG.getBitcast(NextVT, HiZ);
   19083 
   19084     // Move the upper/lower halves to the lower bits as we'll be extending to
   19085     // NextVT. Mask the lower result to zero if HiZ is true and add the results
   19086     // together.
   19087     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
   19088     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
   19089     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
   19090     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
   19091     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
   19092     CurrVT = NextVT;
   19093   }
   19094 
   19095   return Res;
   19096 }
   19097 
   19098 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
   19099                                const X86Subtarget &Subtarget,
   19100                                SelectionDAG &DAG) {
   19101   MVT VT = Op.getSimpleValueType();
   19102   SDValue Op0 = Op.getOperand(0);
   19103 
   19104   if (Subtarget.hasAVX512())
   19105     return LowerVectorCTLZ_AVX512(Op, DAG);
   19106 
   19107   // Decompose 256-bit ops into smaller 128-bit ops.
   19108   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
   19109     unsigned NumElems = VT.getVectorNumElements();
   19110 
   19111     // Extract each 128-bit vector, perform ctlz and concat the result.
   19112     SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
   19113     SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
   19114 
   19115     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
   19116                        DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
   19117                        DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
   19118   }
   19119 
   19120   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
   19121   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
   19122 }
   19123 
   19124 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
   19125                          SelectionDAG &DAG) {
   19126   MVT VT = Op.getSimpleValueType();
   19127   MVT OpVT = VT;
   19128   unsigned NumBits = VT.getSizeInBits();
   19129   SDLoc dl(Op);
   19130   unsigned Opc = Op.getOpcode();
   19131 
   19132   if (VT.isVector())
   19133     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
   19134 
   19135   Op = Op.getOperand(0);
   19136   if (VT == MVT::i8) {
    // Zero extend to i32 since there is no i8 bsr instruction.
   19138     OpVT = MVT::i32;
   19139     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   19140   }
   19141 
   19142   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
   19143   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   19144   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   19145 
   19146   if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), select 2*NumBits-1 so that the
    // final xor with NumBits-1 below yields NumBits.
   19148     SDValue Ops[] = {
   19149       Op,
   19150       DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
   19151       DAG.getConstant(X86::COND_E, dl, MVT::i8),
   19152       Op.getValue(1)
   19153     };
   19154     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
   19155   }
   19156 
  // Finally xor with NumBits-1: bsr returns the index of the highest set bit,
  // and ctlz(x) = (NumBits-1) - bsr(x). Because the index never exceeds
  // NumBits-1 (all ones in the index bits), the subtraction is just an xor.
   19158   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
   19159                    DAG.getConstant(NumBits - 1, dl, OpVT));
   19160 
   19161   if (VT == MVT::i8)
   19162     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   19163   return Op;
   19164 }
   19165 
   19166 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
   19167   MVT VT = Op.getSimpleValueType();
   19168   unsigned NumBits = VT.getScalarSizeInBits();
   19169   SDLoc dl(Op);
   19170 
   19171   if (VT.isVector()) {
   19172     SDValue N0 = Op.getOperand(0);
   19173     SDValue Zero = DAG.getConstant(0, dl, VT);
   19174 
   19175     // lsb(x) = (x & -x)
   19176     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
   19177                               DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
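    // For example, x = 0b01101000: lsb = 0b00001000, so cttz(x) = 3, matching
    // both (7 - ctlz8(lsb)) = 7 - 4 and ctpop(lsb - 1) = ctpop(0b0111).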
   19178 
   19179     // cttz_undef(x) = (width - 1) - ctlz(lsb)
   19180     if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
   19181       SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
   19182       return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
   19183                          DAG.getNode(ISD::CTLZ, dl, VT, LSB));
   19184     }
   19185 
   19186     // cttz(x) = ctpop(lsb - 1)
   19187     SDValue One = DAG.getConstant(1, dl, VT);
   19188     return DAG.getNode(ISD::CTPOP, dl, VT,
   19189                        DAG.getNode(ISD::SUB, dl, VT, LSB, One));
   19190   }
   19191 
   19192   assert(Op.getOpcode() == ISD::CTTZ &&
   19193          "Only scalar CTTZ requires custom lowering");
   19194 
   19195   // Issue a bsf (scan bits forward) which also sets EFLAGS.
   19196   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   19197   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
   19198 
  // If src is zero (i.e. bsf sets ZF), return NumBits.
   19200   SDValue Ops[] = {
   19201     Op,
   19202     DAG.getConstant(NumBits, dl, VT),
   19203     DAG.getConstant(X86::COND_E, dl, MVT::i8),
   19204     Op.getValue(1)
   19205   };
   19206   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
   19207 }
   19208 
   19209 /// Break a 256-bit integer operation into two new 128-bit ones and then
   19210 /// concatenate the result back.
   19211 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
   19212   MVT VT = Op.getSimpleValueType();
   19213 
   19214   assert(VT.is256BitVector() && VT.isInteger() &&
   19215          "Unsupported value type for operation");
   19216 
   19217   unsigned NumElems = VT.getVectorNumElements();
   19218   SDLoc dl(Op);
   19219 
   19220   // Extract the LHS vectors
   19221   SDValue LHS = Op.getOperand(0);
   19222   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
   19223   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
   19224 
   19225   // Extract the RHS vectors
   19226   SDValue RHS = Op.getOperand(1);
   19227   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
   19228   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
   19229 
   19230   MVT EltVT = VT.getVectorElementType();
   19231   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   19232 
   19233   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   19234                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
   19235                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
   19236 }
   19237 
   19238 /// Break a 512-bit integer operation into two new 256-bit ones and then
   19239 /// concatenate the result back.
   19240 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
   19241   MVT VT = Op.getSimpleValueType();
   19242 
   19243   assert(VT.is512BitVector() && VT.isInteger() &&
   19244          "Unsupported value type for operation");
   19245 
   19246   unsigned NumElems = VT.getVectorNumElements();
   19247   SDLoc dl(Op);
   19248 
   19249   // Extract the LHS vectors
   19250   SDValue LHS = Op.getOperand(0);
   19251   SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
   19252   SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
   19253 
   19254   // Extract the RHS vectors
   19255   SDValue RHS = Op.getOperand(1);
   19256   SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
   19257   SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
   19258 
   19259   MVT EltVT = VT.getVectorElementType();
   19260   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   19261 
   19262   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   19263                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
   19264                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
   19265 }
   19266 
   19267 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
   19268   if (Op.getValueType() == MVT::i1)
   19269     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
   19270                        Op.getOperand(0), Op.getOperand(1));
   19271   assert(Op.getSimpleValueType().is256BitVector() &&
   19272          Op.getSimpleValueType().isInteger() &&
   19273          "Only handle AVX 256-bit vector integer operation");
   19274   return Lower256IntArith(Op, DAG);
   19275 }
   19276 
   19277 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
   19278   if (Op.getValueType() == MVT::i1)
   19279     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
   19280                        Op.getOperand(0), Op.getOperand(1));
   19281   assert(Op.getSimpleValueType().is256BitVector() &&
   19282          Op.getSimpleValueType().isInteger() &&
   19283          "Only handle AVX 256-bit vector integer operation");
   19284   return Lower256IntArith(Op, DAG);
   19285 }
   19286 
   19287 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
   19288   assert(Op.getSimpleValueType().is256BitVector() &&
   19289          Op.getSimpleValueType().isInteger() &&
   19290          "Only handle AVX 256-bit vector integer operation");
   19291   return Lower256IntArith(Op, DAG);
   19292 }
   19293 
   19294 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
   19295                         SelectionDAG &DAG) {
   19296   SDLoc dl(Op);
   19297   MVT VT = Op.getSimpleValueType();
   19298 
   19299   if (VT == MVT::i1)
   19300     return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
   19301 
   19302   // Decompose 256-bit ops into smaller 128-bit ops.
   19303   if (VT.is256BitVector() && !Subtarget.hasInt256())
   19304     return Lower256IntArith(Op, DAG);
   19305 
   19306   SDValue A = Op.getOperand(0);
   19307   SDValue B = Op.getOperand(1);
   19308 
   19309   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
   19310   // vector pairs, multiply and truncate.
   19311   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
   19312     if (Subtarget.hasInt256()) {
   19313       // For 512-bit vectors, split into 256-bit vectors to allow the
   19314       // sign-extension to occur.
   19315       if (VT == MVT::v64i8)
   19316         return Lower512IntArith(Op, DAG);
   19317 
   19318       // For 256-bit vectors, split into 128-bit vectors to allow the
   19319       // sign-extension to occur. We don't need this on AVX512BW as we can
   19320       // safely sign-extend to v32i16.
   19321       if (VT == MVT::v32i8 && !Subtarget.hasBWI())
   19322         return Lower256IntArith(Op, DAG);
   19323 
   19324       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
   19325       return DAG.getNode(
   19326           ISD::TRUNCATE, dl, VT,
   19327           DAG.getNode(ISD::MUL, dl, ExVT,
   19328                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
   19329                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
   19330     }
   19331 
   19332     assert(VT == MVT::v16i8 &&
   19333            "Pre-AVX2 support only supports v16i8 multiplication");
   19334     MVT ExVT = MVT::v8i16;
   19335 
   19336     // Extract the lo parts and sign extend to i16
   19337     SDValue ALo, BLo;
   19338     if (Subtarget.hasSSE41()) {
   19339       ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
   19340       BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
   19341     } else {
   19342       const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
   19343                               -1, 4, -1, 5, -1, 6, -1, 7};
   19344       ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   19345       BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   19346       ALo = DAG.getBitcast(ExVT, ALo);
   19347       BLo = DAG.getBitcast(ExVT, BLo);
   19348       ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
   19349       BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
   19350     }
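    // (In the non-SSE4.1 path above, the shuffle places each source byte in
    // the high byte of a 16-bit lane; the arithmetic shift right by 8 then
    // recreates the sign extension.)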
   19351 
   19352     // Extract the hi parts and sign extend to i16
   19353     SDValue AHi, BHi;
   19354     if (Subtarget.hasSSE41()) {
   19355       const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
   19356                               -1, -1, -1, -1, -1, -1, -1, -1};
   19357       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   19358       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   19359       AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
   19360       BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
   19361     } else {
   19362       const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
   19363                               -1, 12, -1, 13, -1, 14, -1, 15};
   19364       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   19365       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   19366       AHi = DAG.getBitcast(ExVT, AHi);
   19367       BHi = DAG.getBitcast(ExVT, BHi);
   19368       AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
   19369       BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
   19370     }
   19371 
    // Multiply, mask the lower 8 bits of the lo/hi results and pack
   19373     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
   19374     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
   19375     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
   19376     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
   19377     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
   19378   }
   19379 
   19380   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
   19381   if (VT == MVT::v4i32) {
   19382     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
   19383            "Should not custom lower when pmuldq is available!");
   19384 
   19385     // Extract the odd parts.
   19386     static const int UnpackMask[] = { 1, -1, 3, -1 };
   19387     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
   19388     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
   19389 
   19390     // Multiply the even parts.
   19391     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
   19392     // Now multiply odd parts.
   19393     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
   19394 
   19395     Evens = DAG.getBitcast(VT, Evens);
   19396     Odds = DAG.getBitcast(VT, Odds);
   19397 
   19398     // Merge the two vectors back together with a shuffle. This expands into 2
   19399     // shuffles.
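    // PMULUDQ multiplies dword elements 0 and 2, so after the bitcasts the
    // low halves of the products sit in elements 0 and 2 of Evens/Odds; the
    // {0, 4, 2, 6} mask interleaves them back into a0*b0 .. a3*b3 order.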
   19400     static const int ShufMask[] = { 0, 4, 2, 6 };
   19401     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   19402   }
   19403 
   19404   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
   19405          "Only know how to lower V2I64/V4I64/V8I64 multiply");
   19406 
   19407   //  Ahi = psrlqi(a, 32);
   19408   //  Bhi = psrlqi(b, 32);
   19409   //
   19410   //  AloBlo = pmuludq(a, b);
   19411   //  AloBhi = pmuludq(a, Bhi);
   19412   //  AhiBlo = pmuludq(Ahi, b);
   19413 
   19414   //  AloBhi = psllqi(AloBhi, 32);
   19415   //  AhiBlo = psllqi(AhiBlo, 32);
   19416   //  return AloBlo + AloBhi + AhiBlo;
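  //
  //  This works because, writing a = a_lo + 2^32 * a_hi (and likewise b),
  //  a * b mod 2^64 = a_lo*b_lo + 2^32 * (a_lo*b_hi + a_hi*b_lo); the
  //  a_hi*b_hi term is shifted entirely out of the 64-bit result.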
   19417 
   19418   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
   19419   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
   19420 
   19421   SDValue AhiBlo = Ahi;
   19422   SDValue AloBhi = Bhi;
   19423   // Bit cast to 32-bit vectors for MULUDQ
   19424   MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
   19425                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
   19426   A = DAG.getBitcast(MulVT, A);
   19427   B = DAG.getBitcast(MulVT, B);
   19428   Ahi = DAG.getBitcast(MulVT, Ahi);
   19429   Bhi = DAG.getBitcast(MulVT, Bhi);
   19430 
   19431   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
  // If an operand was a constant, its shifted-right half (Ahi or Bhi) may be
  // all zeros; skip the corresponding multiply and shift in that case.
   19433   if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
   19434     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
   19435     AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
   19436   }
   19437   if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
   19438     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
   19439     AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
   19440   }
   19441 
   19442   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
   19443   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
   19444 }
   19445 
   19446 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
   19447                          SelectionDAG &DAG) {
   19448   SDLoc dl(Op);
   19449   MVT VT = Op.getSimpleValueType();
   19450 
   19451   // Decompose 256-bit ops into smaller 128-bit ops.
   19452   if (VT.is256BitVector() && !Subtarget.hasInt256())
   19453     return Lower256IntArith(Op, DAG);
   19454 
   19455   // Only i8 vectors should need custom lowering after this.
   19456   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
   19457          "Unsupported vector type");
   19458 
   19459   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
   19460   // logical shift down the upper half and pack back to i8.
   19461   SDValue A = Op.getOperand(0);
   19462   SDValue B = Op.getOperand(1);
   19463 
   19464   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
   19465   // and then ashr/lshr the upper bits down to the lower bits before multiply.
   19466   unsigned Opcode = Op.getOpcode();
   19467   unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
   19468   unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
   19469 
   19470   // AVX2 implementations - extend xmm subvectors to ymm.
   19471   if (Subtarget.hasInt256()) {
   19472     SDValue Lo = DAG.getIntPtrConstant(0, dl);
   19473     SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
   19474 
   19475     if (VT == MVT::v32i8) {
   19476       SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
   19477       SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
   19478       SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
   19479       SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
   19480       ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
   19481       BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
   19482       AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
   19483       BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
   19484       Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
   19485                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
   19486                        DAG.getConstant(8, dl, MVT::v16i16));
   19487       Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
   19488                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
   19489                        DAG.getConstant(8, dl, MVT::v16i16));
   19490       // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
   19491       // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
   19492       const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
   19493                             16, 17, 18, 19, 20, 21, 22, 23};
   19494       const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
   19495                             24, 25, 26, 27, 28, 29, 30, 31};
   19496       return DAG.getNode(X86ISD::PACKUS, dl, VT,
   19497                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
   19498                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
   19499     }
   19500 
   19501     SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
   19502     SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
   19503     SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
   19504     SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
   19505                                DAG.getConstant(8, dl, MVT::v16i16));
   19506     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
   19507     Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
   19508     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
   19509   }
   19510 
   19511   assert(VT == MVT::v16i8 &&
   19512          "Pre-AVX2 support only supports v16i8 multiplication");
   19513   MVT ExVT = MVT::v8i16;
   19514 
   19515   // Extract the lo parts and zero/sign extend to i16.
   19516   SDValue ALo, BLo;
   19517   if (Subtarget.hasSSE41()) {
   19518     ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
   19519     BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
   19520   } else {
   19521     const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
   19522                             -1, 4, -1, 5, -1, 6, -1, 7};
   19523     ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   19524     BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   19525     ALo = DAG.getBitcast(ExVT, ALo);
   19526     BLo = DAG.getBitcast(ExVT, BLo);
   19527     ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
   19528     BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
   19529   }
   19530 
   19531   // Extract the hi parts and zero/sign extend to i16.
   19532   SDValue AHi, BHi;
   19533   if (Subtarget.hasSSE41()) {
   19534     const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
   19535                             -1, -1, -1, -1, -1, -1, -1, -1};
   19536     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   19537     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   19538     AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
   19539     BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
   19540   } else {
   19541     const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
   19542                             -1, 12, -1, 13, -1, 14, -1, 15};
   19543     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   19544     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   19545     AHi = DAG.getBitcast(ExVT, AHi);
   19546     BHi = DAG.getBitcast(ExVT, BHi);
   19547     AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
   19548     BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
   19549   }
   19550 
   19551   // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
   19552   // and pack back to v16i8.
   19553   SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
   19554   SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
   19555   RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
   19556   RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
   19557   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
   19558 }
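         // Scalar model of what the lowering above computes per element
         // (illustrative only): MULHU/MULHS return the high 8 bits of the
         // widened 16-bit product, e.g.
         //   uint8_t MulHU8(uint8_t A, uint8_t B) {
         //     return uint8_t((uint16_t(A) * uint16_t(B)) >> 8);
         //   }
         //   int8_t MulHS8(int8_t A, int8_t B) {
         //     return int8_t((int16_t(A) * int16_t(B)) >> 8);
         //   }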
   19559 
   19560 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
   19561   assert(Subtarget.isTargetWin64() && "Unexpected target");
   19562   EVT VT = Op.getValueType();
   19563   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
   19564          "Unexpected return type for lowering");
   19565 
   19566   RTLIB::Libcall LC;
   19567   bool isSigned;
   19568   switch (Op->getOpcode()) {
   19569   default: llvm_unreachable("Unexpected request for libcall!");
   19570   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
   19571   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
   19572   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
   19573   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
   19574   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
   19575   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
   19576   }
   19577 
   19578   SDLoc dl(Op);
   19579   SDValue InChain = DAG.getEntryNode();
   19580 
   19581   TargetLowering::ArgListTy Args;
   19582   TargetLowering::ArgListEntry Entry;
   19583   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
   19584     EVT ArgVT = Op->getOperand(i).getValueType();
   19585     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
   19586            "Unexpected argument type for lowering");
   19587     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
   19588     Entry.Node = StackPtr;
   19589     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
   19590                            false, false, 16);
   19591     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   19592     Entry.Ty = PointerType::get(ArgTy,0);
   19593     Entry.isSExt = false;
   19594     Entry.isZExt = false;
   19595     Args.push_back(Entry);
   19596   }
   19597 
   19598   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
   19599                                          getPointerTy(DAG.getDataLayout()));
   19600 
   19601   TargetLowering::CallLoweringInfo CLI(DAG);
   19602   CLI.setDebugLoc(dl).setChain(InChain)
   19603     .setCallee(getLibcallCallingConv(LC),
   19604                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
   19605                Callee, std::move(Args))
   19606     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
   19607 
   19608   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
   19609   return DAG.getBitcast(VT, CallInfo.first);
   19610 }
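         // Illustrative shape of the libcall emitted above (the exact symbol is
         // whatever getLibcallName(LC) returns): each i128 operand is spilled to
         // a 16-byte-aligned stack slot and passed by pointer, and the result is
         // modeled as a v2i64 return value that is bitcast back to i128, i.e.
         // conceptually:
         //   v2i64 __libcall(const i128 *LHS, const i128 *RHS);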
   19611 
   19612 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
   19613                              SelectionDAG &DAG) {
   19614   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
   19615   MVT VT = Op0.getSimpleValueType();
   19616   SDLoc dl(Op);
   19617 
   19618   // Decompose 256-bit ops into smaller 128-bit ops.
   19619   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
   19620     unsigned Opcode = Op.getOpcode();
   19621     unsigned NumElems = VT.getVectorNumElements();
   19622     MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
   19623     SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
   19624     SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
   19625     SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
   19626     SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
   19627     SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
   19628     SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
   19629     SDValue Ops[] = {
   19630       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
   19631       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
   19632     };
   19633     return DAG.getMergeValues(Ops, dl);
   19634   }
   19635 
   19636   assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
   19637          (VT == MVT::v8i32 && Subtarget.hasInt256()));
   19638 
   19639   // PMULxD operations multiply each even value (starting at 0) of LHS with
   19640   // the corresponding value of RHS and produce a widened result.
   19641   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   19642   // => <2 x i64> <ae|cg>
   19643   //
   19644   // In other words, to have all the results, we need to perform two PMULxD:
   19645   // 1. one with the even values.
   19646   // 2. one with the odd values.
   19647   // To achieve #2, we need to place the odd values at an even position.
   19648   //
   19649   // Place the odd values at an even position (basically, shift all values one
   19650   // step to the left):
   19651   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
   19652   // <a|b|c|d> => <b|undef|d|undef>
   19653   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
   19654                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
   19655   // <e|f|g|h> => <f|undef|h|undef>
   19656   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
   19657                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
   19658 
   19659   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
   19660   // ints.
   19661   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
   19662   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   19663   unsigned Opcode =
   19664       (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
   19665   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   19666   // => <2 x i64> <ae|cg>
   19667   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
   19668   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
   19669   // => <2 x i64> <bf|dh>
   19670   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
   19671 
   19672   // Shuffle it back into the right order.
   19673   SDValue Highs, Lows;
   19674   if (VT == MVT::v8i32) {
   19675     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
   19676     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   19677     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
   19678     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   19679   } else {
   19680     const int HighMask[] = {1, 5, 3, 7};
   19681     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   19682     const int LowMask[] = {0, 4, 2, 6};
   19683     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   19684   }
   19685 
   19686   // If we have a signed multiply but no PMULDQ, fix up the high parts of an
   19687   // unsigned multiply.
   19688   if (IsSigned && !Subtarget.hasSSE41()) {
   19689     SDValue ShAmt = DAG.getConstant(
   19690         31, dl,
   19691         DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
   19692     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
   19693                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
   19694     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
   19695                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
   19696 
   19697     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
   19698     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   19699   }
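           // The fixup above relies on the identity (illustrative, per lane,
           // arithmetic modulo 2^32):
           //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
           // with each correction term computed as AND(SRA(x, 31), y).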
   19700 
   19701   // The first result of MUL_LOHI is actually the low value, followed by the
   19702   // high value.
   19703   SDValue Ops[] = {Lows, Highs};
   19704   return DAG.getMergeValues(Ops, dl);
   19705 }
   19706 
   19707 // Return true if the required (according to Opcode) shift-imm form is natively
   19708 // supported by the Subtarget
   19709 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
   19710                                         unsigned Opcode) {
   19711   if (VT.getScalarSizeInBits() < 16)
   19712     return false;
   19713 
   19714   if (VT.is512BitVector() &&
   19715       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
   19716     return true;
   19717 
   19718   bool LShift = VT.is128BitVector() ||
   19719     (VT.is256BitVector() && Subtarget.hasInt256());
   19720 
   19721   bool AShift = LShift && (Subtarget.hasVLX() ||
   19722     (VT != MVT::v2i64 && VT != MVT::v4i64));
   19723   return (Opcode == ISD::SRA) ? AShift : LShift;
   19724 }
   19725 
   19726 // The shift amount is a variable, but it is the same for all vector lanes.
   19727 // These instructions are defined together with shift-immediate.
   19728 static
   19729 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
   19730                                       unsigned Opcode) {
   19731   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
   19732 }
   19733 
   19734 // Return true if the required (according to Opcode) variable-shift form is
   19735 // natively supported by the Subtarget
   19736 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
   19737                                     unsigned Opcode) {
   19738 
   19739   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
   19740     return false;
   19741 
   19742   // vXi16 supported only on AVX-512, BWI
   19743   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
   19744     return false;
   19745 
   19746   if (VT.is512BitVector() || Subtarget.hasVLX())
   19747     return true;
   19748 
   19749   bool LShift = VT.is128BitVector() || VT.is256BitVector();
   19750   bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
   19751   return (Opcode == ISD::SRA) ? AShift : LShift;
   19752 }
   19753 
   19754 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   19755                                          const X86Subtarget &Subtarget) {
   19756   MVT VT = Op.getSimpleValueType();
   19757   SDLoc dl(Op);
   19758   SDValue R = Op.getOperand(0);
   19759   SDValue Amt = Op.getOperand(1);
   19760 
   19761   unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
   19762     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
   19763 
   19764   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
   19765     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
   19766     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
   19767     SDValue Ex = DAG.getBitcast(ExVT, R);
   19768 
   19769     if (ShiftAmt >= 32) {
   19770       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
   19771       SDValue Upper =
   19772           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
   19773       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
   19774                                                  ShiftAmt - 32, DAG);
   19775       if (VT == MVT::v2i64)
   19776         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
   19777       if (VT == MVT::v4i64)
   19778         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
   19779                                   {9, 1, 11, 3, 13, 5, 15, 7});
   19780     } else {
   19781       // SRA upper i32, SHL whole i64 and select lower i32.
   19782       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
   19783                                                  ShiftAmt, DAG);
   19784       SDValue Lower =
   19785           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
   19786       Lower = DAG.getBitcast(ExVT, Lower);
   19787       if (VT == MVT::v2i64)
   19788         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
   19789       if (VT == MVT::v4i64)
   19790         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
   19791                                   {8, 1, 10, 3, 12, 5, 14, 7});
   19792     }
   19793     return DAG.getBitcast(VT, Ex);
   19794   };
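           // Scalar model of the lambda above (illustrative only), for the
           // 0 < ShiftAmt < 32 case:
           //   int64_t SRA64(int64_t X, unsigned S) {
           //     uint64_t Logical = uint64_t(X) >> S;            // VSRLI, whole i64
           //     uint32_t Hi = uint32_t(int32_t(X >> 32) >> S);  // VSRAI, upper i32
           //     return int64_t((uint64_t(Hi) << 32) | uint32_t(Logical));
           //   }
           // For ShiftAmt >= 32 the high half becomes the sign splat (SRA by 31)
           // and the low half is the old high half shifted by ShiftAmt - 32.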
   19795 
   19796   // Optimize shl/srl/sra with constant shift amount.
   19797   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   19798     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
   19799       uint64_t ShiftAmt = ShiftConst->getZExtValue();
   19800 
   19801       if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
   19802         return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
   19803 
   19804       // i64 SRA needs to be performed as partial shifts.
   19805       if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
   19806           Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
   19807         return ArithmeticShiftRight64(ShiftAmt);
   19808 
   19809       if (VT == MVT::v16i8 ||
   19810           (Subtarget.hasInt256() && VT == MVT::v32i8) ||
   19811           VT == MVT::v64i8) {
   19812         unsigned NumElts = VT.getVectorNumElements();
   19813         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
   19814 
   19815         // Simple i8 add case
   19816         if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
   19817           return DAG.getNode(ISD::ADD, dl, VT, R, R);
   19818 
   19819         // ashr(R, 7)  === cmp_slt(R, 0)
   19820         if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
   19821           SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   19822           if (VT.is512BitVector()) {
   19823             assert(VT == MVT::v64i8 && "Unexpected element type!");
   19824             SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
   19825             return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
   19826           }
   19827           return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
   19828         }
   19829 
   19830         // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
   19831         if (VT == MVT::v16i8 && Subtarget.hasXOP())
   19832           return SDValue();
   19833 
   19834         if (Op.getOpcode() == ISD::SHL) {
   19835           // Make a large shift.
   19836           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
   19837                                                    R, ShiftAmt, DAG);
   19838           SHL = DAG.getBitcast(VT, SHL);
   19839           // Zero out the rightmost bits.
   19840           return DAG.getNode(ISD::AND, dl, VT, SHL,
   19841                              DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
   19842         }
   19843         if (Op.getOpcode() == ISD::SRL) {
   19844           // Make a large shift.
   19845           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
   19846                                                    R, ShiftAmt, DAG);
   19847           SRL = DAG.getBitcast(VT, SRL);
   19848           // Zero out the leftmost bits.
   19849           return DAG.getNode(ISD::AND, dl, VT, SRL,
   19850                              DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
   19851         }
   19852         if (Op.getOpcode() == ISD::SRA) {
   19853           // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
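                   // Worked example (illustrative): for ShiftAmt == 3 and an
                   // element 0xF0 (-16), lshr gives 0x1E, Mask == 0x10, the xor
                   // gives 0x0E and the subtract gives 0xFE == -2 == -16 >> 3.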
   19854           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   19855 
   19856           SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
   19857           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
   19858           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
   19859           return Res;
   19860         }
   19861         llvm_unreachable("Unknown shift opcode.");
   19862       }
   19863     }
   19864   }
   19865 
   19866   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   19867   if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
   19868       (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
   19869 
   19870     // Peek through any splat that was introduced for i64 shift vectorization.
   19871     int SplatIndex = -1;
   19872     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
   19873       if (SVN->isSplat()) {
   19874         SplatIndex = SVN->getSplatIndex();
   19875         Amt = Amt.getOperand(0);
   19876         assert(SplatIndex < (int)VT.getVectorNumElements() &&
   19877                "Splat shuffle referencing second operand");
   19878       }
   19879 
   19880     if (Amt.getOpcode() != ISD::BITCAST ||
   19881         Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
   19882       return SDValue();
   19883 
   19884     Amt = Amt.getOperand(0);
   19885     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   19886                      VT.getVectorNumElements();
   19887     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
   19888     uint64_t ShiftAmt = 0;
   19889     unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
   19890     for (unsigned i = 0; i != Ratio; ++i) {
   19891       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
   19892       if (!C)
   19893         return SDValue();
   19894       // 6 == Log2(64)
   19895       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
   19896     }
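             // For example (illustrative): a v2i64 shift amount written as a
             // v4i32 build_vector has Ratio == 2 and RatioInLog2 == 1, so element
             // i lands at bit offset i * 32 of the reconstructed 64-bit amount.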
   19897 
   19898     // Check remaining shift amounts (if not a splat).
   19899     if (SplatIndex < 0) {
   19900       for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   19901         uint64_t ShAmt = 0;
   19902         for (unsigned j = 0; j != Ratio; ++j) {
   19903           ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
   19904           if (!C)
   19905             return SDValue();
   19906           // 6 == Log2(64)
   19907           ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
   19908         }
   19909         if (ShAmt != ShiftAmt)
   19910           return SDValue();
   19911       }
   19912     }
   19913 
   19914     if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
   19915       return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
   19916 
   19917     if (Op.getOpcode() == ISD::SRA)
   19918       return ArithmeticShiftRight64(ShiftAmt);
   19919   }
   19920 
   19921   return SDValue();
   19922 }
   19923 
   19924 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   19925                                         const X86Subtarget &Subtarget) {
   19926   MVT VT = Op.getSimpleValueType();
   19927   SDLoc dl(Op);
   19928   SDValue R = Op.getOperand(0);
   19929   SDValue Amt = Op.getOperand(1);
   19930 
   19931   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
   19932     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
   19933 
   19934   unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
   19935     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
   19936 
   19937   if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
   19938     SDValue BaseShAmt;
   19939     MVT EltVT = VT.getVectorElementType();
   19940 
   19941     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
   19942       // Check if this build_vector node is doing a splat.
   19943       // If so, then set BaseShAmt equal to the splat value.
   19944       BaseShAmt = BV->getSplatValue();
   19945       if (BaseShAmt && BaseShAmt.isUndef())
   19946         BaseShAmt = SDValue();
   19947     } else {
   19948       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
   19949         Amt = Amt.getOperand(0);
   19950 
   19951       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
   19952       if (SVN && SVN->isSplat()) {
   19953         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
   19954         SDValue InVec = Amt.getOperand(0);
   19955         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   19956           assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
   19957                  "Unexpected shuffle index found!");
   19958           BaseShAmt = InVec.getOperand(SplatIdx);
   19959         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
   19960            if (ConstantSDNode *C =
   19961                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
   19962              if (C->getZExtValue() == SplatIdx)
   19963                BaseShAmt = InVec.getOperand(1);
   19964            }
   19965         }
   19966 
   19967         if (!BaseShAmt)
   19968           // Avoid introducing an extract element from a shuffle.
   19969           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
   19970                                   DAG.getIntPtrConstant(SplatIdx, dl));
   19971       }
   19972     }
   19973 
   19974     if (BaseShAmt.getNode()) {
   19975       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
   19976       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
   19977         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
   19978       else if (EltVT.bitsLT(MVT::i32))
   19979         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
   19980 
   19981       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
   19982     }
   19983   }
   19984 
   19985   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   19986   if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
   19987       Amt.getOpcode() == ISD::BITCAST &&
   19988       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   19989     Amt = Amt.getOperand(0);
   19990     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   19991                      VT.getVectorNumElements();
   19992     std::vector<SDValue> Vals(Ratio);
   19993     for (unsigned i = 0; i != Ratio; ++i)
   19994       Vals[i] = Amt.getOperand(i);
   19995     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   19996       for (unsigned j = 0; j != Ratio; ++j)
   19997         if (Vals[j] != Amt.getOperand(i + j))
   19998           return SDValue();
   19999     }
   20000 
   20001     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
   20002       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
   20003   }
   20004   return SDValue();
   20005 }
   20006 
   20007 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   20008                           SelectionDAG &DAG) {
   20009   MVT VT = Op.getSimpleValueType();
   20010   SDLoc dl(Op);
   20011   SDValue R = Op.getOperand(0);
   20012   SDValue Amt = Op.getOperand(1);
   20013   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
   20014 
   20015   assert(VT.isVector() && "Custom lowering only for vector shifts!");
   20016   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
   20017 
   20018   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
   20019     return V;
   20020 
   20021   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
   20022     return V;
   20023 
   20024   if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
   20025     return Op;
   20026 
   20027   // XOP has 128-bit variable logical/arithmetic shifts.
   20028   // +ve/-ve Amt = shift left/right.
   20029   if (Subtarget.hasXOP() &&
   20030       (VT == MVT::v2i64 || VT == MVT::v4i32 ||
   20031        VT == MVT::v8i16 || VT == MVT::v16i8)) {
   20032     if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
   20033       SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
   20034       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
   20035     }
   20036     if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
   20037       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
   20038     if (Op.getOpcode() == ISD::SRA)
   20039       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
   20040   }
   20041 
   20042   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
   20043   // shifts per-lane and then shuffle the partial results back together.
   20044   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
   20045     // Splat the shift amounts so the scalar shifts above will catch it.
   20046     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
   20047     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
   20048     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
   20049     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
   20050     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
   20051   }
   20052 
   20053   // i64 vector arithmetic shift can be emulated with the transform:
   20054   // M = lshr(SIGN_BIT, Amt)
   20055   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
   20056   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
   20057       Op.getOpcode() == ISD::SRA) {
   20058     SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
   20059     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
   20060     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   20061     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
   20062     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
   20063     return R;
   20064   }
   20065 
   20066   // If possible, lower this packed shift into a vector multiply instead of
   20067   // expanding it into a sequence of scalar shifts.
   20068   // Do this only if the vector shift count is a constant build_vector.
   20069   if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
   20070       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
   20071        (Subtarget.hasInt256() && VT == MVT::v16i16))) {
   20072     SmallVector<SDValue, 8> Elts;
   20073     MVT SVT = VT.getVectorElementType();
   20074     unsigned SVTBits = SVT.getSizeInBits();
   20075     APInt One(SVTBits, 1);
   20076     unsigned NumElems = VT.getVectorNumElements();
   20077 
   20078     for (unsigned i = 0; i != NumElems; ++i) {
   20079       SDValue Op = Amt->getOperand(i);
   20080       if (Op->isUndef()) {
   20081         Elts.push_back(Op);
   20082         continue;
   20083       }
   20084 
   20085       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
   20086       APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
   20087       uint64_t ShAmt = C.getZExtValue();
   20088       if (ShAmt >= SVTBits) {
   20089         Elts.push_back(DAG.getUNDEF(SVT));
   20090         continue;
   20091       }
   20092       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
   20093     }
   20094     SDValue BV = DAG.getBuildVector(VT, dl, Elts);
   20095     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
   20096   }
   20097 
   20098   // Lower SHL with variable shift amount.
   20099   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
   20100     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
   20101 
   20102     Op = DAG.getNode(ISD::ADD, dl, VT, Op,
   20103                      DAG.getConstant(0x3f800000U, dl, VT));
   20104     Op = DAG.getBitcast(MVT::v4f32, Op);
   20105     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
   20106     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   20107   }
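           // The sequence above builds 2^Amt per lane: shifting Amt into the f32
           // exponent field (bit 23) and adding the bias 0x3f800000 (127 << 23)
           // yields the float 1.0 * 2^Amt, and FP_TO_SINT recovers the integer
           // power of two. Scalar sketch (illustrative only, assumes Amt < 31):
           //   uint32_t Pow2(uint32_t Amt) {
           //     uint32_t Bits = (Amt << 23) + 0x3f800000u;
           //     float F;
           //     memcpy(&F, &Bits, sizeof(F));   // bitcast to f32
           //     return uint32_t(F);             // == 1u << Amt
           //   }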
   20108 
   20109   // If possible, lower this shift as a sequence of two shifts by
   20110   // constant plus a MOVSS/MOVSD instead of scalarizing it.
   20111   // Example:
   20112   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
   20113   //
   20114   // Could be rewritten as:
   20115   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
   20116   //
   20117   // The advantage is that the two shifts from the example would be
   20118   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
   20119   // the vector shift into four scalar shifts plus four pairs of vector
   20120   // insert/extract.
   20121   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
   20122     unsigned TargetOpcode = X86ISD::MOVSS;
   20123     bool CanBeSimplified;
   20124     // The splat value for the first packed shift (the 'X' from the example).
   20125     SDValue Amt1 = Amt->getOperand(0);
   20126     // The splat value for the second packed shift (the 'Y' from the example).
   20127     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
   20128 
   20129     // See if it is possible to replace this node with a sequence of
   20130     // two shifts followed by a MOVSS/MOVSD
   20131     if (VT == MVT::v4i32) {
   20132       // Check if it is legal to use a MOVSS.
   20133       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
   20134                         Amt2 == Amt->getOperand(3);
   20135       if (!CanBeSimplified) {
   20136         // Otherwise, check if we can still simplify this node using a MOVSD.
   20137         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
   20138                           Amt->getOperand(2) == Amt->getOperand(3);
   20139         TargetOpcode = X86ISD::MOVSD;
   20140         Amt2 = Amt->getOperand(2);
   20141       }
   20142     } else {
   20143       // Do similar checks for the case where the machine value type
   20144       // is MVT::v8i16.
   20145       CanBeSimplified = Amt1 == Amt->getOperand(1);
   20146       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
   20147         CanBeSimplified = Amt2 == Amt->getOperand(i);
   20148 
   20149       if (!CanBeSimplified) {
   20150         TargetOpcode = X86ISD::MOVSD;
   20151         CanBeSimplified = true;
   20152         Amt2 = Amt->getOperand(4);
   20153         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
   20154           CanBeSimplified = Amt1 == Amt->getOperand(i);
   20155         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
   20156           CanBeSimplified = Amt2 == Amt->getOperand(j);
   20157       }
   20158     }
   20159 
   20160     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
   20161         isa<ConstantSDNode>(Amt2)) {
   20162       // Replace this node with two shifts followed by a MOVSS/MOVSD.
   20163       MVT CastVT = MVT::v4i32;
   20164       SDValue Splat1 =
   20165         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
   20166       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
   20167       SDValue Splat2 =
   20168         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
   20169       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
   20170       if (TargetOpcode == X86ISD::MOVSD)
   20171         CastVT = MVT::v2i64;
   20172       SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
   20173       SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
   20174       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
   20175                                             BitCast1, DAG);
   20176       return DAG.getBitcast(VT, Result);
   20177     }
   20178   }
   20179 
   20180   // v4i32 Non Uniform Shifts.
   20181   // If the shift amount is constant we can shift each lane using the SSE2
   20182   // immediate shifts, else we need to zero-extend each lane to the lower i64
   20183   // and shift using the SSE2 variable shifts.
   20184   // The separate results can then be blended together.
   20185   if (VT == MVT::v4i32) {
   20186     unsigned Opc = Op.getOpcode();
   20187     SDValue Amt0, Amt1, Amt2, Amt3;
   20188     if (ConstantAmt) {
   20189       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
   20190       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
   20191       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
   20192       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
   20193     } else {
   20194       // ISD::SHL is handled above but we include it here for completeness.
   20195       switch (Opc) {
   20196       default:
   20197         llvm_unreachable("Unknown target vector shift node");
   20198       case ISD::SHL:
   20199         Opc = X86ISD::VSHL;
   20200         break;
   20201       case ISD::SRL:
   20202         Opc = X86ISD::VSRL;
   20203         break;
   20204       case ISD::SRA:
   20205         Opc = X86ISD::VSRA;
   20206         break;
   20207       }
   20208       // The SSE2 shifts use the lower i64 as the same shift amount for
   20209       // all lanes and the upper i64 is ignored. These shuffle masks
   20210       // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
   20211       SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
   20212       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
   20213       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
   20214       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
   20215       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
   20216     }
   20217 
   20218     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
   20219     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
   20220     SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
   20221     SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
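             // The blends below pick lane 0 from R0 and lane 2 from R2 (R02), and
             // lane 1 from R1 and lane 3 from R3 (R13); the final shuffle then
             // interleaves them so result lane i comes from Ri.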
   20222     SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
   20223     SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
   20224     return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
   20225   }
   20226 
   20227   if (VT == MVT::v16i8 ||
   20228       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
   20229     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
   20230     unsigned ShiftOpcode = Op->getOpcode();
   20231 
   20232     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
   20233       // On SSE41 targets we make use of the fact that VSELECT lowers
   20234       // to PBLENDVB which selects bytes based just on the sign bit.
   20235       if (Subtarget.hasSSE41()) {
   20236         V0 = DAG.getBitcast(VT, V0);
   20237         V1 = DAG.getBitcast(VT, V1);
   20238         Sel = DAG.getBitcast(VT, Sel);
   20239         return DAG.getBitcast(SelVT,
   20240                               DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
   20241       }
   20242       // On pre-SSE41 targets we test for the sign bit by comparing to
   20243       // zero - a negative value will set all bits of the lanes to true
   20244       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
   20245       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
   20246       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
   20247       return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
   20248     };
   20249 
   20250     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
   20251     // We can safely do this using i16 shifts as we're only interested in
   20252     // the 3 lower bits of each byte.
   20253     Amt = DAG.getBitcast(ExtVT, Amt);
   20254     Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
   20255     Amt = DAG.getBitcast(VT, Amt);
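             // Worked example (illustrative): for a byte shift amount of 5
             // (0b101), Amt << 5 places bit 2 in the sign bit, so the first
             // select applies the shift-by-4 step; after Amt += Amt the clear
             // bit 1 skips the shift-by-2 step; after doubling again bit 0
             // applies the final shift-by-1 step, for a total shift of 5.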
   20256 
   20257     if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
   20258       // r = VSELECT(r, shift(r, 4), a);
   20259       SDValue M =
   20260           DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
   20261       R = SignBitSelect(VT, Amt, M, R);
   20262 
   20263       // a += a
   20264       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   20265 
   20266       // r = VSELECT(r, shift(r, 2), a);
   20267       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
   20268       R = SignBitSelect(VT, Amt, M, R);
   20269 
   20270       // a += a
   20271       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   20272 
   20273       // return VSELECT(r, shift(r, 1), a);
   20274       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
   20275       R = SignBitSelect(VT, Amt, M, R);
   20276       return R;
   20277     }
   20278 
   20279     if (Op->getOpcode() == ISD::SRA) {
   20280       // For SRA we need to unpack each byte to the higher byte of an i16 vector
   20281       // so we can correctly sign extend. We don't care what happens to the
   20282       // lower byte.
   20283       SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
   20284       SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
   20285       SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
   20286       SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
   20287       ALo = DAG.getBitcast(ExtVT, ALo);
   20288       AHi = DAG.getBitcast(ExtVT, AHi);
   20289       RLo = DAG.getBitcast(ExtVT, RLo);
   20290       RHi = DAG.getBitcast(ExtVT, RHi);
   20291 
   20292       // r = VSELECT(r, shift(r, 4), a);
   20293       SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   20294                                 DAG.getConstant(4, dl, ExtVT));
   20295       SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   20296                                 DAG.getConstant(4, dl, ExtVT));
   20297       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   20298       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   20299 
   20300       // a += a
   20301       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
   20302       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
   20303 
   20304       // r = VSELECT(r, shift(r, 2), a);
   20305       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   20306                         DAG.getConstant(2, dl, ExtVT));
   20307       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   20308                         DAG.getConstant(2, dl, ExtVT));
   20309       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   20310       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   20311 
   20312       // a += a
   20313       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
   20314       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
   20315 
   20316       // r = VSELECT(r, shift(r, 1), a);
   20317       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   20318                         DAG.getConstant(1, dl, ExtVT));
   20319       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   20320                         DAG.getConstant(1, dl, ExtVT));
   20321       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   20322       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   20323 
   20324       // Logical shift the result back to the lower byte, leaving a zero
   20325       // upper byte, meaning that we can safely pack the results with
   20326       // PACKUSWB.
   20327       RLo =
   20328           DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
   20329       RHi =
   20330           DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
   20331       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
   20332     }
   20333   }
   20334 
   20335   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
   20336   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
   20337   // solution better.
   20338   if (Subtarget.hasInt256() && VT == MVT::v8i16) {
   20339     MVT ExtVT = MVT::v8i32;
   20340     unsigned ExtOpc =
   20341         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
   20342     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
   20343     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
   20344     return DAG.getNode(ISD::TRUNCATE, dl, VT,
   20345                        DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
   20346   }
   20347 
   20348   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
   20349     MVT ExtVT = MVT::v8i32;
   20350     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
   20351     SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
   20352     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
   20353     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
   20354     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
   20355     ALo = DAG.getBitcast(ExtVT, ALo);
   20356     AHi = DAG.getBitcast(ExtVT, AHi);
   20357     RLo = DAG.getBitcast(ExtVT, RLo);
   20358     RHi = DAG.getBitcast(ExtVT, RHi);
   20359     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
   20360     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
   20361     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
   20362     Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
   20363     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
   20364   }
   20365 
   20366   if (VT == MVT::v8i16) {
   20367     unsigned ShiftOpcode = Op->getOpcode();
   20368 
   20369     // If we have a constant shift amount, the non-SSE41 path is best as
   20370     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
   20371     bool UseSSE41 = Subtarget.hasSSE41() &&
   20372                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
   20373 
   20374     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
   20375       // On SSE41 targets we make use of the fact that VSELECT lowers
   20376       // to PBLENDVB which selects bytes based just on the sign bit.
   20377       if (UseSSE41) {
   20378         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
   20379         V0 = DAG.getBitcast(ExtVT, V0);
   20380         V1 = DAG.getBitcast(ExtVT, V1);
   20381         Sel = DAG.getBitcast(ExtVT, Sel);
   20382         return DAG.getBitcast(
   20383             VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
   20384       }
   20385       // On pre-SSE41 targets we splat the sign bit - a negative value will
   20386       // set all bits of the lanes to true and VSELECT uses that in
   20387       // its OR(AND(V0,C),AND(V1,~C)) lowering.
   20388       SDValue C =
   20389           DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
   20390       return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
   20391     };
   20392 
   20393     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
   20394     if (UseSSE41) {
   20395       // On SSE41 targets we need to replicate the shift mask in both
   20396       // bytes for PBLENDVB.
   20397       Amt = DAG.getNode(
   20398           ISD::OR, dl, VT,
   20399           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
   20400           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
   20401     } else {
   20402       Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
   20403     }
   20404 
   20405     // r = VSELECT(r, shift(r, 8), a);
   20406     SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
   20407     R = SignBitSelect(Amt, M, R);
   20408 
   20409     // a += a
   20410     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   20411 
   20412     // r = VSELECT(r, shift(r, 4), a);
   20413     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
   20414     R = SignBitSelect(Amt, M, R);
   20415 
   20416     // a += a
   20417     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   20418 
   20419     // r = VSELECT(r, shift(r, 2), a);
   20420     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
   20421     R = SignBitSelect(Amt, M, R);
   20422 
   20423     // a += a
   20424     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   20425 
   20426     // return VSELECT(r, shift(r, 1), a);
   20427     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
   20428     R = SignBitSelect(Amt, M, R);
   20429     return R;
   20430   }
   20431 
   20432   // Decompose 256-bit shifts into smaller 128-bit shifts.
   20433   if (VT.is256BitVector())
   20434     return Lower256IntArith(Op, DAG);
   20435 
   20436   return SDValue();
   20437 }
   20438 
   20439 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
   20440                            SelectionDAG &DAG) {
   20441   MVT VT = Op.getSimpleValueType();
   20442   SDLoc DL(Op);
   20443   SDValue R = Op.getOperand(0);
   20444   SDValue Amt = Op.getOperand(1);
   20445 
   20446   assert(VT.isVector() && "Custom lowering only for vector rotates!");
   20447   assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
   20448   assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
   20449 
   20450   // XOP has 128-bit vector variable + immediate rotates.
   20451   // +ve/-ve Amt = rotate left/right.
   20452 
   20453   // Split 256-bit integers.
   20454   if (VT.is256BitVector())
   20455     return Lower256IntArith(Op, DAG);
   20456 
   20457   assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
   20458 
   20459   // Attempt to rotate by immediate.
   20460   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   20461     if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
   20462       uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
   20463       assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
   20464       return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
   20465                          DAG.getConstant(RotateAmt, DL, MVT::i8));
   20466     }
   20467   }
   20468 
   20469   // Use general rotate by variable (per-element).
   20470   return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
   20471 }
   20472 
   20473 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   20474   // Lower the "add/sub/mul with overflow" instruction into a regular
   20475   // instruction plus a "setcc" instruction that checks the overflow flag. The
   20476   // "brcond" lowering looks for this combo and may remove the "setcc"
   20477   // instruction if the "setcc" has only one use.
   20478   SDNode *N = Op.getNode();
   20479   SDValue LHS = N->getOperand(0);
   20480   SDValue RHS = N->getOperand(1);
   20481   unsigned BaseOp = 0;
   20482   unsigned Cond = 0;
   20483   SDLoc DL(Op);
   20484   switch (Op.getOpcode()) {
   20485   default: llvm_unreachable("Unknown ovf instruction!");
   20486   case ISD::SADDO:
   20487     // An add of one will be selected as an INC. Note that INC doesn't
   20488     // set CF, so we can't do this for UADDO.
   20489     if (isOneConstant(RHS)) {
   20490       BaseOp = X86ISD::INC;
   20491       Cond = X86::COND_O;
   20492       break;
   20493     }
   20494     BaseOp = X86ISD::ADD;
   20495     Cond = X86::COND_O;
   20496     break;
   20497   case ISD::UADDO:
   20498     BaseOp = X86ISD::ADD;
   20499     Cond = X86::COND_B;
   20500     break;
   20501   case ISD::SSUBO:
   20502     // A subtract of one will be selected as a DEC. Note that DEC doesn't
   20503     // set CF, so we can't do this for USUBO.
   20504     if (isOneConstant(RHS)) {
   20505       BaseOp = X86ISD::DEC;
   20506       Cond = X86::COND_O;
   20507       break;
   20508     }
   20509     BaseOp = X86ISD::SUB;
   20510     Cond = X86::COND_O;
   20511     break;
   20512   case ISD::USUBO:
   20513     BaseOp = X86ISD::SUB;
   20514     Cond = X86::COND_B;
   20515     break;
   20516   case ISD::SMULO:
   20517     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
   20518     Cond = X86::COND_O;
   20519     break;
   20520   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
   20521     if (N->getValueType(0) == MVT::i8) {
   20522       BaseOp = X86ISD::UMUL8;
   20523       Cond = X86::COND_O;
   20524       break;
   20525     }
   20526     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
   20527                                  MVT::i32);
   20528     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
   20529 
   20530     SDValue SetCC =
   20531       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   20532                   DAG.getConstant(X86::COND_O, DL, MVT::i32),
   20533                   SDValue(Sum.getNode(), 2));
   20534 
   20535     if (N->getValueType(1) == MVT::i1) {
   20536       SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
   20537                           DAG.getValueType(MVT::i1));
   20538       SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
   20539     }
   20540     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   20541   }
   20542   }
   20543 
   20544   // Also sets EFLAGS.
   20545   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
   20546   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
   20547 
   20548   SDValue SetCC =
   20549     DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   20550                 DAG.getConstant(Cond, DL, MVT::i32),
   20551                 SDValue(Sum.getNode(), 1));
   20552 
   20553   if (N->getValueType(1) == MVT::i1) {
   20554     SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
   20555                         DAG.getValueType(MVT::i1));
   20556     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
   20557   }
   20558   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   20559 }
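         // For example (illustrative IR using the standard overflow intrinsics):
         //   %s = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
         // becomes an X86ISD::ADD that also produces EFLAGS, followed by an
         // X86ISD::SETCC on X86::COND_B that materializes the i1 overflow bit.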
   20560 
   20561 /// Returns true if the operand type is exactly twice the native width, and
   20562 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
   20563 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
   20564 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
   20565 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
   20566   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
   20567 
   20568   if (OpWidth == 64)
   20569     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
   20570   else if (OpWidth == 128)
   20571     return Subtarget.hasCmpxchg16b();
   20572   else
   20573     return false;
   20574 }
   20575 
   20576 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   20577   return needsCmpXchgNb(SI->getValueOperand()->getType());
   20578 }
   20579 
   20580 // Note: this turns large loads into lock cmpxchg8b/16b.
   20581 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
   20582 TargetLowering::AtomicExpansionKind
   20583 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   20584   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
   20585   return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
   20586                                                : AtomicExpansionKind::None;
   20587 }
   20588 
   20589 TargetLowering::AtomicExpansionKind
   20590 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   20591   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
   20592   Type *MemType = AI->getType();
   20593 
   20594   // If the operand is too big, we must see if cmpxchg8/16b is available
   20595   // and default to library calls otherwise.
   20596   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
   20597     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
   20598                                    : AtomicExpansionKind::None;
   20599   }
   20600 
   20601   AtomicRMWInst::BinOp Op = AI->getOperation();
   20602   switch (Op) {
   20603   default:
   20604     llvm_unreachable("Unknown atomic operation");
   20605   case AtomicRMWInst::Xchg:
   20606   case AtomicRMWInst::Add:
   20607   case AtomicRMWInst::Sub:
   20608     // It's better to use xadd, xsub or xchg for these in all cases.
   20609     return AtomicExpansionKind::None;
   20610   case AtomicRMWInst::Or:
   20611   case AtomicRMWInst::And:
   20612   case AtomicRMWInst::Xor:
   20613     // If the atomicrmw's result isn't actually used, we can just add a "lock"
   20614     // prefix to a normal instruction for these operations.
   20615     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
   20616                             : AtomicExpansionKind::None;
   20617   case AtomicRMWInst::Nand:
   20618   case AtomicRMWInst::Max:
   20619   case AtomicRMWInst::Min:
   20620   case AtomicRMWInst::UMax:
   20621   case AtomicRMWInst::UMin:
   20622     // These always require a non-trivial set of data operations on x86. We must
   20623     // use a cmpxchg loop.
   20624     return AtomicExpansionKind::CmpXChg;
   20625   }
   20626 }
   20627 
   20628 LoadInst *
   20629 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   20630   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
   20631   Type *MemType = AI->getType();
   20632   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
   20633   // there is no benefit in turning such RMWs into loads, and it is actually
   20634   // harmful as it introduces an mfence.
   20635   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
   20636     return nullptr;
   20637 
   20638   auto Builder = IRBuilder<>(AI);
   20639   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   20640   auto SynchScope = AI->getSynchScope();
   20641   // We must restrict the ordering to avoid generating loads with Release or
   20642   // ReleaseAcquire orderings.
   20643   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
   20644   auto Ptr = AI->getPointerOperand();
   20645 
   20646   // Before the load we need a fence. Here is an example lifted from
   20647   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
   20648   // is required:
   20649   // Thread 0:
   20650   //   x.store(1, relaxed);
   20651   //   r1 = y.fetch_add(0, release);
   20652   // Thread 1:
   20653   //   y.fetch_add(42, acquire);
   20654   //   r2 = x.load(relaxed);
   20655   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
   20656   // lowered to just a load without a fence. An mfence flushes the store
   20657   // buffer, making the optimization clearly correct.
   20658   // FIXME: the fence is required if isReleaseOrStronger(Order), but it is
   20659   // not clear otherwise; we might be able to be more aggressive on relaxed
   20660   // idempotent rmws. In practice, they do not look useful, so we don't try
   20661   // to be especially clever.
   20662   if (SynchScope == SingleThread)
   20663     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
   20664     // the IR level, so we must wrap it in an intrinsic.
   20665     return nullptr;
   20666 
   20667   if (!Subtarget.hasMFence())
   20668     // FIXME: it might make sense to use a locked operation here but on a
   20669     // different cache-line to prevent cache-line bouncing. In practice it
   20670     // is probably a small win, and x86 processors without mfence are rare
   20671     // enough that we do not bother.
   20672     return nullptr;
   20673 
   20674   Function *MFence =
   20675       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
   20676   Builder.CreateCall(MFence, {});
   20677 
   20678   // Finally we can emit the atomic load.
   20679   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
   20680           AI->getType()->getPrimitiveSizeInBits());
   20681   Loaded->setAtomic(Order, SynchScope);
   20682   AI->replaceAllUsesWith(Loaded);
   20683   AI->eraseFromParent();
   20684   return Loaded;
   20685 }
   20686 
   20687 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
   20688                                  SelectionDAG &DAG) {
   20689   SDLoc dl(Op);
   20690   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
   20691     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   20692   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
   20693     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   20694 
   20695   // The only fence that needs an instruction is a sequentially-consistent
   20696   // cross-thread fence.
   20697   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
   20698       FenceScope == CrossThread) {
   20699     if (Subtarget.hasMFence())
   20700       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
   20701 
   20702     SDValue Chain = Op.getOperand(0);
   20703     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
   20704     SDValue Ops[] = {
   20705       DAG.getRegister(X86::ESP, MVT::i32),     // Base
   20706       DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
   20707       DAG.getRegister(0, MVT::i32),            // Index
   20708       DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
   20709       DAG.getRegister(0, MVT::i32),            // Segment.
   20710       Zero,
   20711       Chain
   20712     };
   20713     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
   20714     return SDValue(Res, 0);
   20715   }
   20716 
   20717   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
   20718   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
   20719 }
   20720 
   20721 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
   20722                              SelectionDAG &DAG) {
   20723   MVT T = Op.getSimpleValueType();
   20724   SDLoc DL(Op);
   20725   unsigned Reg = 0;
   20726   unsigned size = 0;
   20727   switch(T.SimpleTy) {
   20728   default: llvm_unreachable("Invalid value type!");
   20729   case MVT::i8:  Reg = X86::AL;  size = 1; break;
   20730   case MVT::i16: Reg = X86::AX;  size = 2; break;
   20731   case MVT::i32: Reg = X86::EAX; size = 4; break;
   20732   case MVT::i64:
   20733     assert(Subtarget.is64Bit() && "Node not type legal!");
   20734     Reg = X86::RAX; size = 8;
   20735     break;
   20736   }
   20737   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
   20738                                   Op.getOperand(2), SDValue());
   20739   SDValue Ops[] = { cpIn.getValue(0),
   20740                     Op.getOperand(1),
   20741                     Op.getOperand(3),
   20742                     DAG.getTargetConstant(size, DL, MVT::i8),
   20743                     cpIn.getValue(1) };
   20744   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   20745   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   20746   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
   20747                                            Ops, T, MMO);
   20748 
   20749   SDValue cpOut =
   20750     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   20751   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
   20752                                       MVT::i32, cpOut.getValue(2));
   20753   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
   20754                                 DAG.getConstant(X86::COND_E, DL, MVT::i8),
   20755                                 EFLAGS);
   20756 
   20757   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
   20758   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
   20759   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
   20760   return SDValue();
   20761 }
   20762 
   20763 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
   20764                             SelectionDAG &DAG) {
   20765   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
   20766   MVT DstVT = Op.getSimpleValueType();
   20767 
   20768   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
   20769       SrcVT == MVT::i64) {
   20770     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   20771     if (DstVT != MVT::f64)
   20772       // This conversion needs to be expanded.
   20773       return SDValue();
   20774 
   20775     SDValue Op0 = Op->getOperand(0);
   20776     SmallVector<SDValue, 16> Elts;
   20777     SDLoc dl(Op);
   20778     unsigned NumElts;
   20779     MVT SVT;
   20780     if (SrcVT.isVector()) {
   20781       NumElts = SrcVT.getVectorNumElements();
   20782       SVT = SrcVT.getVectorElementType();
   20783 
   20784       // Widen the input vector in the case of MVT::v2i32.
   20785       // Example: from MVT::v2i32 to MVT::v4i32.
   20786       for (unsigned i = 0, e = NumElts; i != e; ++i)
   20787         Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
   20788                                    DAG.getIntPtrConstant(i, dl)));
   20789     } else {
   20790       assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
   20791              "Unexpected source type in LowerBITCAST");
   20792       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
   20793                                  DAG.getIntPtrConstant(0, dl)));
   20794       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
   20795                                  DAG.getIntPtrConstant(1, dl)));
   20796       NumElts = 2;
   20797       SVT = MVT::i32;
   20798     }
   20799     // Explicitly mark the extra elements as Undef.
   20800     Elts.append(NumElts, DAG.getUNDEF(SVT));
   20801 
   20802     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   20803     SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
   20804     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
   20805     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
   20806                        DAG.getIntPtrConstant(0, dl));
   20807   }
   20808 
   20809   assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
   20810          Subtarget.hasMMX() && "Unexpected custom BITCAST");
   20811   assert((DstVT == MVT::i64 ||
   20812           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
   20813          "Unexpected custom BITCAST");
   20814   // i64 <=> MMX conversions are Legal.
   20815   if (SrcVT==MVT::i64 && DstVT.isVector())
   20816     return Op;
   20817   if (DstVT==MVT::i64 && SrcVT.isVector())
   20818     return Op;
   20819   // MMX <=> MMX conversions are Legal.
   20820   if (SrcVT.isVector() && DstVT.isVector())
   20821     return Op;
   20822   // All other conversions need to be expanded.
   20823   return SDValue();
   20824 }
   20825 
   20826 /// Compute the horizontal sum of bytes in V for the elements of VT.
   20827 ///
   20828 /// Requires V to be a byte vector and VT to be an integer vector type with
   20829 /// wider elements than V's type. The width of the elements of VT determines
   20830 /// how many bytes of V are summed horizontally to produce each element of the
   20831 /// result.
   20832 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
   20833                                       const X86Subtarget &Subtarget,
   20834                                       SelectionDAG &DAG) {
   20835   SDLoc DL(V);
   20836   MVT ByteVecVT = V.getSimpleValueType();
   20837   MVT EltVT = VT.getVectorElementType();
   20838   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
   20839          "Expected value to have byte element type.");
   20840   assert(EltVT != MVT::i8 &&
   20841          "Horizontal byte sum only makes sense for wider elements!");
   20842   unsigned VecSize = VT.getSizeInBits();
   20843   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
   20844 
   20845   // The PSADBW instruction horizontally adds all bytes and leaves the result
   20846   // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
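           // For example, if one i64 lane of V holds the byte counts
           // {1, 0, 3, 2, 8, 0, 4, 1}, PSADBW against zero sums them to 19,
           // which is that lane's pop count.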
   20847   if (EltVT == MVT::i64) {
   20848     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
   20849     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
   20850     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
   20851     return DAG.getBitcast(VT, V);
   20852   }
   20853 
   20854   if (EltVT == MVT::i32) {
   20855     // We unpack the low half and high half into i32s interleaved with zeros so
   20856     // that we can use PSADBW to horizontally sum them. The most useful part of
   20857     // this is that it lines up the results of two PSADBW instructions to be
   20858     // two v2i64 vectors which concatenated are the 4 population counts. We can
   20859     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
   20860     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
   20861     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
   20862     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
   20863 
   20864     // Do the horizontal sums into two v2i64s.
   20865     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
   20866     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
   20867     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
   20868                       DAG.getBitcast(ByteVecVT, Low), Zeros);
   20869     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
   20870                        DAG.getBitcast(ByteVecVT, High), Zeros);
   20871 
   20872     // Merge them together.
   20873     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
   20874     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
   20875                     DAG.getBitcast(ShortVecVT, Low),
   20876                     DAG.getBitcast(ShortVecVT, High));
   20877 
   20878     return DAG.getBitcast(VT, V);
   20879   }
   20880 
   20881   // The only element type left is i16.
   20882   assert(EltVT == MVT::i16 && "Unknown how to handle type");
   20883 
   20884   // To obtain the pop count for each i16 element starting from the pop count
   20885   // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift as
   20886   // i16s right by 8. It is important to shift as i16s because an i8 vector
   20887   // shift isn't directly supported.
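           // For example, a 16-bit lane holding the byte counts 0x0503 (3 low,
           // 5 high) becomes 0x0300 after the shift left, 0x0803 after the
           // byte-wise add, and 0x0008 = 8 after the shift right.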
   20888   SDValue ShifterV = DAG.getConstant(8, DL, VT);
   20889   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
   20890   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
   20891                   DAG.getBitcast(ByteVecVT, V));
   20892   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
   20893 }
   20894 
   20895 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
   20896                                         const X86Subtarget &Subtarget,
   20897                                         SelectionDAG &DAG) {
   20898   MVT VT = Op.getSimpleValueType();
   20899   MVT EltVT = VT.getVectorElementType();
   20900   unsigned VecSize = VT.getSizeInBits();
   20901 
   20902   // Implement a lookup table in register by using an algorithm based on:
   20903   // http://wm.ite.pl/articles/sse-popcount.html
   20904   //
   20905   // The general idea is that every lower byte nibble in the input vector is an
   20906   // index into an in-register pre-computed pop count table. We then split the
   20907   // input vector into two new ones: (1) a vector with only the shifted-right
   20908   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
   20909   // masked out higher ones) for each byte. PSHUFB is used separately with both
   20910   // to index the in-register table. Next, both are added and the result is an
   20911   // i8 vector where each element contains the pop count for its input byte.
   20912   //
   20913   // To obtain the pop count for elements != i8, we follow up with the same
   20914   // approach and use additional tricks as described below.
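           //
           // For example, an input byte of 0xB7 (0b10110111) is split into the
           // high nibble 0xB and the low nibble 0x7; LUT[0xB] = 3 and
           // LUT[0x7] = 3, so their sum, 6, is popcount(0xB7).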
   20915   //
   20916   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
   20917                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
   20918                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
   20919                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
   20920 
   20921   int NumByteElts = VecSize / 8;
   20922   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
   20923   SDValue In = DAG.getBitcast(ByteVecVT, Op);
   20924   SmallVector<SDValue, 64> LUTVec;
   20925   for (int i = 0; i < NumByteElts; ++i)
   20926     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
   20927   SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
   20928   SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
   20929 
   20930   // High nibbles
   20931   SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
   20932   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
   20933 
   20934   // Low nibbles
   20935   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
   20936 
   20937   // The input vector is used as the shuffle mask that indexes elements into
   20938   // the LUT. After counting low and high nibbles, add the two vectors to
   20939   // obtain the final pop count per i8 element.
   20940   SDValue HighPopCnt =
   20941       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
   20942   SDValue LowPopCnt =
   20943       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
   20944   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
   20945 
   20946   if (EltVT == MVT::i8)
   20947     return PopCnt;
   20948 
   20949   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
   20950 }
   20951 
   20952 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
   20953                                        const X86Subtarget &Subtarget,
   20954                                        SelectionDAG &DAG) {
   20955   MVT VT = Op.getSimpleValueType();
   20956   assert(VT.is128BitVector() &&
   20957          "Only 128-bit vector bitmath lowering supported.");
   20958 
   20959   int VecSize = VT.getSizeInBits();
   20960   MVT EltVT = VT.getVectorElementType();
   20961   int Len = EltVT.getSizeInBits();
   20962 
   20963   // This is the vectorized version of the "best" algorithm from
   20964   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   20965   // with a minor tweak to use a series of adds + shifts instead of vector
   20966   // multiplications. Implemented for all integer vector types. We only use
   20967   // this when we don't have SSSE3 which allows a LUT-based lowering that is
   20968   // much faster, even faster than using native popcnt instructions.
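           //
           // As a scalar sketch of the same three steps on a single byte
           // (standard C++, shown only for illustration):
           //   uint8_t v = 0xDA;                     // 0b11011010, 5 set bits
           //   v = v - ((v >> 1) & 0x55);            // 0xDA -> 0x95, 2-bit sums
           //   v = (v & 0x33) + ((v >> 2) & 0x33);   // 0x95 -> 0x32, nibble sums
           //   v = (v + (v >> 4)) & 0x0F;            // 0x32 -> 0x05 == popcount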
   20969 
   20970   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
   20971     MVT VT = V.getSimpleValueType();
   20972     SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
   20973     return DAG.getNode(OpCode, DL, VT, V, ShifterV);
   20974   };
   20975   auto GetMask = [&](SDValue V, APInt Mask) {
   20976     MVT VT = V.getSimpleValueType();
   20977     SDValue MaskV = DAG.getConstant(Mask, DL, VT);
   20978     return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
   20979   };
   20980 
   20981   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
   20982   // x86, so set the SRL type to have elements at least i16 wide. This is
   20983   // correct because all of our SRLs are followed immediately by a mask anyway
   20984   // that handles any bits that sneak into the high bits of the byte elements.
   20985   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
   20986 
   20987   SDValue V = Op;
   20988 
   20989   // v = v - ((v >> 1) & 0x55555555...)
   20990   SDValue Srl =
   20991       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
   20992   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
   20993   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
   20994 
   20995   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
   20996   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
   20997   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
   20998   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
   20999   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
   21000 
   21001   // v = (v + (v >> 4)) & 0x0F0F0F0F...
   21002   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
   21003   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
   21004   V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
   21005 
   21006   // At this point, V contains the byte-wise population count, and we are
   21007   // merely doing a horizontal sum if necessary to get the wider element
   21008   // counts.
   21009   if (EltVT == MVT::i8)
   21010     return V;
   21011 
   21012   return LowerHorizontalByteSum(
   21013       DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
   21014       DAG);
   21015 }
   21016 
   21017 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
   21018                                 SelectionDAG &DAG) {
   21019   MVT VT = Op.getSimpleValueType();
   21020   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
   21021          "Unknown CTPOP type to handle");
   21022   SDLoc DL(Op.getNode());
   21023   SDValue Op0 = Op.getOperand(0);
   21024 
   21025   if (!Subtarget.hasSSSE3()) {
   21026     // We can't use the fast LUT approach, so fall back on vectorized bitmath.
   21027     assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
   21028     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
   21029   }
   21030 
   21031   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
   21032     unsigned NumElems = VT.getVectorNumElements();
   21033 
   21034     // Extract each 128-bit vector, compute pop count and concat the result.
   21035     SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
   21036     SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
   21037 
   21038     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
   21039                        LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
   21040                        LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
   21041   }
   21042 
   21043   if (VT.is512BitVector() && !Subtarget.hasBWI()) {
   21044     unsigned NumElems = VT.getVectorNumElements();
   21045 
   21046     // Extract each 256-bit vector, compute pop count and concat the result.
   21047     SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
   21048     SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
   21049 
   21050     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
   21051                        LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
   21052                        LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
   21053   }
   21054 
   21055   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
   21056 }
   21057 
   21058 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
   21059                           SelectionDAG &DAG) {
   21060   assert(Op.getSimpleValueType().isVector() &&
   21061          "We only do custom lowering for vector population count.");
   21062   return LowerVectorCTPOP(Op, Subtarget, DAG);
   21063 }
   21064 
   21065 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
   21066   MVT VT = Op.getSimpleValueType();
   21067   SDValue In = Op.getOperand(0);
   21068   SDLoc DL(Op);
   21069 
   21070   // For scalars, it's still beneficial to transfer to/from the SIMD unit to
   21071   // perform the BITREVERSE.
   21072   if (!VT.isVector()) {
   21073     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
   21074     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
   21075     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
   21076     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
   21077                        DAG.getIntPtrConstant(0, DL));
   21078   }
   21079 
   21080   MVT SVT = VT.getVectorElementType();
   21081   int NumElts = VT.getVectorNumElements();
   21082   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
   21083 
   21084   // Decompose 256-bit ops into smaller 128-bit ops.
   21085   if (VT.is256BitVector()) {
   21086     SDValue Lo = extract128BitVector(In, 0, DAG, DL);
   21087     SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
   21088 
   21089     MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
   21090     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
   21091                        DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
   21092                        DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
   21093   }
   21094 
   21095   assert(VT.is128BitVector() &&
   21096          "Only 128-bit vector bitreverse lowering supported.");
   21097 
   21098   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
   21099   // perform the BSWAP in the shuffle.
   21100   // It's best to shuffle using the second operand, as this will implicitly
   21101   // allow memory folding for multiple vectors.
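           // For example, with a v4i32 input (ScalarSizeInBytes == 4), element 0
           // selects source bytes 19, 18, 17, 16 of the second operand (reversed
           // for the BSWAP), each OR'd with 0x40 (2 << 5) to request the
           // bit-reversed byte.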
   21102   SmallVector<SDValue, 16> MaskElts;
   21103   for (int i = 0; i != NumElts; ++i) {
   21104     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
   21105       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
   21106       int PermuteByte = SourceByte | (2 << 5);
   21107       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
   21108     }
   21109   }
   21110 
   21111   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
   21112   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
   21113   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
   21114                     Res, Mask);
   21115   return DAG.getBitcast(VT, Res);
   21116 }
   21117 
   21118 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   21119                                SelectionDAG &DAG) {
   21120   if (Subtarget.hasXOP())
   21121     return LowerBITREVERSE_XOP(Op, DAG);
   21122 
   21123   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
   21124 
   21125   MVT VT = Op.getSimpleValueType();
   21126   SDValue In = Op.getOperand(0);
   21127   SDLoc DL(Op);
   21128 
   21129   unsigned NumElts = VT.getVectorNumElements();
   21130   assert(VT.getScalarType() == MVT::i8 &&
   21131          "Only byte vector BITREVERSE supported");
   21132 
   21133   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
   21134   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
   21135     MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
   21136     SDValue Lo = extract128BitVector(In, 0, DAG, DL);
   21137     SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
   21138     Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
   21139     Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
   21140     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   21141   }
   21142 
   21143   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
   21144   // two nibbles, and a PSHUFB lookup finds the bit-reverse of each
   21145   // 0-15 value (moved to the other nibble).
   21146   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
   21147   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
   21148   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
   21149 
   21150   const int LoLUT[16] = {
   21151       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
   21152       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
   21153       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
   21154       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
   21155   const int HiLUT[16] = {
   21156       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
   21157       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
   21158       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
   21159       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
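           // For example, an input byte of 0x2D (0b00101101) has low nibble 0xD
           // and high nibble 0x2; LoLUT[0xD] = 0xB0 and HiLUT[0x2] = 0x04, and
           // OR'ing them gives 0xB4 (0b10110100), the bit-reversed byte.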
   21160 
   21161   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
   21162   for (unsigned i = 0; i < NumElts; ++i) {
   21163     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
   21164     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
   21165   }
   21166 
   21167   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
   21168   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
   21169   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
   21170   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
   21171   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
   21172 }
   21173 
   21174 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
   21175   unsigned NewOpc = 0;
   21176   switch (N->getOpcode()) {
   21177   case ISD::ATOMIC_LOAD_ADD:
   21178     NewOpc = X86ISD::LADD;
   21179     break;
   21180   case ISD::ATOMIC_LOAD_SUB:
   21181     NewOpc = X86ISD::LSUB;
   21182     break;
   21183   case ISD::ATOMIC_LOAD_OR:
   21184     NewOpc = X86ISD::LOR;
   21185     break;
   21186   case ISD::ATOMIC_LOAD_XOR:
   21187     NewOpc = X86ISD::LXOR;
   21188     break;
   21189   case ISD::ATOMIC_LOAD_AND:
   21190     NewOpc = X86ISD::LAND;
   21191     break;
   21192   default:
   21193     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
   21194   }
   21195 
   21196   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
   21197   return DAG.getMemIntrinsicNode(
   21198       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
   21199       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
   21200       /*MemVT=*/N->getSimpleValueType(0), MMO);
   21201 }
   21202 
   21203 /// Lower atomic_load_ops into LOCK-prefixed operations.
   21204 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
   21205                                 const X86Subtarget &Subtarget) {
   21206   SDValue Chain = N->getOperand(0);
   21207   SDValue LHS = N->getOperand(1);
   21208   SDValue RHS = N->getOperand(2);
   21209   unsigned Opc = N->getOpcode();
   21210   MVT VT = N->getSimpleValueType(0);
   21211   SDLoc DL(N);
   21212 
   21213   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
   21214   // can only be lowered when the result is unused. Such ops should have
   21215   // already been transformed into a cmpxchg loop in AtomicExpand.
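           // For example, an atomic_load_add whose result is used stays as-is and
           // selects XADD, while one whose result is unused is lowered below to a
           // plain LOCK-prefixed ADD.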
   21216   if (N->hasAnyUseOfValue(0)) {
   21217     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
   21218     // select LXADD if LOCK_SUB can't be selected.
   21219     if (Opc == ISD::ATOMIC_LOAD_SUB) {
   21220       AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
   21221       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
   21222       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
   21223                            RHS, AN->getMemOperand(), AN->getOrdering(),
   21224                            AN->getSynchScope());
   21225     }
   21226     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
   21227            "Used AtomicRMW ops other than Add should have been expanded!");
   21228     return N;
   21229   }
   21230 
   21231   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
   21232   // RAUW the chain, but don't worry about the result, as it's unused.
   21233   assert(!N->hasAnyUseOfValue(0));
   21234   DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
   21235   return SDValue();
   21236 }
   21237 
   21238 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
   21239   SDNode *Node = Op.getNode();
   21240   SDLoc dl(Node);
   21241   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
   21242 
   21243   // Convert seq_cst store -> xchg
   21244   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
   21245   // FIXME: On 32-bit, store -> fist or movq would be more efficient
   21246   //        (The only way to get a 16-byte store is cmpxchg16b)
   21247   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
   21248   if (cast<AtomicSDNode>(Node)->getOrdering() ==
   21249           AtomicOrdering::SequentiallyConsistent ||
   21250       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
   21251     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
   21252                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
   21253                                  Node->getOperand(0),
   21254                                  Node->getOperand(1), Node->getOperand(2),
   21255                                  cast<AtomicSDNode>(Node)->getMemOperand(),
   21256                                  cast<AtomicSDNode>(Node)->getOrdering(),
   21257                                  cast<AtomicSDNode>(Node)->getSynchScope());
   21258     return Swap.getValue(1);
   21259   }
   21260   // Other atomic stores have a simple pattern.
   21261   return Op;
   21262 }
   21263 
   21264 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   21265   MVT VT = Op.getNode()->getSimpleValueType(0);
   21266 
   21267   // Let legalize expand this if it isn't a legal type yet.
   21268   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   21269     return SDValue();
   21270 
   21271   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   21272 
   21273   unsigned Opc;
   21274   bool ExtraOp = false;
   21275   switch (Op.getOpcode()) {
   21276   default: llvm_unreachable("Invalid code");
   21277   case ISD::ADDC: Opc = X86ISD::ADD; break;
   21278   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
   21279   case ISD::SUBC: Opc = X86ISD::SUB; break;
   21280   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
   21281   }
   21282 
   21283   if (!ExtraOp)
   21284     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
   21285                        Op.getOperand(1));
   21286   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
   21287                      Op.getOperand(1), Op.getOperand(2));
   21288 }
   21289 
   21290 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
   21291                             SelectionDAG &DAG) {
   21292   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
   21293 
   21294   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
   21295   // which returns the values as { float, float } (in XMM0) or
   21296   // { double, double } (which is returned in XMM0, XMM1).
   21297   SDLoc dl(Op);
   21298   SDValue Arg = Op.getOperand(0);
   21299   EVT ArgVT = Arg.getValueType();
   21300   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   21301 
   21302   TargetLowering::ArgListTy Args;
   21303   TargetLowering::ArgListEntry Entry;
   21304 
   21305   Entry.Node = Arg;
   21306   Entry.Ty = ArgTy;
   21307   Entry.isSExt = false;
   21308   Entry.isZExt = false;
   21309   Args.push_back(Entry);
   21310 
   21311   bool isF64 = ArgVT == MVT::f64;
   21312   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
   21313   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   21314   // the results are returned via SRet in memory.
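           // On x86_64 (the only target handled here), the f32 results come back
           // packed in xmm0, so the code below extracts lane 0 (sin) and lane 1
           // (cos) of the returned vector.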
   21315   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
   21316   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   21317   SDValue Callee =
   21318       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
   21319 
   21320   Type *RetTy = isF64
   21321     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
   21322     : (Type*)VectorType::get(ArgTy, 4);
   21323 
   21324   TargetLowering::CallLoweringInfo CLI(DAG);
   21325   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
   21326     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
   21327 
   21328   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
   21329 
   21330   if (isF64)
   21331     // Returned in xmm0 and xmm1.
   21332     return CallResult.first;
   21333 
   21334   // Returned in bits 0:31 and 32:63 of xmm0.
   21335   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   21336                                CallResult.first, DAG.getIntPtrConstant(0, dl));
   21337   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   21338                                CallResult.first, DAG.getIntPtrConstant(1, dl));
   21339   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
   21340   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
   21341 }
   21342 
   21343 /// Widen a vector input to a vector of NVT.  The
   21344 /// input vector must have the same element type as NVT.
   21345 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
   21346                             bool FillWithZeroes = false) {
   21347   // Check if InOp already has the right width.
   21348   MVT InVT = InOp.getSimpleValueType();
   21349   if (InVT == NVT)
   21350     return InOp;
   21351 
   21352   if (InOp.isUndef())
   21353     return DAG.getUNDEF(NVT);
   21354 
   21355   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
   21356          "input and widen element type must match");
   21357 
   21358   unsigned InNumElts = InVT.getVectorNumElements();
   21359   unsigned WidenNumElts = NVT.getVectorNumElements();
   21360   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
   21361          "Unexpected request for vector widening");
   21362 
   21363   EVT EltVT = NVT.getVectorElementType();
   21364 
   21365   SDLoc dl(InOp);
   21366   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
   21367       InOp.getNumOperands() == 2) {
   21368     SDValue N1 = InOp.getOperand(1);
   21369     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
   21370         N1.isUndef()) {
   21371       InOp = InOp.getOperand(0);
   21372       InVT = InOp.getSimpleValueType();
   21373       InNumElts = InVT.getVectorNumElements();
   21374     }
   21375   }
   21376   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
   21377       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
   21378     SmallVector<SDValue, 16> Ops;
   21379     for (unsigned i = 0; i < InNumElts; ++i)
   21380       Ops.push_back(InOp.getOperand(i));
   21381 
   21382     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
   21383       DAG.getUNDEF(EltVT);
   21384     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
   21385       Ops.push_back(FillVal);
   21386     return DAG.getBuildVector(NVT, dl, Ops);
   21387   }
   21388   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
   21389     DAG.getUNDEF(NVT);
   21390   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
   21391                      InOp, DAG.getIntPtrConstant(0, dl));
   21392 }
   21393 
   21394 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
   21395                              SelectionDAG &DAG) {
   21396   assert(Subtarget.hasAVX512() &&
   21397          "MGATHER/MSCATTER are supported on AVX-512 arch only");
   21398 
   21399   // X86 scatter kills the mask register, so its type should be added to
   21400   // the list of return values.
   21401   // If the "scatter" has 2 return values, it is already handled.
   21402   if (Op.getNode()->getNumValues() == 2)
   21403     return Op;
   21404 
   21405   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
   21406   SDValue Src = N->getValue();
   21407   MVT VT = Src.getSimpleValueType();
   21408   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
   21409   SDLoc dl(Op);
   21410 
   21411   SDValue NewScatter;
   21412   SDValue Index = N->getIndex();
   21413   SDValue Mask = N->getMask();
   21414   SDValue Chain = N->getChain();
   21415   SDValue BasePtr = N->getBasePtr();
   21416   MVT MemVT = N->getMemoryVT().getSimpleVT();
   21417   MVT IndexVT = Index.getSimpleValueType();
   21418   MVT MaskVT = Mask.getSimpleValueType();
   21419 
   21420   if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
   21421     // The v2i32 value was promoted to v2i64.
   21422     // Now we "redo" the type legalizer's work and widen the original
   21423     // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
   21424     // with a shuffle.
   21425     assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
   21426            "Unexpected memory type");
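             // The shuffle mask {0, 2, -1, -1} picks v4i32 lanes 0 and 2, i.e. the
             // low 32 bits of each promoted i64 element, recovering the original
             // two i32 values in the first two lanes.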
   21427     int ShuffleMask[] = {0, 2, -1, -1};
   21428     Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
   21429                                DAG.getUNDEF(MVT::v4i32), ShuffleMask);
   21430     // Now we have 4 elements instead of 2.
   21431     // Expand the index.
   21432     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
   21433     Index = ExtendToType(Index, NewIndexVT, DAG);
   21434 
   21435     // Expand the mask with zeroes
   21436     // Mask may be <2 x i64> or <2 x i1> at this moment
   21437     assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
   21438            "Unexpected mask type");
   21439     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
   21440     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
   21441     VT = MVT::v4i32;
   21442   }
   21443 
   21444   unsigned NumElts = VT.getVectorNumElements();
   21445   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
   21446       !Index.getSimpleValueType().is512BitVector()) {
   21447     // AVX512F supports only 512-bit vectors. Either the data or the index
   21448     // should be 512 bits wide. If both the index and data are 256-bit but
   21449     // the vector contains 8 elements, we just sign-extend the index.
   21450     if (IndexVT == MVT::v8i32)
   21451       // Just extend index
   21452       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
   21453     else {
   21454       // The minimum number of elements in a scatter is 8.
   21455       NumElts = 8;
   21456       // Index
   21457       MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
   21458       // Use original index here, do not modify the index twice
   21459       Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
   21460       if (IndexVT.getScalarType() == MVT::i32)
   21461         Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
   21462 
   21463       // Mask
   21464       // At this point we have promoted mask operand
   21465       assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
   21466       MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
   21467       // Use the original mask here, do not modify the mask twice
   21468       Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
   21469 
   21470       // The value that should be stored
   21471       MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
   21472       Src = ExtendToType(Src, NewVT, DAG);
   21473     }
   21474   }
   21475   // If the mask is "wide" at this point, truncate it to an i1 vector.
   21476   MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
   21477   Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
   21478 
   21479   // The mask is killed by scatter, add it to the values
   21480   SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
   21481   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
   21482   NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
   21483                                     N->getMemOperand());
   21484   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
   21485   return SDValue(NewScatter.getNode(), 1);
   21486 }
   21487 
   21488 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
   21489                           SelectionDAG &DAG) {
   21490 
   21491   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
   21492   MVT VT = Op.getSimpleValueType();
   21493   MVT ScalarVT = VT.getScalarType();
   21494   SDValue Mask = N->getMask();
   21495   SDLoc dl(Op);
   21496 
   21497   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
   21498          "Cannot lower masked load op.");
   21499 
   21500   assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
   21501           (Subtarget.hasBWI() &&
   21502               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
   21503          "Unsupported masked load op.");
   21504 
   21505   // This operation is legal for targets with VLX, but without
   21506   // VLX the vector should be widened to 512 bits.
   21507   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
   21508   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
   21509   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   21510   SDValue Src0 = N->getSrc0();
   21511   Src0 = ExtendToType(Src0, WideDataVT, DAG);
   21512   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
   21513   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
   21514                                       N->getBasePtr(), Mask, Src0,
   21515                                       N->getMemoryVT(), N->getMemOperand(),
   21516                                       N->getExtensionType());
   21517 
   21518   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
   21519                                 NewLoad.getValue(0),
   21520                                 DAG.getIntPtrConstant(0, dl));
   21521   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
   21522   return DAG.getMergeValues(RetOps, dl);
   21523 }
   21524 
   21525 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
   21526                            SelectionDAG &DAG) {
   21527   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
   21528   SDValue DataToStore = N->getValue();
   21529   MVT VT = DataToStore.getSimpleValueType();
   21530   MVT ScalarVT = VT.getScalarType();
   21531   SDValue Mask = N->getMask();
   21532   SDLoc dl(Op);
   21533 
   21534   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
   21535          "Cannot lower masked store op.");
   21536 
   21537   assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
   21538           (Subtarget.hasBWI() &&
   21539               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
   21540           "Unsupported masked store op.");
   21541 
   21542   // This operation is legal for targets with VLX, but without
   21543   // VLX the vector should be widened to 512 bits.
   21544   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
   21545   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
   21546   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   21547   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
   21548   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
   21549   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
   21550                             Mask, N->getMemoryVT(), N->getMemOperand(),
   21551                             N->isTruncatingStore());
   21552 }
   21553 
   21554 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
   21555                             SelectionDAG &DAG) {
   21556   assert(Subtarget.hasAVX512() &&
   21557          "MGATHER/MSCATTER are supported on AVX-512 arch only");
   21558 
   21559   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
   21560   SDLoc dl(Op);
   21561   MVT VT = Op.getSimpleValueType();
   21562   SDValue Index = N->getIndex();
   21563   SDValue Mask = N->getMask();
   21564   SDValue Src0 = N->getValue();
   21565   MVT IndexVT = Index.getSimpleValueType();
   21566   MVT MaskVT = Mask.getSimpleValueType();
   21567 
   21568   unsigned NumElts = VT.getVectorNumElements();
   21569   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
   21570 
   21571   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
   21572       !Index.getSimpleValueType().is512BitVector()) {
   21573     // AVX512F supports only 512-bit vectors. Either the data or the index
   21574     // should be 512 bits wide. If both the index and data are 256-bit but
   21575     // the vector contains 8 elements, we just sign-extend the index.
   21576     if (NumElts == 8) {
   21577       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
   21578       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
   21579                         N->getOperand(3), Index };
   21580       DAG.UpdateNodeOperands(N, Ops);
   21581       return Op;
   21582     }
   21583 
   21584     // The minimum number of elements in a gather is 8.
   21585     NumElts = 8;
   21586     // Index
   21587     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
   21588     Index = ExtendToType(Index, NewIndexVT, DAG);
   21589     if (IndexVT.getScalarType() == MVT::i32)
   21590       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
   21591 
   21592     // Mask
   21593     MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
   21594     // At this point we have promoted mask operand
   21595     assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
   21596     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
   21597     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
   21598     Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
   21599 
   21600     // The pass-thru value
   21601     MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
   21602     Src0 = ExtendToType(Src0, NewVT, DAG);
   21603 
   21604     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
   21605     SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
   21606                                             N->getMemoryVT(), dl, Ops,
   21607                                             N->getMemOperand());
   21608     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
   21609                                   NewGather.getValue(0),
   21610                                   DAG.getIntPtrConstant(0, dl));
   21611     SDValue RetOps[] = {Extract, NewGather.getValue(1)};
   21612     return DAG.getMergeValues(RetOps, dl);
   21613   }
   21614   return Op;
   21615 }
   21616 
   21617 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
   21618                                                     SelectionDAG &DAG) const {
   21619   // TODO: Eventually, the lowering of these nodes should be informed by or
   21620   // deferred to the GC strategy for the function in which they appear. For
   21621   // now, however, they must be lowered to something. Since they are logically
   21622   // no-ops in the case of a null GC strategy (or a GC strategy which does not
   21623   // require special handling for these nodes), lower them as literal NOOPs for
   21624   // the time being.
   21625   SmallVector<SDValue, 2> Ops;
   21626 
   21627   Ops.push_back(Op.getOperand(0));
   21628   if (Op->getGluedNode())
   21629     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
   21630 
   21631   SDLoc OpDL(Op);
   21632   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
   21633   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
   21634 
   21635   return NOOP;
   21636 }
   21637 
   21638 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
   21639                                                   SelectionDAG &DAG) const {
   21640   // TODO: Eventually, the lowering of these nodes should be informed by or
   21641   // deferred to the GC strategy for the function in which they appear. For
   21642   // now, however, they must be lowered to something. Since they are logically
   21643   // no-ops in the case of a null GC strategy (or a GC strategy which does not
   21644   // require special handling for these nodes), lower them as literal NOOPs for
   21645   // the time being.
   21646   SmallVector<SDValue, 2> Ops;
   21647 
   21648   Ops.push_back(Op.getOperand(0));
   21649   if (Op->getGluedNode())
   21650     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
   21651 
   21652   SDLoc OpDL(Op);
   21653   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
   21654   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
   21655 
   21656   return NOOP;
   21657 }
   21658 
   21659 /// Provide custom lowering hooks for some operations.
   21660 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   21661   switch (Op.getOpcode()) {
   21662   default: llvm_unreachable("Should not custom lower this!");
   21663   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   21664   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
   21665     return LowerCMP_SWAP(Op, Subtarget, DAG);
   21666   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
   21667   case ISD::ATOMIC_LOAD_ADD:
   21668   case ISD::ATOMIC_LOAD_SUB:
   21669   case ISD::ATOMIC_LOAD_OR:
   21670   case ISD::ATOMIC_LOAD_XOR:
   21671   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
   21672   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
   21673   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
   21674   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   21675   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
   21676   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
   21677   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   21678   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   21679   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   21680   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
   21681   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
   21682   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
   21683   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
   21684   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   21685   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   21686   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
   21687   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   21688   case ISD::SHL_PARTS:
   21689   case ISD::SRA_PARTS:
   21690   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   21691   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   21692   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   21693   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
   21694   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
   21695   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
   21696   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
   21697   case ISD::SIGN_EXTEND_VECTOR_INREG:
   21698     return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
   21699   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   21700   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   21701   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
   21702   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
   21703   case ISD::FABS:
   21704   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
   21705   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   21706   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   21707   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   21708   case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
   21709   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   21710   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   21711   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   21712   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   21713   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   21714   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   21715   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
   21716   case ISD::INTRINSIC_VOID:
   21717   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   21718   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   21719   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   21720   case ISD::FRAME_TO_ARGS_OFFSET:
   21721                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
   21722   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   21723   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
   21724   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
   21725   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
   21726   case ISD::EH_SJLJ_SETUP_DISPATCH:
   21727     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
   21728   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   21729   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   21730   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   21731   case ISD::CTLZ:
   21732   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
   21733   case ISD::CTTZ:
   21734   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
   21735   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   21736   case ISD::MULHS:
   21737   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
   21738   case ISD::UMUL_LOHI:
   21739   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
   21740   case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
   21741   case ISD::SRA:
   21742   case ISD::SRL:
   21743   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
   21744   case ISD::SADDO:
   21745   case ISD::UADDO:
   21746   case ISD::SSUBO:
   21747   case ISD::USUBO:
   21748   case ISD::SMULO:
   21749   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   21750   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
   21751   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
   21752   case ISD::ADDC:
   21753   case ISD::ADDE:
   21754   case ISD::SUBC:
   21755   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   21756   case ISD::ADD:                return LowerADD(Op, DAG);
   21757   case ISD::SUB:                return LowerSUB(Op, DAG);
   21758   case ISD::SMAX:
   21759   case ISD::SMIN:
   21760   case ISD::UMAX:
   21761   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
   21762   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   21763   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
   21764   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
   21765   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
   21766   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
   21767   case ISD::GC_TRANSITION_START:
   21768                                 return LowerGC_TRANSITION_START(Op, DAG);
   21769   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
   21770   case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
   21771   }
   21772 }
   21773 
   21774 /// Places new result values for the node in Results (their number
   21775 /// and types must exactly match those of the original return values of
   21776 /// the node), or leaves Results empty, which indicates that the node is not
   21777 /// to be custom lowered after all.
   21778 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
   21779                                               SmallVectorImpl<SDValue> &Results,
   21780                                               SelectionDAG &DAG) const {
   21781   SDValue Res = LowerOperation(SDValue(N, 0), DAG);
   21782 
   21783   if (!Res.getNode())
   21784     return;
   21785 
   21786   assert((N->getNumValues() <= Res->getNumValues()) &&
   21787       "Lowering returned the wrong number of results!");
   21788 
   21789   // Place new result values based on the result number of N.
   21790   // In some cases (LowerSINT_TO_FP, for example) Res has more result values
   21791   // than the original node; the trailing chain result should be dropped.
   21792   for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
   21793     Results.push_back(Res.getValue(I));
   21794 }
   21795 
   21796 /// Replace a node with an illegal result type with a new node built out of
   21797 /// custom code.
   21798 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   21799                                            SmallVectorImpl<SDValue>&Results,
   21800                                            SelectionDAG &DAG) const {
   21801   SDLoc dl(N);
   21802   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   21803   switch (N->getOpcode()) {
   21804   default:
   21805     llvm_unreachable("Do not know how to custom type legalize this operation!");
   21806   case X86ISD::AVG: {
   21807     // Legalize types for X86ISD::AVG by expanding vectors.
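             // The inputs are narrower than a full vector register, so pad
             // each operand with undef up to the natural register width, do
             // the AVG on the wide type, and extract the original-width
             // subvector from the result.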
   21808     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   21809 
   21810     auto InVT = N->getValueType(0);
   21811     auto InVTSize = InVT.getSizeInBits();
   21812     const unsigned RegSize =
   21813         (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
   21814     assert((!Subtarget.hasAVX512() || RegSize < 512) &&
   21815            "512-bit vector requires AVX512");
   21816     assert((!Subtarget.hasAVX2() || RegSize < 256) &&
   21817            "256-bit vector requires AVX2");
   21818 
   21819     auto ElemVT = InVT.getVectorElementType();
   21820     auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
   21821                                   RegSize / ElemVT.getSizeInBits());
   21822     assert(RegSize % InVT.getSizeInBits() == 0);
   21823     unsigned NumConcat = RegSize / InVT.getSizeInBits();
   21824 
   21825     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
   21826     Ops[0] = N->getOperand(0);
   21827     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
   21828     Ops[0] = N->getOperand(1);
   21829     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
   21830 
   21831     SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
   21832     Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
   21833                                   DAG.getIntPtrConstant(0, dl)));
   21834     return;
   21835   }
   21836   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   21837   case X86ISD::FMINC:
   21838   case X86ISD::FMIN:
   21839   case X86ISD::FMAXC:
   21840   case X86ISD::FMAX: {
   21841     EVT VT = N->getValueType(0);
   21842     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
   21843     SDValue UNDEF = DAG.getUNDEF(VT);
   21844     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   21845                               N->getOperand(0), UNDEF);
   21846     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   21847                               N->getOperand(1), UNDEF);
   21848     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
   21849     return;
   21850   }
   21851   case ISD::SIGN_EXTEND_INREG:
   21852   case ISD::ADDC:
   21853   case ISD::ADDE:
   21854   case ISD::SUBC:
   21855   case ISD::SUBE:
   21856     // We don't want to expand or promote these.
   21857     return;
   21858   case ISD::SDIV:
   21859   case ISD::UDIV:
   21860   case ISD::SREM:
   21861   case ISD::UREM:
   21862   case ISD::SDIVREM:
   21863   case ISD::UDIVREM: {
   21864     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
   21865     Results.push_back(V);
   21866     return;
   21867   }
   21868   case ISD::FP_TO_SINT:
   21869   case ISD::FP_TO_UINT: {
   21870     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
   21871 
   21872     std::pair<SDValue,SDValue> Vals =
   21873         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
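             // When FP_TO_INTHelper returns a node, it is either the converted
             // value itself (no stack slot) or a store to a temporary stack
             // slot, in which case the result is reloaded from that slot below.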
   21874     SDValue FIST = Vals.first, StackSlot = Vals.second;
   21875     if (FIST.getNode()) {
   21876       EVT VT = N->getValueType(0);
   21877       // Return a load from the stack slot.
   21878       if (StackSlot.getNode())
   21879         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
   21880                                       MachinePointerInfo(),
   21881                                       false, false, false, 0));
   21882       else
   21883         Results.push_back(FIST);
   21884     }
   21885     return;
   21886   }
   21887   case ISD::UINT_TO_FP: {
   21888     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   21889     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
   21890         N->getValueType(0) != MVT::v2f32)
   21891       return;
   21892     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
   21893                                  N->getOperand(0));
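             // Standard unsigned->FP trick: 0x4330000000000000 is the bit
             // pattern of the double 2^52. OR'ing the zero-extended 32-bit
             // lanes into its mantissa gives exactly 2^52 + x, so subtracting
             // 2^52 recovers x as a double; VFPROUND then narrows to f32.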
   21894     SDValue VBias =
   21895         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
   21896     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
   21897                              DAG.getBitcast(MVT::v2i64, VBias));
   21898     Or = DAG.getBitcast(MVT::v2f64, Or);
   21899     // TODO: Are there any fast-math-flags to propagate here?
   21900     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
   21901     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
   21902     return;
   21903   }
   21904   case ISD::FP_ROUND: {
   21905     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
   21906       return;
   21907     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
   21908     Results.push_back(V);
   21909     return;
   21910   }
   21911   case ISD::FP_EXTEND: {
   21912     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
   21913     // No other ValueType for FP_EXTEND should reach this point.
   21914     assert(N->getValueType(0) == MVT::v2f32 &&
   21915            "Do not know how to legalize this Node");
   21916     return;
   21917   }
   21918   case ISD::INTRINSIC_W_CHAIN: {
   21919     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   21920     switch (IntNo) {
   21921     default : llvm_unreachable("Do not know how to custom type "
   21922                                "legalize this intrinsic operation!");
   21923     case Intrinsic::x86_rdtsc:
   21924       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   21925                                      Results);
   21926     case Intrinsic::x86_rdtscp:
   21927       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
   21928                                      Results);
   21929     case Intrinsic::x86_rdpmc:
   21930       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
   21931     }
   21932   }
   21933   case ISD::INTRINSIC_WO_CHAIN: {
   21934     if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
   21935       Results.push_back(V);
   21936     return;
   21937   }
   21938   case ISD::READCYCLECOUNTER: {
   21939     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   21940                                    Results);
   21941   }
   21942   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
   21943     EVT T = N->getValueType(0);
   21944     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
   21945     bool Regs64bit = T == MVT::i128;
   21946     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
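             // CMPXCHG8B/CMPXCHG16B use a fixed register protocol: the
             // expected value goes in EDX:EAX (RDX:RAX), the replacement in
             // ECX:EBX (RCX:RBX), the loaded value comes back in EDX:EAX
             // (RDX:RAX), and ZF reports whether the exchange happened.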
   21947     SDValue cpInL, cpInH;
   21948     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   21949                         DAG.getConstant(0, dl, HalfT));
   21950     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   21951                         DAG.getConstant(1, dl, HalfT));
   21952     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
   21953                              Regs64bit ? X86::RAX : X86::EAX,
   21954                              cpInL, SDValue());
   21955     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
   21956                              Regs64bit ? X86::RDX : X86::EDX,
   21957                              cpInH, cpInL.getValue(1));
   21958     SDValue swapInL, swapInH;
   21959     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   21960                           DAG.getConstant(0, dl, HalfT));
   21961     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   21962                           DAG.getConstant(1, dl, HalfT));
   21963     swapInH =
   21964         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
   21965                          swapInH, cpInH.getValue(1));
   21966     // If the current function needs the base pointer, RBX,
   21967     // we shouldn't use cmpxchg directly: the lowering of that
   21968     // instruction will clobber that register, and since RBX will
   21969     // then be a reserved register, the register allocator will not
   21970     // ensure that its value is properly saved and restored around
   21971     // this live range.
   21972     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
   21973     SDValue Result;
   21974     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   21975     unsigned BasePtr = TRI->getBaseRegister();
   21976     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
   21977     if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
   21978         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
   21979       // ISel prefers the LCMPXCHG64 variant.
   21980       // If the assert below fires, that is no longer the case, and we
   21981       // need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just
   21982       // EBX. This is a matter of accepting i64 input for that pseudo and
   21983       // restoring into a register of the right width in the expand
   21984       // pseudo. Everything else should just work.
   21985       assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
   21986              "Saving only half of the RBX");
   21987       unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
   21988                                   : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
   21989       SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
   21990                                            Regs64bit ? X86::RBX : X86::EBX,
   21991                                            HalfT, swapInH.getValue(1));
   21992       SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
   21993                        RBXSave,
   21994                        /*Glue*/ RBXSave.getValue(2)};
   21995       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
   21996     } else {
   21997       unsigned Opcode =
   21998           Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
   21999       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
   22000                                  Regs64bit ? X86::RBX : X86::EBX, swapInL,
   22001                                  swapInH.getValue(1));
   22002       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
   22003                        swapInL.getValue(1)};
   22004       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
   22005     }
   22006     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
   22007                                         Regs64bit ? X86::RAX : X86::EAX,
   22008                                         HalfT, Result.getValue(1));
   22009     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
   22010                                         Regs64bit ? X86::RDX : X86::EDX,
   22011                                         HalfT, cpOutL.getValue(2));
   22012     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
   22013 
   22014     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
   22015                                         MVT::i32, cpOutH.getValue(2));
   22016     SDValue Success =
   22017         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   22018                     DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
   22019     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
   22020 
   22021     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
   22022     Results.push_back(Success);
   22023     Results.push_back(EFLAGS.getValue(1));
   22024     return;
   22025   }
   22026   case ISD::ATOMIC_SWAP:
   22027   case ISD::ATOMIC_LOAD_ADD:
   22028   case ISD::ATOMIC_LOAD_SUB:
   22029   case ISD::ATOMIC_LOAD_AND:
   22030   case ISD::ATOMIC_LOAD_OR:
   22031   case ISD::ATOMIC_LOAD_XOR:
   22032   case ISD::ATOMIC_LOAD_NAND:
   22033   case ISD::ATOMIC_LOAD_MIN:
   22034   case ISD::ATOMIC_LOAD_MAX:
   22035   case ISD::ATOMIC_LOAD_UMIN:
   22036   case ISD::ATOMIC_LOAD_UMAX:
   22037   case ISD::ATOMIC_LOAD: {
   22038     // Delegate to generic TypeLegalization. Situations we can really handle
   22039     // should have already been dealt with by AtomicExpandPass.cpp.
   22040     break;
   22041   }
   22042   case ISD::BITCAST: {
   22043     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   22044     EVT DstVT = N->getValueType(0);
   22045     EVT SrcVT = N->getOperand(0)->getValueType(0);
   22046 
   22047     if (SrcVT != MVT::f64 ||
   22048         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
   22049       return;
   22050 
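             // Lower the f64 source into the low element of a v2f64, bitcast
             // that to a vector with twice the requested number of elements,
             // and then either return the wide vector as-is (widening
             // legalization) or extract the low elements into a BUILD_VECTOR
             // of the requested type.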
   22051     unsigned NumElts = DstVT.getVectorNumElements();
   22052     EVT SVT = DstVT.getVectorElementType();
   22053     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   22054     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   22055                                    MVT::v2f64, N->getOperand(0));
   22056     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
   22057 
   22058     if (ExperimentalVectorWideningLegalization) {
   22059       // If we are legalizing vectors by widening, we already have the desired
   22060       // legal vector type, just return it.
   22061       Results.push_back(ToVecInt);
   22062       return;
   22063     }
   22064 
   22065     SmallVector<SDValue, 8> Elts;
   22066     for (unsigned i = 0, e = NumElts; i != e; ++i)
   22067       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
   22068                                    ToVecInt, DAG.getIntPtrConstant(i, dl)));
   22069 
   22070     Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
   22071   }
   22072   }
   22073 }
   22074 
   22075 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   22076   switch ((X86ISD::NodeType)Opcode) {
   22077   case X86ISD::FIRST_NUMBER:       break;
   22078   case X86ISD::BSF:                return "X86ISD::BSF";
   22079   case X86ISD::BSR:                return "X86ISD::BSR";
   22080   case X86ISD::SHLD:               return "X86ISD::SHLD";
   22081   case X86ISD::SHRD:               return "X86ISD::SHRD";
   22082   case X86ISD::FAND:               return "X86ISD::FAND";
   22083   case X86ISD::FANDN:              return "X86ISD::FANDN";
   22084   case X86ISD::FOR:                return "X86ISD::FOR";
   22085   case X86ISD::FXOR:               return "X86ISD::FXOR";
   22086   case X86ISD::FILD:               return "X86ISD::FILD";
   22087   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
   22088   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
   22089   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
   22090   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
   22091   case X86ISD::FLD:                return "X86ISD::FLD";
   22092   case X86ISD::FST:                return "X86ISD::FST";
   22093   case X86ISD::CALL:               return "X86ISD::CALL";
   22094   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
   22095   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
   22096   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
   22097   case X86ISD::BT:                 return "X86ISD::BT";
   22098   case X86ISD::CMP:                return "X86ISD::CMP";
   22099   case X86ISD::COMI:               return "X86ISD::COMI";
   22100   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   22101   case X86ISD::CMPM:               return "X86ISD::CMPM";
   22102   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   22103   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
   22104   case X86ISD::SETCC:              return "X86ISD::SETCC";
   22105   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   22106   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
   22107   case X86ISD::CMOV:               return "X86ISD::CMOV";
   22108   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   22109   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
   22110   case X86ISD::IRET:               return "X86ISD::IRET";
   22111   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
   22112   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
   22113   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   22114   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
   22115   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
   22116   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
   22117   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
   22118   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
   22119   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
   22120   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
   22121   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   22122   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   22123   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
   22124   case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
   22125   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   22126   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   22127   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
   22128   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
   22129   case X86ISD::ADDUS:              return "X86ISD::ADDUS";
   22130   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   22131   case X86ISD::HADD:               return "X86ISD::HADD";
   22132   case X86ISD::HSUB:               return "X86ISD::HSUB";
   22133   case X86ISD::FHADD:              return "X86ISD::FHADD";
   22134   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
   22135   case X86ISD::ABS:                return "X86ISD::ABS";
   22136   case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
   22137   case X86ISD::FMAX:               return "X86ISD::FMAX";
   22138   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
   22139   case X86ISD::FMIN:               return "X86ISD::FMIN";
   22140   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
   22141   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
   22142   case X86ISD::FMINC:              return "X86ISD::FMINC";
   22143   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
   22144   case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
   22145   case X86ISD::FRCP:               return "X86ISD::FRCP";
   22146   case X86ISD::FRCPS:              return "X86ISD::FRCPS";
   22147   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
   22148   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
   22149   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   22150   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
   22151   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
   22152   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
   22153   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
   22154   case X86ISD::EH_SJLJ_SETUP_DISPATCH:
   22155     return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
   22156   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   22157   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   22158   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
   22159   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
   22160   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
   22161   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
   22162   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
   22163   case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
   22164     return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
   22165   case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
   22166     return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
   22167   case X86ISD::LADD:               return "X86ISD::LADD";
   22168   case X86ISD::LSUB:               return "X86ISD::LSUB";
   22169   case X86ISD::LOR:                return "X86ISD::LOR";
   22170   case X86ISD::LXOR:               return "X86ISD::LXOR";
   22171   case X86ISD::LAND:               return "X86ISD::LAND";
   22172   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   22173   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   22174   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
   22175   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
   22176   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
   22177   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
   22178   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
   22179   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   22180   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   22181   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   22182   case X86ISD::CVTDQ2PD:           return "X86ISD::CVTDQ2PD";
   22183   case X86ISD::CVTUDQ2PD:          return "X86ISD::CVTUDQ2PD";
   22184   case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
   22185   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   22186   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   22187   case X86ISD::VSHL:               return "X86ISD::VSHL";
   22188   case X86ISD::VSRL:               return "X86ISD::VSRL";
   22189   case X86ISD::VSRA:               return "X86ISD::VSRA";
   22190   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
   22191   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
   22192   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
   22193   case X86ISD::VSRAV:              return "X86ISD::VSRAV";
   22194   case X86ISD::VROTLI:             return "X86ISD::VROTLI";
   22195   case X86ISD::VROTRI:             return "X86ISD::VROTRI";
   22196   case X86ISD::VPPERM:             return "X86ISD::VPPERM";
   22197   case X86ISD::CMPP:               return "X86ISD::CMPP";
   22198   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   22199   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
   22200   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
   22201   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
   22202   case X86ISD::ADD:                return "X86ISD::ADD";
   22203   case X86ISD::SUB:                return "X86ISD::SUB";
   22204   case X86ISD::ADC:                return "X86ISD::ADC";
   22205   case X86ISD::SBB:                return "X86ISD::SBB";
   22206   case X86ISD::SMUL:               return "X86ISD::SMUL";
   22207   case X86ISD::UMUL:               return "X86ISD::UMUL";
   22208   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
   22209   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
   22210   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
   22211   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
   22212   case X86ISD::INC:                return "X86ISD::INC";
   22213   case X86ISD::DEC:                return "X86ISD::DEC";
   22214   case X86ISD::OR:                 return "X86ISD::OR";
   22215   case X86ISD::XOR:                return "X86ISD::XOR";
   22216   case X86ISD::AND:                return "X86ISD::AND";
   22217   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   22218   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   22219   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
   22220   case X86ISD::PTEST:              return "X86ISD::PTEST";
   22221   case X86ISD::TESTP:              return "X86ISD::TESTP";
   22222   case X86ISD::TESTM:              return "X86ISD::TESTM";
   22223   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
   22224   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
   22225   case X86ISD::KTEST:              return "X86ISD::KTEST";
   22226   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
   22227   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
   22228   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   22229   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
   22230   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   22231   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
   22232   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
   22233   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
   22234   case X86ISD::SHUF128:            return "X86ISD::SHUF128";
   22235   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
   22236   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
   22237   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
   22238   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
   22239   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
   22240   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
   22241   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
   22242   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
   22243   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
   22244   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
   22245   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   22246   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   22247   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   22248   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   22249   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
   22250   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
   22251   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
   22252   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
   22253   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   22254   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
   22255   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   22256   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
   22257   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   22258   case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
   22259   case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
   22260   case X86ISD::VFIXUPIMMS:         return "X86ISD::VFIXUPIMMS";
   22261   case X86ISD::VRANGE:             return "X86ISD::VRANGE";
   22262   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   22263   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
   22264   case X86ISD::PSADBW:             return "X86ISD::PSADBW";
   22265   case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
   22266   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   22267   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   22268   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
   22269   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
   22270   case X86ISD::MFENCE:             return "X86ISD::MFENCE";
   22271   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
   22272   case X86ISD::SAHF:               return "X86ISD::SAHF";
   22273   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
   22274   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
   22275   case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
   22276   case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
   22277   case X86ISD::VPROT:              return "X86ISD::VPROT";
   22278   case X86ISD::VPROTI:             return "X86ISD::VPROTI";
   22279   case X86ISD::VPSHA:              return "X86ISD::VPSHA";
   22280   case X86ISD::VPSHL:              return "X86ISD::VPSHL";
   22281   case X86ISD::VPCOM:              return "X86ISD::VPCOM";
   22282   case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
   22283   case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
   22284   case X86ISD::FMADD:              return "X86ISD::FMADD";
   22285   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
   22286   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
   22287   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
   22288   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
   22289   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
   22290   case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
   22291   case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
   22292   case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
   22293   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
   22294   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
   22295   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
   22296   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
   22297   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
   22298   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
   22299   case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
   22300   case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
   22301   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
   22302   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
   22303   case X86ISD::XTEST:              return "X86ISD::XTEST";
   22304   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
   22305   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
   22306   case X86ISD::SELECT:             return "X86ISD::SELECT";
   22307   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
   22308   case X86ISD::RCP28:              return "X86ISD::RCP28";
   22309   case X86ISD::EXP2:               return "X86ISD::EXP2";
   22310   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
   22311   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
   22312   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
   22313   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
   22314   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
   22315   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
   22316   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
   22317   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
   22318   case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
   22319   case X86ISD::ADDS:               return "X86ISD::ADDS";
   22320   case X86ISD::SUBS:               return "X86ISD::SUBS";
   22321   case X86ISD::AVG:                return "X86ISD::AVG";
   22322   case X86ISD::MULHRS:             return "X86ISD::MULHRS";
   22323   case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
   22324   case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
   22325   case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
   22326   case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
   22327   case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
   22328   case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
   22329   case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
   22330   case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
   22331   case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
   22332   }
   22333   return nullptr;
   22334 }
   22335 
   22336 /// Return true if the addressing mode represented by AM is legal for this
   22337 /// target, for a load/store of the specified type.
   22338 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
   22339                                               const AddrMode &AM, Type *Ty,
   22340                                               unsigned AS) const {
   22341   // X86 supports extremely general addressing modes.
   22342   CodeModel::Model M = getTargetMachine().getCodeModel();
   22343 
   22344   // X86 allows a sign-extended 32-bit immediate field as a displacement.
   22345   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
   22346     return false;
   22347 
   22348   if (AM.BaseGV) {
   22349     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
   22350 
   22351     // If a reference to this global requires an extra load, we can't fold it.
   22352     if (isGlobalStubReference(GVFlags))
   22353       return false;
   22354 
   22355     // If BaseGV requires a register for the PIC base, we cannot also have a
   22356     // BaseReg specified.
   22357     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
   22358       return false;
   22359 
   22360     // If lower 4G is not available, then we must use rip-relative addressing.
   22361     if ((M != CodeModel::Small || isPositionIndependent()) &&
   22362         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
   22363       return false;
   22364   }
   22365 
   22366   switch (AM.Scale) {
   22367   case 0:
   22368   case 1:
   22369   case 2:
   22370   case 4:
   22371   case 8:
   22372     // These scales always work.
   22373     break;
   22374   case 3:
   22375   case 5:
   22376   case 9:
   22377     // These scales are formed with basereg+scalereg.  Only accept if there is
   22378     // no basereg yet.
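             // (A scale of 9, for instance, is encoded as base + index*8, so
             // the base register slot is consumed by the un-scaled index.)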
   22379     if (AM.HasBaseReg)
   22380       return false;
   22381     break;
   22382   default:  // Other stuff never works.
   22383     return false;
   22384   }
   22385 
   22386   return true;
   22387 }
   22388 
   22389 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
   22390   unsigned Bits = Ty->getScalarSizeInBits();
   22391 
   22392   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
   22393   // particularly cheaper than those without.
   22394   if (Bits == 8)
   22395     return false;
   22396 
   22397   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
   22398   // variable shifts just as cheap as scalar ones.
   22399   if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
   22400     return false;
   22401 
   22402   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
   22403   // fully general vector.
   22404   return true;
   22405 }
   22406 
   22407 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   22408   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   22409     return false;
   22410   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
   22411   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
   22412   return NumBits1 > NumBits2;
   22413 }
   22414 
   22415 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
   22416   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   22417     return false;
   22418 
   22419   if (!isTypeLegal(EVT::getEVT(Ty1)))
   22420     return false;
   22421 
   22422   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
   22423 
   22424   // Assuming the caller doesn't have a zeroext or signext return parameter,
   22425   // truncation all the way down to i1 is valid.
   22426   return true;
   22427 }
   22428 
   22429 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   22430   return isInt<32>(Imm);
   22431 }
   22432 
   22433 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
   22434   // Can also use sub to handle negated immediates.
   22435   return isInt<32>(Imm);
   22436 }
   22437 
   22438 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   22439   if (!VT1.isInteger() || !VT2.isInteger())
   22440     return false;
   22441   unsigned NumBits1 = VT1.getSizeInBits();
   22442   unsigned NumBits2 = VT2.getSizeInBits();
   22443   return NumBits1 > NumBits2;
   22444 }
   22445 
   22446 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   22447   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   22448   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
   22449 }
   22450 
   22451 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
   22452   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   22453   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
   22454 }
   22455 
   22456 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   22457   EVT VT1 = Val.getValueType();
   22458   if (isZExtFree(VT1, VT2))
   22459     return true;
   22460 
   22461   if (Val.getOpcode() != ISD::LOAD)
   22462     return false;
   22463 
   22464   if (!VT1.isSimple() || !VT1.isInteger() ||
   22465       !VT2.isSimple() || !VT2.isInteger())
   22466     return false;
   22467 
   22468   switch (VT1.getSimpleVT().SimpleTy) {
   22469   default: break;
   22470   case MVT::i8:
   22471   case MVT::i16:
   22472   case MVT::i32:
   22473     // X86 has 8, 16, and 32-bit zero-extending loads.
   22474     return true;
   22475   }
   22476 
   22477   return false;
   22478 }
   22479 
   22480 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
   22481 
   22482 bool
   22483 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   22484   if (!Subtarget.hasAnyFMA())
   22485     return false;
   22486 
   22487   VT = VT.getScalarType();
   22488 
   22489   if (!VT.isSimple())
   22490     return false;
   22491 
   22492   switch (VT.getSimpleVT().SimpleTy) {
   22493   case MVT::f32:
   22494   case MVT::f64:
   22495     return true;
   22496   default:
   22497     break;
   22498   }
   22499 
   22500   return false;
   22501 }
   22502 
   22503 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
   22504   // i16 instructions are longer (0x66 prefix) and potentially slower.
   22505   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
   22506 }
   22507 
   22508 /// Targets can use this to indicate that they only support *some*
   22509 /// VECTOR_SHUFFLE operations, those with specific masks.
   22510 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
   22511 /// are assumed to be legal.
   22512 bool
   22513 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   22514                                       EVT VT) const {
   22515   if (!VT.isSimple())
   22516     return false;
   22517 
   22518   // Not for i1 vectors
   22519   if (VT.getSimpleVT().getScalarType() == MVT::i1)
   22520     return false;
   22521 
   22522   // Very little shuffling can be done for 64-bit vectors right now.
   22523   if (VT.getSimpleVT().getSizeInBits() == 64)
   22524     return false;
   22525 
   22526   // We only care that the types being shuffled are legal. The lowering can
   22527   // handle any possible shuffle mask that results.
   22528   return isTypeLegal(VT.getSimpleVT());
   22529 }
   22530 
   22531 bool
   22532 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
   22533                                           EVT VT) const {
   22534   // Just delegate to the generic legality, clear masks aren't special.
   22535   return isShuffleMaskLegal(Mask, VT);
   22536 }
   22537 
   22538 //===----------------------------------------------------------------------===//
   22539 //                           X86 Scheduler Hooks
   22540 //===----------------------------------------------------------------------===//
   22541 
   22542 /// Utility function to emit xbegin specifying the start of an RTM region.
   22543 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
   22544                                      const TargetInstrInfo *TII) {
   22545   DebugLoc DL = MI.getDebugLoc();
   22546 
   22547   const BasicBlock *BB = MBB->getBasicBlock();
   22548   MachineFunction::iterator I = ++MBB->getIterator();
   22549 
   22550   // For the v = xbegin(), we generate
   22551   //
   22552   // thisMBB:
   22553   //  xbegin sinkMBB
   22554   //
   22555   // mainMBB:
   22556   //  eax = -1
   22557   //
   22558   // sinkMBB:
   22559   //  v = eax
   22560 
   22561   MachineBasicBlock *thisMBB = MBB;
   22562   MachineFunction *MF = MBB->getParent();
   22563   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   22564   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   22565   MF->insert(I, mainMBB);
   22566   MF->insert(I, sinkMBB);
   22567 
   22568   // Transfer the remainder of BB and its successor edges to sinkMBB.
   22569   sinkMBB->splice(sinkMBB->begin(), MBB,
   22570                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   22571   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   22572 
   22573   // thisMBB:
   22574   //  xbegin sinkMBB
   22575   //  # fallthrough to mainMBB
   22576   //  # on abort, jump to sinkMBB
   22577   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
   22578   thisMBB->addSuccessor(mainMBB);
   22579   thisMBB->addSuccessor(sinkMBB);
   22580 
   22581   // mainMBB:
   22582   //  EAX = -1
   22583   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
   22584   mainMBB->addSuccessor(sinkMBB);
   22585 
   22586   // sinkMBB:
   22587   // EAX is live into the sinkMBB
   22588   sinkMBB->addLiveIn(X86::EAX);
   22589   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
   22590           MI.getOperand(0).getReg())
   22591       .addReg(X86::EAX);
   22592 
   22593   MI.eraseFromParent();
   22594   return sinkMBB;
   22595 }
   22596 
   22597 // FIXME: Once we get size-specific XMM0 registers, i.e. XMM0_V16I8
   22598 // (or XMM0_V32I8 in AVX), all of this code can be replaced by patterns
   22599 // in the .td file.
   22600 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
   22601                                        const TargetInstrInfo *TII) {
   22602   unsigned Opc;
   22603   switch (MI.getOpcode()) {
   22604   default: llvm_unreachable("illegal opcode!");
   22605   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
   22606   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
   22607   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
   22608   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
   22609   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
   22610   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
   22611   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
   22612   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
   22613   }
   22614 
   22615   DebugLoc dl = MI.getDebugLoc();
   22616   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   22617 
   22618   unsigned NumArgs = MI.getNumOperands();
   22619   for (unsigned i = 1; i < NumArgs; ++i) {
   22620     MachineOperand &Op = MI.getOperand(i);
   22621     if (!(Op.isReg() && Op.isImplicit()))
   22622       MIB.addOperand(Op);
   22623   }
   22624   if (MI.hasOneMemOperand())
   22625     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
   22626 
   22627   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
   22628       .addReg(X86::XMM0);
   22629 
   22630   MI.eraseFromParent();
   22631   return BB;
   22632 }
   22633 
   22634 // FIXME: Custom handling because TableGen doesn't support multiple implicit
   22635 // defs in an instruction pattern
   22636 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
   22637                                        const TargetInstrInfo *TII) {
   22638   unsigned Opc;
   22639   switch (MI.getOpcode()) {
   22640   default: llvm_unreachable("illegal opcode!");
   22641   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
   22642   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
   22643   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
   22644   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
   22645   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
   22646   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
   22647   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
   22648   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
   22649   }
   22650 
   22651   DebugLoc dl = MI.getDebugLoc();
   22652   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   22653 
   22654   unsigned NumArgs = MI.getNumOperands(); // the result operand is skipped below
   22655   for (unsigned i = 1; i < NumArgs; ++i) {
   22656     MachineOperand &Op = MI.getOperand(i);
   22657     if (!(Op.isReg() && Op.isImplicit()))
   22658       MIB.addOperand(Op);
   22659   }
   22660   if (MI.hasOneMemOperand())
   22661     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
   22662 
   22663   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
   22664       .addReg(X86::ECX);
   22665 
   22666   MI.eraseFromParent();
   22667   return BB;
   22668 }
   22669 
   22670 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
   22671                                      const X86Subtarget &Subtarget) {
   22672   DebugLoc dl = MI.getDebugLoc();
   22673   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   22674 
   22675   // insert input VAL into EAX
   22676   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
   22677       .addReg(MI.getOperand(0).getReg());
   22678   // insert zero into ECX
   22679   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
   22680 
   22681   // insert zero into EDX
   22682   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
   22683 
   22684   // insert WRPKRU instruction
   22685   BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
   22686 
   22687   MI.eraseFromParent(); // The pseudo is gone now.
   22688   return BB;
   22689 }
   22690 
   22691 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
   22692                                      const X86Subtarget &Subtarget) {
   22693   DebugLoc dl = MI.getDebugLoc();
   22694   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   22695 
   22696   // insert zero to ECX
   22697   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
   22698 
   22699   // insert RDPKRU instruction
   22700   BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
   22701   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
   22702       .addReg(X86::EAX);
   22703 
   22704   MI.eraseFromParent(); // The pseudo is gone now.
   22705   return BB;
   22706 }
   22707 
   22708 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
   22709                                       const X86Subtarget &Subtarget,
   22710                                       unsigned Opc) {
   22711   DebugLoc dl = MI.getDebugLoc();
   22712   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   22713   // Address into RAX/EAX, other two args into ECX, EDX.
   22714   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
   22715   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
   22716   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
   22717   for (int i = 0; i < X86::AddrNumOperands; ++i)
   22718     MIB.addOperand(MI.getOperand(i));
   22719 
   22720   unsigned ValOps = X86::AddrNumOperands;
   22721   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
   22722       .addReg(MI.getOperand(ValOps).getReg());
   22723   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
   22724       .addReg(MI.getOperand(ValOps + 1).getReg());
   22725 
   22726   // The instruction doesn't actually take any operands though.
   22727   BuildMI(*BB, MI, dl, TII->get(Opc));
   22728 
   22729   MI.eraseFromParent(); // The pseudo is gone now.
   22730   return BB;
   22731 }
   22732 
   22733 MachineBasicBlock *
   22734 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
   22735                                                  MachineBasicBlock *MBB) const {
   22736   // Emit va_arg instruction on X86-64.
   22737 
   22738   // Operands to this pseudo-instruction:
   22739   // 0  ) Output        : destination address (reg)
   22740   // 1-5) Input         : va_list address (addr, i64mem)
   22741   // 6  ) ArgSize       : Size (in bytes) of vararg type
   22742   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
   22743   // 8  ) Align         : Alignment of type
   22744   // 9  ) EFLAGS (implicit-def)
   22745 
   22746   assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
   22747   static_assert(X86::AddrNumOperands == 5,
   22748                 "VAARG_64 assumes 5 address operands");
   22749 
   22750   unsigned DestReg = MI.getOperand(0).getReg();
   22751   MachineOperand &Base = MI.getOperand(1);
   22752   MachineOperand &Scale = MI.getOperand(2);
   22753   MachineOperand &Index = MI.getOperand(3);
   22754   MachineOperand &Disp = MI.getOperand(4);
   22755   MachineOperand &Segment = MI.getOperand(5);
   22756   unsigned ArgSize = MI.getOperand(6).getImm();
   22757   unsigned ArgMode = MI.getOperand(7).getImm();
   22758   unsigned Align = MI.getOperand(8).getImm();
   22759 
   22760   // Memory Reference
   22761   assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
   22762   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
   22763   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
   22764 
   22765   // Machine Information
   22766   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   22767   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   22768   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
   22769   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
   22770   DebugLoc DL = MI.getDebugLoc();
   22771 
   22772   // struct va_list {
   22773   //   i32   gp_offset
   22774   //   i32   fp_offset
   22775   //   i64   overflow_area (address)
   22776   //   i64   reg_save_area (address)
   22777   // }
   22778   // sizeof(va_list) = 24
   22779   // alignment(va_list) = 8
   22780 
   22781   unsigned TotalNumIntRegs = 6;
   22782   unsigned TotalNumXMMRegs = 8;
   22783   bool UseGPOffset = (ArgMode == 1);
   22784   bool UseFPOffset = (ArgMode == 2);
   22785   unsigned MaxOffset = TotalNumIntRegs * 8 +
   22786                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
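           // (6 GP registers * 8 bytes = 48, plus 8 XMM registers * 16 bytes
           // = 128 when fp_offset is in use.)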
   22787 
   22788   // Align ArgSize to a multiple of 8.
   22789   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
   22790   bool NeedsAlign = (Align > 8);
   22791 
   22792   MachineBasicBlock *thisMBB = MBB;
   22793   MachineBasicBlock *overflowMBB;
   22794   MachineBasicBlock *offsetMBB;
   22795   MachineBasicBlock *endMBB;
   22796 
   22797   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
   22798   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
   22799   unsigned OffsetReg = 0;
   22800 
   22801   if (!UseGPOffset && !UseFPOffset) {
   22802     // If we only pull from the overflow region, we don't create a branch.
   22803     // We don't need to alter control flow.
   22804     OffsetDestReg = 0; // unused
   22805     OverflowDestReg = DestReg;
   22806 
   22807     offsetMBB = nullptr;
   22808     overflowMBB = thisMBB;
   22809     endMBB = thisMBB;
   22810   } else {
   22811     // First emit code to check if gp_offset (or fp_offset) is below the bound.
   22812     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
   22813     // If not, pull from overflow_area. (branch to overflowMBB)
   22814     //
   22815     //       thisMBB
   22816     //         |     .
   22817     //         |        .
   22818     //     offsetMBB   overflowMBB
   22819     //         |        .
   22820     //         |     .
   22821     //        endMBB
   22822 
   22823     // Registers for the PHI in endMBB
   22824     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
   22825     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
   22826 
   22827     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   22828     MachineFunction *MF = MBB->getParent();
   22829     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   22830     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   22831     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   22832 
   22833     MachineFunction::iterator MBBIter = ++MBB->getIterator();
   22834 
   22835     // Insert the new basic blocks
   22836     MF->insert(MBBIter, offsetMBB);
   22837     MF->insert(MBBIter, overflowMBB);
   22838     MF->insert(MBBIter, endMBB);
   22839 
   22840     // Transfer the remainder of MBB and its successor edges to endMBB.
   22841     endMBB->splice(endMBB->begin(), thisMBB,
   22842                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
   22843     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   22844 
   22845     // Make offsetMBB and overflowMBB successors of thisMBB
   22846     thisMBB->addSuccessor(offsetMBB);
   22847     thisMBB->addSuccessor(overflowMBB);
   22848 
   22849     // endMBB is a successor of both offsetMBB and overflowMBB
   22850     offsetMBB->addSuccessor(endMBB);
   22851     overflowMBB->addSuccessor(endMBB);
   22852 
   22853     // Load the offset value into a register
   22854     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   22855     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
   22856       .addOperand(Base)
   22857       .addOperand(Scale)
   22858       .addOperand(Index)
   22859       .addDisp(Disp, UseFPOffset ? 4 : 0)
   22860       .addOperand(Segment)
   22861       .setMemRefs(MMOBegin, MMOEnd);
   22862 
   22863     // Check if there is enough room left to pull this argument.
   22864     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
   22865       .addReg(OffsetReg)
   22866       .addImm(MaxOffset + 8 - ArgSizeA8);
   22867 
   22868     // Branch to "overflowMBB" if offset >= max
   22869     // Fall through to "offsetMBB" otherwise
   22870     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
   22871       .addMBB(overflowMBB);
   22872   }
   22873 
   22874   // In offsetMBB, emit code to use the reg_save_area.
   22875   if (offsetMBB) {
   22876     assert(OffsetReg != 0);
   22877 
   22878     // Read the reg_save_area address.
   22879     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
   22880     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
   22881       .addOperand(Base)
   22882       .addOperand(Scale)
   22883       .addOperand(Index)
   22884       .addDisp(Disp, 16)
   22885       .addOperand(Segment)
   22886       .setMemRefs(MMOBegin, MMOEnd);
   22887 
   22888     // Zero-extend the offset
   22889     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    22890     BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
    22891       .addImm(0)
    22892       .addReg(OffsetReg)
    22893       .addImm(X86::sub_32bit);
   22894 
   22895     // Add the offset to the reg_save_area to get the final address.
   22896     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
   22897       .addReg(OffsetReg64)
   22898       .addReg(RegSaveReg);
   22899 
   22900     // Compute the offset for the next argument
   22901     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   22902     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
   22903       .addReg(OffsetReg)
   22904       .addImm(UseFPOffset ? 16 : 8);
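              // (Each slot in the register save area is 8 bytes wide for GP
              //  registers and 16 bytes wide for XMM registers, hence the two
              //  different strides.)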
   22905 
   22906     // Store it back into the va_list.
   22907     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
   22908       .addOperand(Base)
   22909       .addOperand(Scale)
   22910       .addOperand(Index)
   22911       .addDisp(Disp, UseFPOffset ? 4 : 0)
   22912       .addOperand(Segment)
   22913       .addReg(NextOffsetReg)
   22914       .setMemRefs(MMOBegin, MMOEnd);
   22915 
   22916     // Jump to endMBB
   22917     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
   22918       .addMBB(endMBB);
   22919   }
   22920 
   22921   //
   22922   // Emit code to use overflow area
   22923   //
   22924 
   22925   // Load the overflow_area address into a register.
   22926   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
   22927   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
   22928     .addOperand(Base)
   22929     .addOperand(Scale)
   22930     .addOperand(Index)
   22931     .addDisp(Disp, 8)
   22932     .addOperand(Segment)
   22933     .setMemRefs(MMOBegin, MMOEnd);
   22934 
   22935   // If we need to align it, do so. Otherwise, just copy the address
   22936   // to OverflowDestReg.
   22937   if (NeedsAlign) {
   22938     // Align the overflow address
   22939     assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
   22940     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
   22941 
   22942     // aligned_addr = (addr + (align-1)) & ~(align-1)
   22943     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
   22944       .addReg(OverflowAddrReg)
   22945       .addImm(Align-1);
   22946 
   22947     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
   22948       .addReg(TmpReg)
   22949       .addImm(~(uint64_t)(Align-1));
   22950   } else {
   22951     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
   22952       .addReg(OverflowAddrReg);
   22953   }
   22954 
   22955   // Compute the next overflow address after this argument.
   22956   // (the overflow address should be kept 8-byte aligned)
   22957   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
   22958   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
   22959     .addReg(OverflowDestReg)
   22960     .addImm(ArgSizeA8);
   22961 
   22962   // Store the new overflow address.
   22963   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
   22964     .addOperand(Base)
   22965     .addOperand(Scale)
   22966     .addOperand(Index)
   22967     .addDisp(Disp, 8)
   22968     .addOperand(Segment)
   22969     .addReg(NextAddrReg)
   22970     .setMemRefs(MMOBegin, MMOEnd);
   22971 
   22972   // If we branched, emit the PHI to the front of endMBB.
   22973   if (offsetMBB) {
   22974     BuildMI(*endMBB, endMBB->begin(), DL,
   22975             TII->get(X86::PHI), DestReg)
   22976       .addReg(OffsetDestReg).addMBB(offsetMBB)
   22977       .addReg(OverflowDestReg).addMBB(overflowMBB);
   22978   }
   22979 
   22980   // Erase the pseudo instruction
   22981   MI.eraseFromParent();
   22982 
   22983   return endMBB;
   22984 }
   22985 
   22986 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   22987     MachineInstr &MI, MachineBasicBlock *MBB) const {
   22988   // Emit code to save XMM registers to the stack. The ABI says that the
   22989   // number of registers to save is given in %al, so it's theoretically
    22990   // possible to do an indirect jump trick to avoid saving all of them;
    22991   // however, this code takes a simpler approach and just executes all
   22992   // of the stores if %al is non-zero. It's less code, and it's probably
   22993   // easier on the hardware branch predictor, and stores aren't all that
   22994   // expensive anyway.
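            //
            // Roughly, outside Win64 the emitted code is (register and offset
            // names below are illustrative only):
            //
            //   testb  %al, %al
            //   je     .LEndMBB
            //   movaps %xmm0, <reg save slot + 0>    # vmovaps with AVX
            //   movaps %xmm1, <reg save slot + 16>
            //   ...                                  # one store per XMM arg
            // .LEndMBB: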
   22995 
   22996   // Create the new basic blocks. One block contains all the XMM stores,
   22997   // and one block is the final destination regardless of whether any
   22998   // stores were performed.
   22999   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   23000   MachineFunction *F = MBB->getParent();
   23001   MachineFunction::iterator MBBIter = ++MBB->getIterator();
   23002   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
   23003   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
   23004   F->insert(MBBIter, XMMSaveMBB);
   23005   F->insert(MBBIter, EndMBB);
   23006 
   23007   // Transfer the remainder of MBB and its successor edges to EndMBB.
   23008   EndMBB->splice(EndMBB->begin(), MBB,
   23009                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   23010   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
   23011 
   23012   // The original block will now fall through to the XMM save block.
   23013   MBB->addSuccessor(XMMSaveMBB);
   23014   // The XMMSaveMBB will fall through to the end block.
   23015   XMMSaveMBB->addSuccessor(EndMBB);
   23016 
   23017   // Now add the instructions.
   23018   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   23019   DebugLoc DL = MI.getDebugLoc();
   23020 
   23021   unsigned CountReg = MI.getOperand(0).getReg();
   23022   int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
   23023   int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
   23024 
   23025   if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
   23026     // If %al is 0, branch around the XMM save block.
   23027     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
   23028     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
   23029     MBB->addSuccessor(EndMBB);
   23030   }
   23031 
   23032   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
   23033   // that was just emitted, but clearly shouldn't be "saved".
   23034   assert((MI.getNumOperands() <= 3 ||
   23035           !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
   23036           MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
   23037          "Expected last argument to be EFLAGS");
   23038   unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
   23039   // In the XMM save block, save all the XMM argument registers.
   23040   for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
   23041     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
   23042     MachineMemOperand *MMO = F->getMachineMemOperand(
   23043         MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
   23044         MachineMemOperand::MOStore,
   23045         /*Size=*/16, /*Align=*/16);
   23046     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
   23047         .addFrameIndex(RegSaveFrameIndex)
   23048         .addImm(/*Scale=*/1)
   23049         .addReg(/*IndexReg=*/0)
   23050         .addImm(/*Disp=*/Offset)
   23051         .addReg(/*Segment=*/0)
   23052         .addReg(MI.getOperand(i).getReg())
   23053         .addMemOperand(MMO);
   23054   }
   23055 
   23056   MI.eraseFromParent(); // The pseudo instruction is gone now.
   23057 
   23058   return EndMBB;
   23059 }
   23060 
   23061 // The EFLAGS operand of SelectItr might be missing a kill marker
   23062 // because there were multiple uses of EFLAGS, and ISel didn't know
   23063 // which to mark. Figure out whether SelectItr should have had a
   23064 // kill marker, and set it if it should. Returns the correct kill
   23065 // marker value.
   23066 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
   23067                                      MachineBasicBlock* BB,
   23068                                      const TargetRegisterInfo* TRI) {
   23069   // Scan forward through BB for a use/def of EFLAGS.
   23070   MachineBasicBlock::iterator miI(std::next(SelectItr));
   23071   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
   23072     const MachineInstr& mi = *miI;
   23073     if (mi.readsRegister(X86::EFLAGS))
   23074       return false;
   23075     if (mi.definesRegister(X86::EFLAGS))
   23076       break; // Should have kill-flag - update below.
   23077   }
   23078 
   23079   // If we hit the end of the block, check whether EFLAGS is live into a
   23080   // successor.
   23081   if (miI == BB->end()) {
   23082     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
   23083                                           sEnd = BB->succ_end();
   23084          sItr != sEnd; ++sItr) {
   23085       MachineBasicBlock* succ = *sItr;
   23086       if (succ->isLiveIn(X86::EFLAGS))
   23087         return false;
   23088     }
   23089   }
   23090 
   23091   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
   23092   // out. SelectMI should have a kill flag on EFLAGS.
   23093   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
   23094   return true;
   23095 }
   23096 
   23097 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
    23098 // together with other CMOV pseudo-opcodes into a single basic block with a
   23099 // conditional jump around it.
   23100 static bool isCMOVPseudo(MachineInstr &MI) {
   23101   switch (MI.getOpcode()) {
   23102   case X86::CMOV_FR32:
   23103   case X86::CMOV_FR64:
   23104   case X86::CMOV_GR8:
   23105   case X86::CMOV_GR16:
   23106   case X86::CMOV_GR32:
   23107   case X86::CMOV_RFP32:
   23108   case X86::CMOV_RFP64:
   23109   case X86::CMOV_RFP80:
   23110   case X86::CMOV_V2F64:
   23111   case X86::CMOV_V2I64:
   23112   case X86::CMOV_V4F32:
   23113   case X86::CMOV_V4F64:
   23114   case X86::CMOV_V4I64:
   23115   case X86::CMOV_V16F32:
   23116   case X86::CMOV_V8F32:
   23117   case X86::CMOV_V8F64:
   23118   case X86::CMOV_V8I64:
   23119   case X86::CMOV_V8I1:
   23120   case X86::CMOV_V16I1:
   23121   case X86::CMOV_V32I1:
   23122   case X86::CMOV_V64I1:
   23123     return true;
   23124 
   23125   default:
   23126     return false;
   23127   }
   23128 }
   23129 
   23130 MachineBasicBlock *
   23131 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
   23132                                      MachineBasicBlock *BB) const {
   23133   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   23134   DebugLoc DL = MI.getDebugLoc();
   23135 
   23136   // To "insert" a SELECT_CC instruction, we actually have to insert the
   23137   // diamond control-flow pattern.  The incoming instruction knows the
   23138   // destination vreg to set, the condition code register to branch on, the
   23139   // true/false values to select between, and a branch opcode to use.
   23140   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   23141   MachineFunction::iterator It = ++BB->getIterator();
   23142 
   23143   //  thisMBB:
   23144   //  ...
   23145   //   TrueVal = ...
   23146   //   cmpTY ccX, r1, r2
   23147   //   bCC copy1MBB
   23148   //   fallthrough --> copy0MBB
   23149   MachineBasicBlock *thisMBB = BB;
   23150   MachineFunction *F = BB->getParent();
   23151 
   23152   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
   23153   // as described above, by inserting a BB, and then making a PHI at the join
   23154   // point to select the true and false operands of the CMOV in the PHI.
   23155   //
   23156   // The code also handles two different cases of multiple CMOV opcodes
   23157   // in a row.
   23158   //
   23159   // Case 1:
    23160   // In this case, there are multiple CMOVs in a row, all of which are based on
   23161   // the same condition setting (or the exact opposite condition setting).
   23162   // In this case we can lower all the CMOVs using a single inserted BB, and
   23163   // then make a number of PHIs at the join point to model the CMOVs. The only
    23164   // trickiness here is that in a case like:
   23165   //
   23166   // t2 = CMOV cond1 t1, f1
   23167   // t3 = CMOV cond1 t2, f2
   23168   //
   23169   // when rewriting this into PHIs, we have to perform some renaming on the
   23170   // temps since you cannot have a PHI operand refer to a PHI result earlier
   23171   // in the same block.  The "simple" but wrong lowering would be:
   23172   //
   23173   // t2 = PHI t1(BB1), f1(BB2)
   23174   // t3 = PHI t2(BB1), f2(BB2)
   23175   //
   23176   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
   23177   // renaming is to note that on the path through BB1, t2 is really just a
   23178   // copy of t1, and do that renaming, properly generating:
   23179   //
   23180   // t2 = PHI t1(BB1), f1(BB2)
   23181   // t3 = PHI t1(BB1), f2(BB2)
   23182   //
    23183   // In Case 2, we lower cascaded CMOVs such as
   23184   //
   23185   //   (CMOV (CMOV F, T, cc1), T, cc2)
   23186   //
    23187   // to two successive branches.  For that, we look for another CMOV as the
   23188   // following instruction.
   23189   //
   23190   // Without this, we would add a PHI between the two jumps, which ends up
   23191   // creating a few copies all around. For instance, for
   23192   //
   23193   //    (sitofp (zext (fcmp une)))
   23194   //
   23195   // we would generate:
   23196   //
   23197   //         ucomiss %xmm1, %xmm0
   23198   //         movss  <1.0f>, %xmm0
   23199   //         movaps  %xmm0, %xmm1
   23200   //         jne     .LBB5_2
   23201   //         xorps   %xmm1, %xmm1
   23202   // .LBB5_2:
   23203   //         jp      .LBB5_4
   23204   //         movaps  %xmm1, %xmm0
   23205   // .LBB5_4:
   23206   //         retq
   23207   //
   23208   // because this custom-inserter would have generated:
   23209   //
   23210   //   A
   23211   //   | \
   23212   //   |  B
   23213   //   | /
   23214   //   C
   23215   //   | \
   23216   //   |  D
   23217   //   | /
   23218   //   E
   23219   //
   23220   // A: X = ...; Y = ...
   23221   // B: empty
   23222   // C: Z = PHI [X, A], [Y, B]
   23223   // D: empty
   23224   // E: PHI [X, C], [Z, D]
   23225   //
   23226   // If we lower both CMOVs in a single step, we can instead generate:
   23227   //
   23228   //   A
   23229   //   | \
   23230   //   |  C
   23231   //   | /|
   23232   //   |/ |
   23233   //   |  |
   23234   //   |  D
   23235   //   | /
   23236   //   E
   23237   //
   23238   // A: X = ...; Y = ...
   23239   // D: empty
   23240   // E: PHI [X, A], [X, C], [Y, D]
   23241   //
   23242   // Which, in our sitofp/fcmp example, gives us something like:
   23243   //
   23244   //         ucomiss %xmm1, %xmm0
   23245   //         movss  <1.0f>, %xmm0
   23246   //         jne     .LBB5_4
   23247   //         jp      .LBB5_4
   23248   //         xorps   %xmm0, %xmm0
   23249   // .LBB5_4:
   23250   //         retq
   23251   //
   23252   MachineInstr *CascadedCMOV = nullptr;
   23253   MachineInstr *LastCMOV = &MI;
   23254   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
   23255   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
   23256   MachineBasicBlock::iterator NextMIIt =
   23257       std::next(MachineBasicBlock::iterator(MI));
   23258 
   23259   // Check for case 1, where there are multiple CMOVs with the same condition
   23260   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
   23261   // number of jumps the most.
   23262 
   23263   if (isCMOVPseudo(MI)) {
   23264     // See if we have a string of CMOVS with the same condition.
   23265     while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
   23266            (NextMIIt->getOperand(3).getImm() == CC ||
   23267             NextMIIt->getOperand(3).getImm() == OppCC)) {
   23268       LastCMOV = &*NextMIIt;
   23269       ++NextMIIt;
   23270     }
   23271   }
   23272 
    23273   // This checks for case 2, but we only do so if we didn't already find
    23274   // case 1, as indicated by LastCMOV == &MI.
   23275   if (LastCMOV == &MI && NextMIIt != BB->end() &&
   23276       NextMIIt->getOpcode() == MI.getOpcode() &&
   23277       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
   23278       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
   23279       NextMIIt->getOperand(1).isKill()) {
   23280     CascadedCMOV = &*NextMIIt;
   23281   }
   23282 
   23283   MachineBasicBlock *jcc1MBB = nullptr;
   23284 
   23285   // If we have a cascaded CMOV, we lower it to two successive branches to
   23286   // the same block.  EFLAGS is used by both, so mark it as live in the second.
   23287   if (CascadedCMOV) {
   23288     jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
   23289     F->insert(It, jcc1MBB);
   23290     jcc1MBB->addLiveIn(X86::EFLAGS);
   23291   }
   23292 
   23293   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
   23294   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
   23295   F->insert(It, copy0MBB);
   23296   F->insert(It, sinkMBB);
   23297 
   23298   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   23299   // live into the sink and copy blocks.
   23300   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   23301 
   23302   MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
   23303   if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
   23304       !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
   23305     copy0MBB->addLiveIn(X86::EFLAGS);
   23306     sinkMBB->addLiveIn(X86::EFLAGS);
   23307   }
   23308 
   23309   // Transfer the remainder of BB and its successor edges to sinkMBB.
   23310   sinkMBB->splice(sinkMBB->begin(), BB,
   23311                   std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
   23312   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
   23313 
   23314   // Add the true and fallthrough blocks as its successors.
   23315   if (CascadedCMOV) {
   23316     // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
   23317     BB->addSuccessor(jcc1MBB);
   23318 
    23319     // In that case, jcc1MBB will itself fall through to copy0MBB, and
   23320     // jump to the sinkMBB.
   23321     jcc1MBB->addSuccessor(copy0MBB);
   23322     jcc1MBB->addSuccessor(sinkMBB);
   23323   } else {
   23324     BB->addSuccessor(copy0MBB);
   23325   }
   23326 
   23327   // The true block target of the first (or only) branch is always sinkMBB.
   23328   BB->addSuccessor(sinkMBB);
   23329 
   23330   // Create the conditional branch instruction.
   23331   unsigned Opc = X86::GetCondBranchFromCond(CC);
   23332   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
   23333 
   23334   if (CascadedCMOV) {
   23335     unsigned Opc2 = X86::GetCondBranchFromCond(
   23336         (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
   23337     BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
   23338   }
   23339 
   23340   //  copy0MBB:
   23341   //   %FalseValue = ...
   23342   //   # fallthrough to sinkMBB
   23343   copy0MBB->addSuccessor(sinkMBB);
   23344 
   23345   //  sinkMBB:
   23346   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
   23347   //  ...
   23348   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
   23349   MachineBasicBlock::iterator MIItEnd =
   23350     std::next(MachineBasicBlock::iterator(LastCMOV));
   23351   MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
   23352   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
   23353   MachineInstrBuilder MIB;
   23354 
   23355   // As we are creating the PHIs, we have to be careful if there is more than
   23356   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
   23357   // PHIs have to reference the individual true/false inputs from earlier PHIs.
   23358   // That also means that PHI construction must work forward from earlier to
    23359   // later, and that the code must maintain a mapping from each earlier PHI's
    23360   // destination register to the registers that went into that PHI.
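            // For instance, after the PHI for "t2" in the Case 1 example above
            // is created, the table maps t2 -> (t1, f1); when the PHI for "t3"
            // then names t2 as an operand, that operand is rewritten to t1,
            // yielding the corrected lowering shown in that comment.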
   23361 
   23362   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
   23363     unsigned DestReg = MIIt->getOperand(0).getReg();
   23364     unsigned Op1Reg = MIIt->getOperand(1).getReg();
   23365     unsigned Op2Reg = MIIt->getOperand(2).getReg();
   23366 
    23367     // If the CMOV we are generating uses the opposite condition from
   23368     // the jump we generated, then we have to swap the operands for the
   23369     // PHI that is going to be generated.
   23370     if (MIIt->getOperand(3).getImm() == OppCC)
   23371         std::swap(Op1Reg, Op2Reg);
   23372 
   23373     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
   23374       Op1Reg = RegRewriteTable[Op1Reg].first;
   23375 
   23376     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
   23377       Op2Reg = RegRewriteTable[Op2Reg].second;
   23378 
   23379     MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
   23380                   TII->get(X86::PHI), DestReg)
   23381           .addReg(Op1Reg).addMBB(copy0MBB)
   23382           .addReg(Op2Reg).addMBB(thisMBB);
   23383 
   23384     // Add this PHI to the rewrite table.
   23385     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
   23386   }
   23387 
   23388   // If we have a cascaded CMOV, the second Jcc provides the same incoming
   23389   // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
   23390   if (CascadedCMOV) {
   23391     MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
   23392     // Copy the PHI result to the register defined by the second CMOV.
   23393     BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
   23394             DL, TII->get(TargetOpcode::COPY),
   23395             CascadedCMOV->getOperand(0).getReg())
   23396         .addReg(MI.getOperand(0).getReg());
   23397     CascadedCMOV->eraseFromParent();
   23398   }
   23399 
   23400   // Now remove the CMOV(s).
   23401   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
   23402     (MIIt++)->eraseFromParent();
   23403 
   23404   return sinkMBB;
   23405 }
   23406 
   23407 MachineBasicBlock *
   23408 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
   23409                                        MachineBasicBlock *BB) const {
   23410   // Combine the following atomic floating-point modification pattern:
   23411   //   a.store(reg OP a.load(acquire), release)
    23412   // Transform it into:
   23413   //   OPss (%gpr), %xmm
   23414   //   movss %xmm, (%gpr)
    23415   // or the sd equivalent for 64-bit operations.
   23416   unsigned MOp, FOp;
   23417   switch (MI.getOpcode()) {
   23418   default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
   23419   case X86::RELEASE_FADD32mr:
   23420     FOp = X86::ADDSSrm;
   23421     MOp = X86::MOVSSmr;
   23422     break;
   23423   case X86::RELEASE_FADD64mr:
   23424     FOp = X86::ADDSDrm;
   23425     MOp = X86::MOVSDmr;
   23426     break;
   23427   }
   23428   const X86InstrInfo *TII = Subtarget.getInstrInfo();
   23429   DebugLoc DL = MI.getDebugLoc();
   23430   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   23431   unsigned ValOpIdx = X86::AddrNumOperands;
   23432   unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
   23433   MachineInstrBuilder MIB =
   23434       BuildMI(*BB, MI, DL, TII->get(FOp),
   23435               MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
   23436           .addReg(VSrc);
   23437   for (int i = 0; i < X86::AddrNumOperands; ++i) {
   23438     MachineOperand &Operand = MI.getOperand(i);
   23439     // Clear any kill flags on register operands as we'll create a second
   23440     // instruction using the same address operands.
   23441     if (Operand.isReg())
   23442       Operand.setIsKill(false);
   23443     MIB.addOperand(Operand);
   23444   }
   23445   MachineInstr *FOpMI = MIB;
   23446   MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
   23447   for (int i = 0; i < X86::AddrNumOperands; ++i)
   23448     MIB.addOperand(MI.getOperand(i));
   23449   MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
   23450   MI.eraseFromParent(); // The pseudo instruction is gone now.
   23451   return BB;
   23452 }
   23453 
   23454 MachineBasicBlock *
   23455 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
   23456                                         MachineBasicBlock *BB) const {
   23457   MachineFunction *MF = BB->getParent();
   23458   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   23459   DebugLoc DL = MI.getDebugLoc();
   23460   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   23461 
   23462   assert(MF->shouldSplitStack());
   23463 
   23464   const bool Is64Bit = Subtarget.is64Bit();
   23465   const bool IsLP64 = Subtarget.isTarget64BitLP64();
   23466 
   23467   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
   23468   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
   23469 
   23470   // BB:
   23471   //  ... [Till the alloca]
   23472   // If stacklet is not large enough, jump to mallocMBB
   23473   //
   23474   // bumpMBB:
   23475   //  Allocate by subtracting from RSP
   23476   //  Jump to continueMBB
   23477   //
   23478   // mallocMBB:
   23479   //  Allocate by call to runtime
   23480   //
   23481   // continueMBB:
   23482   //  ...
   23483   //  [rest of original BB]
   23484   //
   23485 
   23486   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   23487   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   23488   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   23489 
   23490   MachineRegisterInfo &MRI = MF->getRegInfo();
   23491   const TargetRegisterClass *AddrRegClass =
   23492       getRegClassFor(getPointerTy(MF->getDataLayout()));
   23493 
   23494   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   23495            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   23496            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
   23497            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
   23498            sizeVReg = MI.getOperand(1).getReg(),
   23499            physSPReg =
   23500                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
   23501 
   23502   MachineFunction::iterator MBBIter = ++BB->getIterator();
   23503 
   23504   MF->insert(MBBIter, bumpMBB);
   23505   MF->insert(MBBIter, mallocMBB);
   23506   MF->insert(MBBIter, continueMBB);
   23507 
   23508   continueMBB->splice(continueMBB->begin(), BB,
   23509                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
   23510   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
   23511 
   23512   // Add code to the main basic block to check if the stack limit has been hit,
   23513   // and if so, jump to mallocMBB otherwise to bumpMBB.
   23514   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
   23515   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
   23516     .addReg(tmpSPVReg).addReg(sizeVReg);
   23517   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
   23518     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
   23519     .addReg(SPLimitVReg);
   23520   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
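            // Roughly, for LP64, the check just emitted is (virtual registers
            // shown with illustrative names):
            //
            //   movq  %rsp, %tmp
            //   subq  %size, %tmp        # candidate stack pointer
            //   cmpq  %tmp, %fs:0x70     # TLS stack limit vs. candidate
            //   jg    mallocMBB          # limit above candidate -> take the
            //                            # runtime allocation path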
   23521 
   23522   // bumpMBB simply decreases the stack pointer, since we know the current
   23523   // stacklet has enough space.
   23524   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
   23525     .addReg(SPLimitVReg);
   23526   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
   23527     .addReg(SPLimitVReg);
   23528   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
   23529 
    23530   // mallocMBB calls a libgcc routine to allocate more space from the heap.
   23531   const uint32_t *RegMask =
   23532       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
   23533   if (IsLP64) {
   23534     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
   23535       .addReg(sizeVReg);
   23536     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   23537       .addExternalSymbol("__morestack_allocate_stack_space")
   23538       .addRegMask(RegMask)
   23539       .addReg(X86::RDI, RegState::Implicit)
   23540       .addReg(X86::RAX, RegState::ImplicitDefine);
   23541   } else if (Is64Bit) {
   23542     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
   23543       .addReg(sizeVReg);
   23544     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   23545       .addExternalSymbol("__morestack_allocate_stack_space")
   23546       .addRegMask(RegMask)
   23547       .addReg(X86::EDI, RegState::Implicit)
   23548       .addReg(X86::EAX, RegState::ImplicitDefine);
   23549   } else {
   23550     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
   23551       .addImm(12);
   23552     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
   23553     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
   23554       .addExternalSymbol("__morestack_allocate_stack_space")
   23555       .addRegMask(RegMask)
   23556       .addReg(X86::EAX, RegState::ImplicitDefine);
   23557   }
   23558 
   23559   if (!Is64Bit)
   23560     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
   23561       .addImm(16);
   23562 
   23563   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
   23564     .addReg(IsLP64 ? X86::RAX : X86::EAX);
   23565   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
   23566 
   23567   // Set up the CFG correctly.
   23568   BB->addSuccessor(bumpMBB);
   23569   BB->addSuccessor(mallocMBB);
   23570   mallocMBB->addSuccessor(continueMBB);
   23571   bumpMBB->addSuccessor(continueMBB);
   23572 
   23573   // Take care of the PHI nodes.
   23574   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
   23575           MI.getOperand(0).getReg())
   23576       .addReg(mallocPtrVReg)
   23577       .addMBB(mallocMBB)
   23578       .addReg(bumpSPPtrVReg)
   23579       .addMBB(bumpMBB);
   23580 
   23581   // Delete the original pseudo instruction.
   23582   MI.eraseFromParent();
   23583 
   23584   // And we're done.
   23585   return continueMBB;
   23586 }
   23587 
   23588 MachineBasicBlock *
   23589 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
   23590                                        MachineBasicBlock *BB) const {
   23591   MachineFunction *MF = BB->getParent();
   23592   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   23593   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
   23594   DebugLoc DL = MI.getDebugLoc();
   23595 
   23596   assert(!isAsynchronousEHPersonality(
   23597              classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
   23598          "SEH does not use catchret!");
   23599 
   23600   // Only 32-bit EH needs to worry about manually restoring stack pointers.
   23601   if (!Subtarget.is32Bit())
   23602     return BB;
   23603 
   23604   // C++ EH creates a new target block to hold the restore code, and wires up
   23605   // the new block to the return destination with a normal JMP_4.
   23606   MachineBasicBlock *RestoreMBB =
   23607       MF->CreateMachineBasicBlock(BB->getBasicBlock());
   23608   assert(BB->succ_size() == 1);
   23609   MF->insert(std::next(BB->getIterator()), RestoreMBB);
   23610   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
   23611   BB->addSuccessor(RestoreMBB);
   23612   MI.getOperand(0).setMBB(RestoreMBB);
   23613 
   23614   auto RestoreMBBI = RestoreMBB->begin();
   23615   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
   23616   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
   23617   return BB;
   23618 }
   23619 
   23620 MachineBasicBlock *
   23621 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
   23622                                        MachineBasicBlock *BB) const {
   23623   MachineFunction *MF = BB->getParent();
   23624   const Constant *PerFn = MF->getFunction()->getPersonalityFn();
   23625   bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
   23626   // Only 32-bit SEH requires special handling for catchpad.
   23627   if (IsSEH && Subtarget.is32Bit()) {
   23628     const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   23629     DebugLoc DL = MI.getDebugLoc();
   23630     BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
   23631   }
   23632   MI.eraseFromParent();
   23633   return BB;
   23634 }
   23635 
   23636 MachineBasicBlock *
   23637 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
   23638                                       MachineBasicBlock *BB) const {
   23639   // So, here we replace TLSADDR with the sequence:
   23640   // adjust_stackdown -> TLSADDR -> adjust_stackup.
    23641   // We need this because TLSADDR is lowered into calls
    23642   // inside MC; therefore, without the two markers, shrink-wrapping
    23643   // may push the prologue/epilogue past them.
   23644   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   23645   DebugLoc DL = MI.getDebugLoc();
   23646   MachineFunction &MF = *BB->getParent();
   23647 
   23648   // Emit CALLSEQ_START right before the instruction.
   23649   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   23650   MachineInstrBuilder CallseqStart =
   23651     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
   23652   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
   23653 
   23654   // Emit CALLSEQ_END right after the instruction.
   23655   // We don't call erase from parent because we want to keep the
   23656   // original instruction around.
   23657   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
   23658   MachineInstrBuilder CallseqEnd =
   23659     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
   23660   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
   23661 
   23662   return BB;
   23663 }
   23664 
   23665 MachineBasicBlock *
   23666 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
   23667                                       MachineBasicBlock *BB) const {
    23668   // This is pretty easy.  We take the value we loaded from the
    23669   // relocation, stick it in either RDI (x86-64) or EAX (32-bit), and
    23670   // make an indirect call.  The return value will then
   23671   // be in the normal return register.
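            //
            // For x86-64 this produces, roughly (with "_var" standing in for
            // the referenced global):
            //
            //   movq  _var@TLVP(%rip), %rdi
            //   callq *(%rdi)
            //   # the variable's address is returned in %rax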
   23672   MachineFunction *F = BB->getParent();
   23673   const X86InstrInfo *TII = Subtarget.getInstrInfo();
   23674   DebugLoc DL = MI.getDebugLoc();
   23675 
   23676   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
   23677   assert(MI.getOperand(3).isGlobal() && "This should be a global");
   23678 
   23679   // Get a register mask for the lowered call.
   23680   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   23681   // proper register mask.
   23682   const uint32_t *RegMask =
   23683       Subtarget.is64Bit() ?
   23684       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
   23685       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
   23686   if (Subtarget.is64Bit()) {
   23687     MachineInstrBuilder MIB =
   23688         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
   23689             .addReg(X86::RIP)
   23690             .addImm(0)
   23691             .addReg(0)
   23692             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
   23693                               MI.getOperand(3).getTargetFlags())
   23694             .addReg(0);
   23695     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
   23696     addDirectMem(MIB, X86::RDI);
   23697     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
   23698   } else if (!isPositionIndependent()) {
   23699     MachineInstrBuilder MIB =
   23700         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
   23701             .addReg(0)
   23702             .addImm(0)
   23703             .addReg(0)
   23704             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
   23705                               MI.getOperand(3).getTargetFlags())
   23706             .addReg(0);
   23707     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   23708     addDirectMem(MIB, X86::EAX);
   23709     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   23710   } else {
   23711     MachineInstrBuilder MIB =
   23712         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
   23713             .addReg(TII->getGlobalBaseReg(F))
   23714             .addImm(0)
   23715             .addReg(0)
   23716             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
   23717                               MI.getOperand(3).getTargetFlags())
   23718             .addReg(0);
   23719     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   23720     addDirectMem(MIB, X86::EAX);
   23721     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   23722   }
   23723 
   23724   MI.eraseFromParent(); // The pseudo instruction is gone now.
   23725   return BB;
   23726 }
   23727 
   23728 MachineBasicBlock *
   23729 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
   23730                                     MachineBasicBlock *MBB) const {
   23731   DebugLoc DL = MI.getDebugLoc();
   23732   MachineFunction *MF = MBB->getParent();
   23733   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   23734   MachineRegisterInfo &MRI = MF->getRegInfo();
   23735 
   23736   const BasicBlock *BB = MBB->getBasicBlock();
   23737   MachineFunction::iterator I = ++MBB->getIterator();
   23738 
   23739   // Memory Reference
   23740   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
   23741   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
   23742 
   23743   unsigned DstReg;
   23744   unsigned MemOpndSlot = 0;
   23745 
   23746   unsigned CurOp = 0;
   23747 
   23748   DstReg = MI.getOperand(CurOp++).getReg();
   23749   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   23750   assert(RC->hasType(MVT::i32) && "Invalid destination!");
   23751   unsigned mainDstReg = MRI.createVirtualRegister(RC);
   23752   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
   23753 
   23754   MemOpndSlot = CurOp;
   23755 
   23756   MVT PVT = getPointerTy(MF->getDataLayout());
   23757   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   23758          "Invalid Pointer Size!");
   23759 
   23760   // For v = setjmp(buf), we generate
   23761   //
   23762   // thisMBB:
   23763   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
   23764   //  SjLjSetup restoreMBB
   23765   //
   23766   // mainMBB:
   23767   //  v_main = 0
   23768   //
   23769   // sinkMBB:
   23770   //  v = phi(main, restore)
   23771   //
   23772   // restoreMBB:
   23773   //  if base pointer being used, load it from frame
   23774   //  v_restore = 1
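            //
            // The jump buffer layout assumed here (and by emitEHSjLjLongJmp
            // below) is: word 0 = frame pointer, word 1 (LabelOffset) = resume
            // address, word 2 = stack pointer; each slot is pointer-sized.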
   23775 
   23776   MachineBasicBlock *thisMBB = MBB;
   23777   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   23778   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   23779   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
   23780   MF->insert(I, mainMBB);
   23781   MF->insert(I, sinkMBB);
   23782   MF->push_back(restoreMBB);
   23783   restoreMBB->setHasAddressTaken();
   23784 
   23785   MachineInstrBuilder MIB;
   23786 
   23787   // Transfer the remainder of BB and its successor edges to sinkMBB.
   23788   sinkMBB->splice(sinkMBB->begin(), MBB,
   23789                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   23790   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   23791 
   23792   // thisMBB:
   23793   unsigned PtrStoreOpc = 0;
   23794   unsigned LabelReg = 0;
   23795   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   23796   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
   23797                      !isPositionIndependent();
   23798 
   23799   // Prepare IP either in reg or imm.
   23800   if (!UseImmLabel) {
   23801     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
   23802     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
   23803     LabelReg = MRI.createVirtualRegister(PtrRC);
   23804     if (Subtarget.is64Bit()) {
   23805       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
   23806               .addReg(X86::RIP)
   23807               .addImm(0)
   23808               .addReg(0)
   23809               .addMBB(restoreMBB)
   23810               .addReg(0);
   23811     } else {
   23812       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
   23813       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
   23814               .addReg(XII->getGlobalBaseReg(MF))
   23815               .addImm(0)
   23816               .addReg(0)
   23817               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
   23818               .addReg(0);
   23819     }
   23820   } else
   23821     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
   23822   // Store IP
   23823   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
   23824   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   23825     if (i == X86::AddrDisp)
   23826       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
   23827     else
   23828       MIB.addOperand(MI.getOperand(MemOpndSlot + i));
   23829   }
   23830   if (!UseImmLabel)
   23831     MIB.addReg(LabelReg);
   23832   else
   23833     MIB.addMBB(restoreMBB);
   23834   MIB.setMemRefs(MMOBegin, MMOEnd);
   23835   // Setup
   23836   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
   23837           .addMBB(restoreMBB);
   23838 
   23839   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   23840   MIB.addRegMask(RegInfo->getNoPreservedMask());
   23841   thisMBB->addSuccessor(mainMBB);
   23842   thisMBB->addSuccessor(restoreMBB);
   23843 
   23844   // mainMBB:
   23845   //  EAX = 0
   23846   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
   23847   mainMBB->addSuccessor(sinkMBB);
   23848 
   23849   // sinkMBB:
   23850   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   23851           TII->get(X86::PHI), DstReg)
   23852     .addReg(mainDstReg).addMBB(mainMBB)
   23853     .addReg(restoreDstReg).addMBB(restoreMBB);
   23854 
   23855   // restoreMBB:
   23856   if (RegInfo->hasBasePointer(*MF)) {
   23857     const bool Uses64BitFramePtr =
   23858         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
   23859     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
   23860     X86FI->setRestoreBasePointer(MF);
   23861     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
   23862     unsigned BasePtr = RegInfo->getBaseRegister();
   23863     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
   23864     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
   23865                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
   23866       .setMIFlag(MachineInstr::FrameSetup);
   23867   }
   23868   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
   23869   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
   23870   restoreMBB->addSuccessor(sinkMBB);
   23871 
   23872   MI.eraseFromParent();
   23873   return sinkMBB;
   23874 }
   23875 
   23876 MachineBasicBlock *
   23877 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
   23878                                      MachineBasicBlock *MBB) const {
   23879   DebugLoc DL = MI.getDebugLoc();
   23880   MachineFunction *MF = MBB->getParent();
   23881   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   23882   MachineRegisterInfo &MRI = MF->getRegInfo();
   23883 
   23884   // Memory Reference
   23885   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
   23886   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
   23887 
   23888   MVT PVT = getPointerTy(MF->getDataLayout());
   23889   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   23890          "Invalid Pointer Size!");
   23891 
   23892   const TargetRegisterClass *RC =
   23893     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   23894   unsigned Tmp = MRI.createVirtualRegister(RC);
    23895   // Since FP is only updated here but NOT referenced, it's treated as a GPR.
   23896   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   23897   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
   23898   unsigned SP = RegInfo->getStackRegister();
   23899 
   23900   MachineInstrBuilder MIB;
   23901 
   23902   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   23903   const int64_t SPOffset = 2 * PVT.getStoreSize();
   23904 
   23905   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
   23906   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
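            // With 64-bit pointers the code below amounts to, roughly:
            //
            //   movq   0(buf), %rbp     # reload FP
            //   movq   8(buf), %tmp     # reload the resume address
            //   movq  16(buf), %rsp     # reload SP
            //   jmpq  *%tmp
            //
            // where "buf" stands for the incoming address operands of MI.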
   23907 
   23908   // Reload FP
   23909   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
   23910   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
   23911     MIB.addOperand(MI.getOperand(i));
   23912   MIB.setMemRefs(MMOBegin, MMOEnd);
   23913   // Reload IP
   23914   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
   23915   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   23916     if (i == X86::AddrDisp)
   23917       MIB.addDisp(MI.getOperand(i), LabelOffset);
   23918     else
   23919       MIB.addOperand(MI.getOperand(i));
   23920   }
   23921   MIB.setMemRefs(MMOBegin, MMOEnd);
   23922   // Reload SP
   23923   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
   23924   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   23925     if (i == X86::AddrDisp)
   23926       MIB.addDisp(MI.getOperand(i), SPOffset);
   23927     else
   23928       MIB.addOperand(MI.getOperand(i));
   23929   }
   23930   MIB.setMemRefs(MMOBegin, MMOEnd);
   23931   // Jump
   23932   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
   23933 
   23934   MI.eraseFromParent();
   23935   return MBB;
   23936 }
   23937 
   23938 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
   23939                                                MachineBasicBlock *MBB,
   23940                                                MachineBasicBlock *DispatchBB,
   23941                                                int FI) const {
   23942   DebugLoc DL = MI.getDebugLoc();
   23943   MachineFunction *MF = MBB->getParent();
   23944   MachineRegisterInfo *MRI = &MF->getRegInfo();
   23945   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   23946 
   23947   MVT PVT = getPointerTy(MF->getDataLayout());
   23948   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
   23949 
   23950   unsigned Op = 0;
   23951   unsigned VR = 0;
   23952 
   23953   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
   23954                      !isPositionIndependent();
   23955 
   23956   if (UseImmLabel) {
   23957     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
   23958   } else {
   23959     const TargetRegisterClass *TRC =
   23960         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   23961     VR = MRI->createVirtualRegister(TRC);
   23962     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
   23963 
   23964     /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
   23965 
   23966     if (Subtarget.is64Bit())
   23967       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
   23968           .addReg(X86::RIP)
   23969           .addImm(1)
   23970           .addReg(0)
   23971           .addMBB(DispatchBB)
   23972           .addReg(0);
   23973     else
   23974       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
   23975           .addReg(0) /* XII->getGlobalBaseReg(MF) */
   23976           .addImm(1)
   23977           .addReg(0)
   23978           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
   23979           .addReg(0);
   23980   }
   23981 
   23982   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
   23983   addFrameReference(MIB, FI, 36);
   23984   if (UseImmLabel)
   23985     MIB.addMBB(DispatchBB);
   23986   else
   23987     MIB.addReg(VR);
   23988 }
   23989 
   23990 MachineBasicBlock *
   23991 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
   23992                                          MachineBasicBlock *BB) const {
   23993   DebugLoc DL = MI.getDebugLoc();
   23994   MachineFunction *MF = BB->getParent();
   23995   MachineModuleInfo *MMI = &MF->getMMI();
   23996   MachineFrameInfo *MFI = MF->getFrameInfo();
   23997   MachineRegisterInfo *MRI = &MF->getRegInfo();
   23998   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   23999   int FI = MFI->getFunctionContextIndex();
   24000 
   24001   // Get a mapping of the call site numbers to all of the landing pads they're
   24002   // associated with.
   24003   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
   24004   unsigned MaxCSNum = 0;
   24005   for (auto &MBB : *MF) {
   24006     if (!MBB.isEHPad())
   24007       continue;
   24008 
   24009     MCSymbol *Sym = nullptr;
   24010     for (const auto &MI : MBB) {
   24011       if (MI.isDebugValue())
   24012         continue;
   24013 
   24014       assert(MI.isEHLabel() && "expected EH_LABEL");
   24015       Sym = MI.getOperand(0).getMCSymbol();
   24016       break;
   24017     }
   24018 
   24019     if (!MMI->hasCallSiteLandingPad(Sym))
   24020       continue;
   24021 
   24022     for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
   24023       CallSiteNumToLPad[CSI].push_back(&MBB);
   24024       MaxCSNum = std::max(MaxCSNum, CSI);
   24025     }
   24026   }
   24027 
   24028   // Get an ordered list of the machine basic blocks for the jump table.
   24029   std::vector<MachineBasicBlock *> LPadList;
   24030   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
   24031   LPadList.reserve(CallSiteNumToLPad.size());
   24032 
   24033   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
   24034     for (auto &LP : CallSiteNumToLPad[CSI]) {
   24035       LPadList.push_back(LP);
   24036       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
   24037     }
   24038   }
   24039 
   24040   assert(!LPadList.empty() &&
   24041          "No landing pad destinations for the dispatch jump table!");
   24042 
   24043   // Create the MBBs for the dispatch code.
   24044 
   24045   // Shove the dispatch's address into the return slot in the function context.
   24046   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
   24047   DispatchBB->setIsEHPad(true);
   24048 
   24049   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
   24050   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
   24051   DispatchBB->addSuccessor(TrapBB);
   24052 
   24053   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
   24054   DispatchBB->addSuccessor(DispContBB);
   24055 
   24056   // Insert MBBs.
   24057   MF->push_back(DispatchBB);
   24058   MF->push_back(DispContBB);
   24059   MF->push_back(TrapBB);
   24060 
   24061   // Insert code into the entry block that creates and registers the function
   24062   // context.
   24063   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
   24064 
   24065   // Create the jump table and associated information
   24066   MachineJumpTableInfo *JTI =
   24067       MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
   24068   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
   24069 
   24070   const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
   24071   const X86RegisterInfo &RI = XII->getRegisterInfo();
   24072 
   24073   // Add a register mask with no preserved registers.  This results in all
   24074   // registers being marked as clobbered.
   24075   if (RI.hasBasePointer(*MF)) {
   24076     const bool FPIs64Bit =
   24077         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
   24078     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
   24079     MFI->setRestoreBasePointer(MF);
   24080 
   24081     unsigned FP = RI.getFrameRegister(*MF);
   24082     unsigned BP = RI.getBaseRegister();
   24083     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
   24084     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
   24085                  MFI->getRestoreBasePointerOffset())
   24086         .addRegMask(RI.getNoPreservedMask());
   24087   } else {
   24088     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
   24089         .addRegMask(RI.getNoPreservedMask());
   24090   }
   24091 
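           // Load the call-site index from the function context (at offset 4) and
           // branch to the trap block if it is out of range for the jump table.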
   24092   unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
   24093   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
   24094                     4);
   24095   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
   24096       .addReg(IReg)
   24097       .addImm(LPadList.size());
   24098   BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
   24099 
   24100   unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
   24101   BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
   24102       .addReg(IReg)
   24103       .addImm(1);
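           // Jump through the table: the five memory operands below form
           // (base, scale, index, displacement, segment), i.e. an indirect
           // jmp via [JumpTable + JReg * PtrSize].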
   24104   BuildMI(DispContBB, DL,
   24105           TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
   24106       .addReg(0)
   24107       .addImm(Subtarget.is64Bit() ? 8 : 4)
   24108       .addReg(JReg)
   24109       .addJumpTableIndex(MJTI)
   24110       .addReg(0);
   24111 
   24112   // Add the jump table entries as successors to the MBB.
   24113   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
   24114   for (auto &LP : LPadList)
   24115     if (SeenMBBs.insert(LP).second)
   24116       DispContBB->addSuccessor(LP);
   24117 
   24118   // N.B. the order the invoke BBs are processed in doesn't matter here.
   24119   SmallVector<MachineBasicBlock *, 64> MBBLPads;
   24120   const MCPhysReg *SavedRegs =
   24121       Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
   24122   for (MachineBasicBlock *MBB : InvokeBBs) {
   24123     // Remove the landing pad successor from the invoke block and replace it
   24124     // with the new dispatch block.
   24125     // Keep a copy of Successors since it's modified inside the loop.
   24126     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
   24127                                                    MBB->succ_rend());
   24128     // FIXME: Avoid quadratic complexity.
   24129     for (auto MBBS : Successors) {
   24130       if (MBBS->isEHPad()) {
   24131         MBB->removeSuccessor(MBBS);
   24132         MBBLPads.push_back(MBBS);
   24133       }
   24134     }
   24135 
   24136     MBB->addSuccessor(DispatchBB);
   24137 
   24138     // Find the invoke call and mark all of the callee-saved registers as
   24139     // 'implicitly defined' so that they're spilled.  This prevents later passes
   24140     // from moving instructions to before the EH block, where they would never
   24141     // be executed.
   24142     for (auto &II : reverse(*MBB)) {
   24143       if (!II.isCall())
   24144         continue;
   24145 
   24146       DenseMap<unsigned, bool> DefRegs;
   24147       for (auto &MOp : II.operands())
   24148         if (MOp.isReg())
   24149           DefRegs[MOp.getReg()] = true;
   24150 
   24151       MachineInstrBuilder MIB(*MF, &II);
   24152       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
   24153         unsigned Reg = SavedRegs[RI];
   24154         if (!DefRegs[Reg])
   24155           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
   24156       }
   24157 
   24158       break;
   24159     }
   24160   }
   24161 
   24162   // Mark all former landing pads as non-landing pads.  The dispatch is the only
   24163   // landing pad now.
   24164   for (auto &LP : MBBLPads)
   24165     LP->setIsEHPad(false);
   24166 
   24167   // The instruction is gone now.
   24168   MI.eraseFromParent();
   24169   return BB;
   24170 }
   24171 
   24172 // Replace 213-type (isel default) FMA3 instructions with 231-type for
   24173 // accumulator loops. Writing back to the accumulator allows the coalescer
   24174 // to remove extra copies in the loop.
   24175 // FIXME: Do this on AVX512.  We don't support 231 variants yet (PR23937).
   24176 MachineBasicBlock *
   24177 X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
   24178                                  MachineBasicBlock *MBB) const {
   24179   MachineOperand &AddendOp = MI.getOperand(3);
   24180 
   24181   // Bail out early if the addend isn't a register - we can't switch these.
   24182   if (!AddendOp.isReg())
   24183     return MBB;
   24184 
   24185   MachineFunction &MF = *MBB->getParent();
   24186   MachineRegisterInfo &MRI = MF.getRegInfo();
   24187 
   24188   // Check whether the addend is defined by a PHI:
   24189   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
   24190   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
   24191   if (!AddendDef.isPHI())
   24192     return MBB;
   24193 
   24194   // Look for the following pattern:
   24195   // loop:
   24196   //   %addend = phi [%entry, 0], [%loop, %result]
   24197   //   ...
   24198   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
   24199 
   24200   // Replace with:
   24201   //   loop:
   24202   //   %addend = phi [%entry, 0], [%loop, %result]
   24203   //   ...
   24204   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
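           //
           // The numeric suffix encodes operand order: FMA213 a, b, c computes
           // a = b * a + c, while FMA231 a, b, c computes a = b * c + a, so the
           // 231 form reads and writes the accumulator in place and the loop's
           // PHI result can be coalesced with it.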
   24205 
   24206   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
   24207     assert(AddendDef.getOperand(i).isReg());
   24208     MachineOperand PHISrcOp = AddendDef.getOperand(i);
   24209     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
   24210     if (&PHISrcInst == &MI) {
   24211       // Found a matching instruction.
   24212       unsigned NewFMAOpc = 0;
   24213       switch (MI.getOpcode()) {
   24214       case X86::VFMADDPDr213r:
   24215         NewFMAOpc = X86::VFMADDPDr231r;
   24216         break;
   24217       case X86::VFMADDPSr213r:
   24218         NewFMAOpc = X86::VFMADDPSr231r;
   24219         break;
   24220       case X86::VFMADDSDr213r:
   24221         NewFMAOpc = X86::VFMADDSDr231r;
   24222         break;
   24223       case X86::VFMADDSSr213r:
   24224         NewFMAOpc = X86::VFMADDSSr231r;
   24225         break;
   24226       case X86::VFMSUBPDr213r:
   24227         NewFMAOpc = X86::VFMSUBPDr231r;
   24228         break;
   24229       case X86::VFMSUBPSr213r:
   24230         NewFMAOpc = X86::VFMSUBPSr231r;
   24231         break;
   24232       case X86::VFMSUBSDr213r:
   24233         NewFMAOpc = X86::VFMSUBSDr231r;
   24234         break;
   24235       case X86::VFMSUBSSr213r:
   24236         NewFMAOpc = X86::VFMSUBSSr231r;
   24237         break;
   24238       case X86::VFNMADDPDr213r:
   24239         NewFMAOpc = X86::VFNMADDPDr231r;
   24240         break;
   24241       case X86::VFNMADDPSr213r:
   24242         NewFMAOpc = X86::VFNMADDPSr231r;
   24243         break;
   24244       case X86::VFNMADDSDr213r:
   24245         NewFMAOpc = X86::VFNMADDSDr231r;
   24246         break;
   24247       case X86::VFNMADDSSr213r:
   24248         NewFMAOpc = X86::VFNMADDSSr231r;
   24249         break;
   24250       case X86::VFNMSUBPDr213r:
   24251         NewFMAOpc = X86::VFNMSUBPDr231r;
   24252         break;
   24253       case X86::VFNMSUBPSr213r:
   24254         NewFMAOpc = X86::VFNMSUBPSr231r;
   24255         break;
   24256       case X86::VFNMSUBSDr213r:
   24257         NewFMAOpc = X86::VFNMSUBSDr231r;
   24258         break;
   24259       case X86::VFNMSUBSSr213r:
   24260         NewFMAOpc = X86::VFNMSUBSSr231r;
   24261         break;
   24262       case X86::VFMADDSUBPDr213r:
   24263         NewFMAOpc = X86::VFMADDSUBPDr231r;
   24264         break;
   24265       case X86::VFMADDSUBPSr213r:
   24266         NewFMAOpc = X86::VFMADDSUBPSr231r;
   24267         break;
   24268       case X86::VFMSUBADDPDr213r:
   24269         NewFMAOpc = X86::VFMSUBADDPDr231r;
   24270         break;
   24271       case X86::VFMSUBADDPSr213r:
   24272         NewFMAOpc = X86::VFMSUBADDPSr231r;
   24273         break;
   24274 
   24275       case X86::VFMADDPDr213rY:
   24276         NewFMAOpc = X86::VFMADDPDr231rY;
   24277         break;
   24278       case X86::VFMADDPSr213rY:
   24279         NewFMAOpc = X86::VFMADDPSr231rY;
   24280         break;
   24281       case X86::VFMSUBPDr213rY:
   24282         NewFMAOpc = X86::VFMSUBPDr231rY;
   24283         break;
   24284       case X86::VFMSUBPSr213rY:
   24285         NewFMAOpc = X86::VFMSUBPSr231rY;
   24286         break;
   24287       case X86::VFNMADDPDr213rY:
   24288         NewFMAOpc = X86::VFNMADDPDr231rY;
   24289         break;
   24290       case X86::VFNMADDPSr213rY:
   24291         NewFMAOpc = X86::VFNMADDPSr231rY;
   24292         break;
   24293       case X86::VFNMSUBPDr213rY:
   24294         NewFMAOpc = X86::VFNMSUBPDr231rY;
   24295         break;
   24296       case X86::VFNMSUBPSr213rY:
   24297         NewFMAOpc = X86::VFNMSUBPSr231rY;
   24298         break;
   24299       case X86::VFMADDSUBPDr213rY:
   24300         NewFMAOpc = X86::VFMADDSUBPDr231rY;
   24301         break;
   24302       case X86::VFMADDSUBPSr213rY:
   24303         NewFMAOpc = X86::VFMADDSUBPSr231rY;
   24304         break;
   24305       case X86::VFMSUBADDPDr213rY:
   24306         NewFMAOpc = X86::VFMSUBADDPDr231rY;
   24307         break;
   24308       case X86::VFMSUBADDPSr213rY:
   24309         NewFMAOpc = X86::VFMSUBADDPSr231rY;
   24310         break;
   24311       default:
   24312         llvm_unreachable("Unrecognized FMA variant.");
   24313       }
   24314 
   24315       const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   24316       MachineInstrBuilder MIB =
   24317           BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
   24318               .addOperand(MI.getOperand(0))
   24319               .addOperand(MI.getOperand(3))
   24320               .addOperand(MI.getOperand(2))
   24321               .addOperand(MI.getOperand(1));
   24322       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
   24323       MI.eraseFromParent();
   24324     }
   24325   }
   24326 
   24327   return MBB;
   24328 }
   24329 
   24330 MachineBasicBlock *
   24331 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   24332                                                MachineBasicBlock *BB) const {
   24333   switch (MI.getOpcode()) {
   24334   default: llvm_unreachable("Unexpected instr type to insert");
   24335   case X86::TAILJMPd64:
   24336   case X86::TAILJMPr64:
   24337   case X86::TAILJMPm64:
   24338   case X86::TAILJMPd64_REX:
   24339   case X86::TAILJMPr64_REX:
   24340   case X86::TAILJMPm64_REX:
   24341     llvm_unreachable("TAILJMP64 would not be touched here.");
   24342   case X86::TCRETURNdi64:
   24343   case X86::TCRETURNri64:
   24344   case X86::TCRETURNmi64:
   24345     return BB;
   24346   case X86::TLS_addr32:
   24347   case X86::TLS_addr64:
   24348   case X86::TLS_base_addr32:
   24349   case X86::TLS_base_addr64:
   24350     return EmitLoweredTLSAddr(MI, BB);
   24351   case X86::CATCHRET:
   24352     return EmitLoweredCatchRet(MI, BB);
   24353   case X86::CATCHPAD:
   24354     return EmitLoweredCatchPad(MI, BB);
   24355   case X86::SEG_ALLOCA_32:
   24356   case X86::SEG_ALLOCA_64:
   24357     return EmitLoweredSegAlloca(MI, BB);
   24358   case X86::TLSCall_32:
   24359   case X86::TLSCall_64:
   24360     return EmitLoweredTLSCall(MI, BB);
   24361   case X86::CMOV_FR32:
   24362   case X86::CMOV_FR64:
   24363   case X86::CMOV_FR128:
   24364   case X86::CMOV_GR8:
   24365   case X86::CMOV_GR16:
   24366   case X86::CMOV_GR32:
   24367   case X86::CMOV_RFP32:
   24368   case X86::CMOV_RFP64:
   24369   case X86::CMOV_RFP80:
   24370   case X86::CMOV_V2F64:
   24371   case X86::CMOV_V2I64:
   24372   case X86::CMOV_V4F32:
   24373   case X86::CMOV_V4F64:
   24374   case X86::CMOV_V4I64:
   24375   case X86::CMOV_V16F32:
   24376   case X86::CMOV_V8F32:
   24377   case X86::CMOV_V8F64:
   24378   case X86::CMOV_V8I64:
   24379   case X86::CMOV_V8I1:
   24380   case X86::CMOV_V16I1:
   24381   case X86::CMOV_V32I1:
   24382   case X86::CMOV_V64I1:
   24383     return EmitLoweredSelect(MI, BB);
   24384 
   24385   case X86::RDFLAGS32:
   24386   case X86::RDFLAGS64: {
   24387     DebugLoc DL = MI.getDebugLoc();
   24388     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   24389     unsigned PushF =
   24390         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
   24391     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
   24392     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
   24393     // Permit reads of the FLAGS register without it being defined.
   24394     // This intrinsic exists to read external processor state in flags, such as
   24395     // the trap flag, interrupt flag, and direction flag, none of which are
   24396     // modeled by the backend.
   24397     Push->getOperand(2).setIsUndef();
   24398     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
   24399 
   24400     MI.eraseFromParent(); // The pseudo is gone now.
   24401     return BB;
   24402   }
   24403 
   24404   case X86::WRFLAGS32:
   24405   case X86::WRFLAGS64: {
   24406     DebugLoc DL = MI.getDebugLoc();
   24407     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   24408     unsigned Push =
   24409         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
   24410     unsigned PopF =
   24411         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
   24412     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
   24413     BuildMI(*BB, MI, DL, TII->get(PopF));
   24414 
   24415     MI.eraseFromParent(); // The pseudo is gone now.
   24416     return BB;
   24417   }
   24418 
   24419   case X86::RELEASE_FADD32mr:
   24420   case X86::RELEASE_FADD64mr:
   24421     return EmitLoweredAtomicFP(MI, BB);
   24422 
   24423   case X86::FP32_TO_INT16_IN_MEM:
   24424   case X86::FP32_TO_INT32_IN_MEM:
   24425   case X86::FP32_TO_INT64_IN_MEM:
   24426   case X86::FP64_TO_INT16_IN_MEM:
   24427   case X86::FP64_TO_INT32_IN_MEM:
   24428   case X86::FP64_TO_INT64_IN_MEM:
   24429   case X86::FP80_TO_INT16_IN_MEM:
   24430   case X86::FP80_TO_INT32_IN_MEM:
   24431   case X86::FP80_TO_INT64_IN_MEM: {
   24432     MachineFunction *F = BB->getParent();
   24433     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   24434     DebugLoc DL = MI.getDebugLoc();
   24435 
   24436     // Change the floating point control register to use "round towards zero"
   24437     // mode when truncating to an integer value.
   24438     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
   24439     addFrameReference(BuildMI(*BB, MI, DL,
   24440                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
   24441 
   24442     // Load the old value of the control word...
   24443     unsigned OldCW =
   24444       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
   24445     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
   24446                       CWFrameIdx);
   24447 
   24448     // Set the rounding control to round toward zero...
   24449     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
   24450       .addImm(0xC7F);
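             // (0xC7F sets the rounding-control field, bits 11:10, to 11b, i.e.
             // truncate toward zero, and leaves all floating-point exceptions
             // masked.)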
   24451 
   24452     // Reload the modified control word now...
   24453     addFrameReference(BuildMI(*BB, MI, DL,
   24454                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   24455 
   24456     // Restore the memory image of the control word to its original value.
   24457     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
   24458       .addReg(OldCW);
   24459 
   24460     // Get the X86 opcode to use.
   24461     unsigned Opc;
   24462     switch (MI.getOpcode()) {
   24463     default: llvm_unreachable("illegal opcode!");
   24464     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
   24465     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
   24466     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
   24467     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
   24468     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
   24469     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
   24470     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
   24471     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
   24472     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
   24473     }
   24474 
   24475     X86AddressMode AM = getAddressFromInstr(&MI, 0);
   24476     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
   24477         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
   24478 
   24479     // Reload the original control word now.
   24480     addFrameReference(BuildMI(*BB, MI, DL,
   24481                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   24482 
   24483     MI.eraseFromParent(); // The pseudo instruction is gone now.
   24484     return BB;
   24485   }
   24486   // String/text processing lowering.
   24487   case X86::PCMPISTRM128REG:
   24488   case X86::VPCMPISTRM128REG:
   24489   case X86::PCMPISTRM128MEM:
   24490   case X86::VPCMPISTRM128MEM:
   24491   case X86::PCMPESTRM128REG:
   24492   case X86::VPCMPESTRM128REG:
   24493   case X86::PCMPESTRM128MEM:
   24494   case X86::VPCMPESTRM128MEM:
   24495     assert(Subtarget.hasSSE42() &&
   24496            "Target must have SSE4.2 or AVX features enabled");
   24497     return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
   24498 
   24499   // String/text processing lowering.
   24500   case X86::PCMPISTRIREG:
   24501   case X86::VPCMPISTRIREG:
   24502   case X86::PCMPISTRIMEM:
   24503   case X86::VPCMPISTRIMEM:
   24504   case X86::PCMPESTRIREG:
   24505   case X86::VPCMPESTRIREG:
   24506   case X86::PCMPESTRIMEM:
   24507   case X86::VPCMPESTRIMEM:
   24508     assert(Subtarget.hasSSE42() &&
   24509            "Target must have SSE4.2 or AVX features enabled");
   24510     return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
   24511 
   24512   // Thread synchronization.
   24513   case X86::MONITOR:
   24514     return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
   24515   case X86::MONITORX:
   24516     return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
   24517   // PKU feature
   24518   case X86::WRPKRU:
   24519     return emitWRPKRU(MI, BB, Subtarget);
   24520   case X86::RDPKRU:
   24521     return emitRDPKRU(MI, BB, Subtarget);
   24522   // xbegin
   24523   case X86::XBEGIN:
   24524     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
   24525 
   24526   case X86::VASTART_SAVE_XMM_REGS:
   24527     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
   24528 
   24529   case X86::VAARG_64:
   24530     return EmitVAARG64WithCustomInserter(MI, BB);
   24531 
   24532   case X86::EH_SjLj_SetJmp32:
   24533   case X86::EH_SjLj_SetJmp64:
   24534     return emitEHSjLjSetJmp(MI, BB);
   24535 
   24536   case X86::EH_SjLj_LongJmp32:
   24537   case X86::EH_SjLj_LongJmp64:
   24538     return emitEHSjLjLongJmp(MI, BB);
   24539 
   24540   case X86::Int_eh_sjlj_setup_dispatch:
   24541     return EmitSjLjDispatchBlock(MI, BB);
   24542 
   24543   case TargetOpcode::STATEPOINT:
   24544     // As an implementation detail, STATEPOINT shares the STACKMAP format at
   24545     // this point in the process.  We diverge later.
   24546     return emitPatchPoint(MI, BB);
   24547 
   24548   case TargetOpcode::STACKMAP:
   24549   case TargetOpcode::PATCHPOINT:
   24550     return emitPatchPoint(MI, BB);
   24551 
   24552   case X86::VFMADDPDr213r:
   24553   case X86::VFMADDPSr213r:
   24554   case X86::VFMADDSDr213r:
   24555   case X86::VFMADDSSr213r:
   24556   case X86::VFMSUBPDr213r:
   24557   case X86::VFMSUBPSr213r:
   24558   case X86::VFMSUBSDr213r:
   24559   case X86::VFMSUBSSr213r:
   24560   case X86::VFNMADDPDr213r:
   24561   case X86::VFNMADDPSr213r:
   24562   case X86::VFNMADDSDr213r:
   24563   case X86::VFNMADDSSr213r:
   24564   case X86::VFNMSUBPDr213r:
   24565   case X86::VFNMSUBPSr213r:
   24566   case X86::VFNMSUBSDr213r:
   24567   case X86::VFNMSUBSSr213r:
   24568   case X86::VFMADDSUBPDr213r:
   24569   case X86::VFMADDSUBPSr213r:
   24570   case X86::VFMSUBADDPDr213r:
   24571   case X86::VFMSUBADDPSr213r:
   24572   case X86::VFMADDPDr213rY:
   24573   case X86::VFMADDPSr213rY:
   24574   case X86::VFMSUBPDr213rY:
   24575   case X86::VFMSUBPSr213rY:
   24576   case X86::VFNMADDPDr213rY:
   24577   case X86::VFNMADDPSr213rY:
   24578   case X86::VFNMSUBPDr213rY:
   24579   case X86::VFNMSUBPSr213rY:
   24580   case X86::VFMADDSUBPDr213rY:
   24581   case X86::VFMADDSUBPSr213rY:
   24582   case X86::VFMSUBADDPDr213rY:
   24583   case X86::VFMSUBADDPSr213rY:
   24584     return emitFMA3Instr(MI, BB);
   24585   case X86::LCMPXCHG8B_SAVE_EBX:
   24586   case X86::LCMPXCHG16B_SAVE_RBX: {
   24587     unsigned BasePtr =
   24588         MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
   24589     if (!BB->isLiveIn(BasePtr))
   24590       BB->addLiveIn(BasePtr);
   24591     return BB;
   24592   }
   24593   }
   24594 }
   24595 
   24596 //===----------------------------------------------------------------------===//
   24597 //                           X86 Optimization Hooks
   24598 //===----------------------------------------------------------------------===//
   24599 
   24600 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   24601                                                       APInt &KnownZero,
   24602                                                       APInt &KnownOne,
   24603                                                       const SelectionDAG &DAG,
   24604                                                       unsigned Depth) const {
   24605   unsigned BitWidth = KnownZero.getBitWidth();
   24606   unsigned Opc = Op.getOpcode();
   24607   assert((Opc >= ISD::BUILTIN_OP_END ||
   24608           Opc == ISD::INTRINSIC_WO_CHAIN ||
   24609           Opc == ISD::INTRINSIC_W_CHAIN ||
   24610           Opc == ISD::INTRINSIC_VOID) &&
   24611          "Should use MaskedValueIsZero if you don't know whether Op"
   24612          " is a target node!");
   24613 
   24614   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
   24615   switch (Opc) {
   24616   default: break;
   24617   case X86ISD::ADD:
   24618   case X86ISD::SUB:
   24619   case X86ISD::ADC:
   24620   case X86ISD::SBB:
   24621   case X86ISD::SMUL:
   24622   case X86ISD::UMUL:
   24623   case X86ISD::INC:
   24624   case X86ISD::DEC:
   24625   case X86ISD::OR:
   24626   case X86ISD::XOR:
   24627   case X86ISD::AND:
   24628     // These nodes' second result is a boolean.
   24629     if (Op.getResNo() == 0)
   24630       break;
   24631     // Fallthrough
   24632   case X86ISD::SETCC:
   24633     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
   24634     break;
   24635   case X86ISD::MOVMSK: {
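             // MOVMSK produces one result bit per vector element; all higher bits
             // of the scalar result are known to be zero.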
   24636     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
   24637     KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
   24638     break;
   24639   }
   24640   }
   24641 }
   24642 
   24643 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   24644   SDValue Op,
   24645   const SelectionDAG &,
   24646   unsigned Depth) const {
   24647   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
   24648   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
   24649     return Op.getValueType().getScalarSizeInBits();
   24650 
   24651   // Fallback case.
   24652   return 1;
   24653 }
   24654 
   24655 /// Returns true (and the GlobalValue and the offset) if the node is a
   24656 /// GlobalAddress + offset.
   24657 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   24658                                        const GlobalValue* &GA,
   24659                                        int64_t &Offset) const {
   24660   if (N->getOpcode() == X86ISD::Wrapper) {
   24661     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
   24662       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
   24663       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
   24664       return true;
   24665     }
   24666   }
   24667   return TargetLowering::isGAPlusOffset(N, GA, Offset);
   24668 }
   24669 
   24670 /// Performs shuffle combines for 256-bit vectors.
   24671 /// FIXME: This could be expanded to support 512-bit vectors as well.
   24672 static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
   24673                                  TargetLowering::DAGCombinerInfo &DCI,
   24674                                  const X86Subtarget &Subtarget) {
   24675   SDLoc dl(N);
   24676   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   24677   SDValue V1 = SVOp->getOperand(0);
   24678   SDValue V2 = SVOp->getOperand(1);
   24679   MVT VT = SVOp->getSimpleValueType(0);
   24680   unsigned NumElems = VT.getVectorNumElements();
   24681 
   24682   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
   24683       V2.getOpcode() == ISD::CONCAT_VECTORS) {
   24684     //
   24685     //                   0,0,0,...
   24686     //                      |
   24687     //    V      UNDEF    BUILD_VECTOR    UNDEF
   24688     //     \      /           \           /
   24689     //  CONCAT_VECTOR         CONCAT_VECTOR
   24690     //         \                  /
   24691     //          \                /
   24692     //          RESULT: V + zero extended
   24693     //
   24694     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
   24695         !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
   24696       return SDValue();
   24697 
   24698     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
   24699       return SDValue();
   24700 
   24701     // To match the shuffle mask, the first half of the mask should
   24702     // be exactly the first vector, and all the rest a splat of the
   24703     // first element of the second one.
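             // E.g. for v8i32 the expected mask is <0,1,2,3,8,8,8,8>, where index 8
             // refers to the zero splat provided by the BUILD_VECTOR feeding the
             // second CONCAT_VECTORS.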
   24704     for (unsigned i = 0; i != NumElems/2; ++i)
   24705       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
   24706           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
   24707         return SDValue();
   24708 
   24709     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
   24710     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
   24711       if (Ld->hasNUsesOfValue(1, 0)) {
   24712         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
   24713         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
   24714         SDValue ResNode =
   24715           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
   24716                                   Ld->getMemoryVT(),
   24717                                   Ld->getPointerInfo(),
   24718                                   Ld->getAlignment(),
   24719                                   false/*isVolatile*/, true/*ReadMem*/,
   24720                                   false/*WriteMem*/);
   24721 
   24722         // Make sure the newly-created LOAD is in the same position as Ld in
   24723         // terms of dependency. We create a TokenFactor for Ld and ResNode,
   24724         // and update uses of Ld's output chain to use the TokenFactor.
   24725         if (Ld->hasAnyUseOfValue(1)) {
   24726           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   24727                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
   24728           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
   24729           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
   24730                                  SDValue(ResNode.getNode(), 1));
   24731         }
   24732 
   24733         return DAG.getBitcast(VT, ResNode);
   24734       }
   24735     }
   24736 
   24737     // Emit a zeroed vector and insert the desired subvector on its
   24738     // first half.
   24739     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   24740     SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
   24741     return DCI.CombineTo(N, InsV);
   24742   }
   24743 
   24744   return SDValue();
   24745 }
   24746 
   24747 // Attempt to match a combined shuffle mask against supported unary shuffle
   24748 // instructions.
   24749 // TODO: Investigate sharing more of this with shuffle lowering.
   24750 static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
   24751                                     const X86Subtarget &Subtarget,
   24752                                     unsigned &Shuffle, MVT &ShuffleVT) {
   24753   bool FloatDomain = SrcVT.isFloatingPoint() ||
   24754                      (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
   24755 
   24756   // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
   24757   if (!FloatDomain && SrcVT.is128BitVector() &&
   24758       isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
   24759     Shuffle = X86ISD::VZEXT_MOVL;
   24760     ShuffleVT = MVT::v2i64;
   24761     return true;
   24762   }
   24763 
   24764   // Check if we have SSE3, which will let us use MOVDDUP, etc. These
   24765   // instructions are no slower than UNPCKLPD but have the option to
   24766   // fold the input operand into even an unaligned memory load.
   24767   if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
   24768     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
   24769       Shuffle = X86ISD::MOVDDUP;
   24770       ShuffleVT = MVT::v2f64;
   24771       return true;
   24772     }
   24773     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
   24774       Shuffle = X86ISD::MOVSLDUP;
   24775       ShuffleVT = MVT::v4f32;
   24776       return true;
   24777     }
   24778     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
   24779       Shuffle = X86ISD::MOVSHDUP;
   24780       ShuffleVT = MVT::v4f32;
   24781       return true;
   24782     }
   24783   }
   24784 
   24785   if (SrcVT.is256BitVector() && FloatDomain) {
   24786     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
   24787     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
   24788       Shuffle = X86ISD::MOVDDUP;
   24789       ShuffleVT = MVT::v4f64;
   24790       return true;
   24791     }
   24792     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
   24793       Shuffle = X86ISD::MOVSLDUP;
   24794       ShuffleVT = MVT::v8f32;
   24795       return true;
   24796     }
   24797     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
   24798       Shuffle = X86ISD::MOVSHDUP;
   24799       ShuffleVT = MVT::v8f32;
   24800       return true;
   24801     }
   24802   }
   24803 
   24804   if (SrcVT.is512BitVector() && FloatDomain) {
   24805     assert(Subtarget.hasAVX512() &&
   24806            "AVX512 required for 512-bit vector shuffles");
   24807     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
   24808       Shuffle = X86ISD::MOVDDUP;
   24809       ShuffleVT = MVT::v8f64;
   24810       return true;
   24811     }
   24812     if (isTargetShuffleEquivalent(
   24813             Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
   24814       Shuffle = X86ISD::MOVSLDUP;
   24815       ShuffleVT = MVT::v16f32;
   24816       return true;
   24817     }
   24818     if (isTargetShuffleEquivalent(
   24819             Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
   24820       Shuffle = X86ISD::MOVSHDUP;
   24821       ShuffleVT = MVT::v16f32;
   24822       return true;
   24823     }
   24824   }
   24825 
   24826   // Attempt to match against broadcast-from-vector.
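           // (A broadcast replicates element 0 into every lane, i.e. the all-zeros
           // shuffle mask built below.)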
   24827   if (Subtarget.hasAVX2()) {
   24828     unsigned NumElts = Mask.size();
   24829     SmallVector<int, 64> BroadcastMask(NumElts, 0);
   24830     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
   24831       unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
   24832       ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
   24833                               : MVT::getIntegerVT(EltSize);
   24834       ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
   24835       Shuffle = X86ISD::VBROADCAST;
   24836       return true;
   24837     }
   24838   }
   24839 
   24840   return false;
   24841 }
   24842 
   24843 // Attempt to match a combined shuffle mask against supported unary immediate
   24844 // permute instructions.
   24845 // TODO: Investigate sharing more of this with shuffle lowering.
   24846 static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
   24847                                       const X86Subtarget &Subtarget,
   24848                                       unsigned &Shuffle, MVT &ShuffleVT,
   24849                                       unsigned &PermuteImm) {
   24850   // Ensure the mask doesn't contain any zero elements.
   24851   for (int M : Mask) {
   24852     if (M == SM_SentinelZero)
   24853       return false;
   24854     assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
   24855            "Expected unary shuffle");
   24856   }
   24857 
   24858   unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
   24859   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
   24860 
   24861   // Handle PSHUFLW/PSHUFHW repeated patterns.
   24862   if (MaskScalarSizeInBits == 16) {
   24863     SmallVector<int, 4> RepeatedMask;
   24864     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
   24865       ArrayRef<int> LoMask(Mask.data() + 0, 4);
   24866       ArrayRef<int> HiMask(Mask.data() + 4, 4);
   24867 
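               // getV4X86ShuffleImm packs four 2-bit word indices into the imm8,
               // bits [1:0] selecting the source of word 0 and so on.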
   24868       // PSHUFLW: permute lower 4 elements only.
   24869       if (isUndefOrInRange(LoMask, 0, 4) &&
   24870           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
   24871         Shuffle = X86ISD::PSHUFLW;
   24872         ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
   24873         PermuteImm = getV4X86ShuffleImm(LoMask);
   24874         return true;
   24875       }
   24876 
   24877       // PSHUFHW: permute upper 4 elements only.
   24878       if (isUndefOrInRange(HiMask, 4, 8) &&
   24879           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
   24880         // Offset the HiMask so that we can create the shuffle immediate.
   24881         int OffsetHiMask[4];
   24882         for (int i = 0; i != 4; ++i)
   24883           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
   24884 
   24885         Shuffle = X86ISD::PSHUFHW;
   24886         ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
   24887         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
   24888         return true;
   24889       }
   24890 
   24891       return false;
   24892     }
   24893     return false;
   24894   }
   24895 
   24896   // We only support permutation of 32/64 bit elements after this.
   24897   if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
   24898     return false;
   24899 
   24900   // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
   24901   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
   24902   bool FloatDomain = SrcVT.isFloatingPoint();
   24903   if (FloatDomain && !Subtarget.hasAVX())
   24904     return false;
   24905 
   24906   // Pre-AVX2 we must use float shuffles on 256-bit vectors.
   24907   if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
   24908     FloatDomain = true;
   24909 
   24910   // Check for lane crossing permutes.
   24911   if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
   24912     // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
   24913     if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
   24914       Shuffle = X86ISD::VPERMI;
   24915       ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
   24916       PermuteImm = getV4X86ShuffleImm(Mask);
   24917       return true;
   24918     }
   24919     if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
   24920       SmallVector<int, 4> RepeatedMask;
   24921       if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
   24922         Shuffle = X86ISD::VPERMI;
   24923         ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
   24924         PermuteImm = getV4X86ShuffleImm(RepeatedMask);
   24925         return true;
   24926       }
   24927     }
   24928     return false;
   24929   }
   24930 
   24931   // VPERMILPD can permute with a non-repeating shuffle.
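           // (The immediate carries one selection bit per f64 element, picking the
           // low or high element within that element's 128-bit lane; the loop below
           // accumulates those bits.)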
   24932   if (FloatDomain && MaskScalarSizeInBits == 64) {
   24933     Shuffle = X86ISD::VPERMILPI;
   24934     ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
   24935     PermuteImm = 0;
   24936     for (int i = 0, e = Mask.size(); i != e; ++i) {
   24937       int M = Mask[i];
   24938       if (M == SM_SentinelUndef)
   24939         continue;
   24940       assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
   24941       PermuteImm |= (M & 1) << i;
   24942     }
   24943     return true;
   24944   }
   24945 
   24946   // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
   24947   SmallVector<int, 4> RepeatedMask;
   24948   if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
   24949     return false;
   24950 
   24951   // Narrow the repeated mask for 32-bit element permutes.
   24952   SmallVector<int, 4> WordMask = RepeatedMask;
   24953   if (MaskScalarSizeInBits == 64)
   24954     scaleShuffleMask(2, RepeatedMask, WordMask);
   24955 
   24956   Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
   24957   ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
   24958   ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
   24959   PermuteImm = getV4X86ShuffleImm(WordMask);
   24960   return true;
   24961 }
   24962 
   24963 // Attempt to match a combined unary shuffle mask against supported binary
   24964 // shuffle instructions.
   24965 // TODO: Investigate sharing more of this with shuffle lowering.
   24966 static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
   24967                                      unsigned &Shuffle, MVT &ShuffleVT) {
   24968   bool FloatDomain = SrcVT.isFloatingPoint();
   24969 
   24970   if (SrcVT.is128BitVector()) {
   24971     if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
   24972       Shuffle = X86ISD::MOVLHPS;
   24973       ShuffleVT = MVT::v4f32;
   24974       return true;
   24975     }
   24976     if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
   24977       Shuffle = X86ISD::MOVHLPS;
   24978       ShuffleVT = MVT::v4f32;
   24979       return true;
   24980     }
   24981     if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
   24982       Shuffle = X86ISD::UNPCKL;
   24983       ShuffleVT = MVT::v4f32;
   24984       return true;
   24985     }
   24986     if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
   24987       Shuffle = X86ISD::UNPCKH;
   24988       ShuffleVT = MVT::v4f32;
   24989       return true;
   24990     }
   24991     if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
   24992         isTargetShuffleEquivalent(
   24993             Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
   24994       Shuffle = X86ISD::UNPCKL;
   24995       ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
   24996       return true;
   24997     }
   24998     if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
   24999         isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
   25000                                          13, 14, 14, 15, 15})) {
   25001       Shuffle = X86ISD::UNPCKH;
   25002       ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
   25003       return true;
   25004     }
   25005   }
   25006 
   25007   return false;
   25008 }
   25009 
   25010 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
   25011 /// possible.
   25012 ///
   25013 /// This is the leaf of the recursive combine below. When we have found some
   25014 /// chain of single-use x86 shuffle instructions and accumulated the combined
   25015 /// shuffle mask represented by them, this will try to pattern match that mask
   25016 /// into either a single instruction if there is a special purpose instruction
   25017 /// for this operation, or into a PSHUFB instruction which is a fully general
   25018 /// instruction but should only be used to replace chains over a certain depth.
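          ///
          /// For example, a combined mask of <0, 0> on a 128-bit floating-point
          /// vector becomes a single MOVDDUP (given SSE3) via matchUnaryVectorShuffle
          /// above.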
   25019 static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
   25020                                    ArrayRef<int> BaseMask, int Depth,
   25021                                    bool HasVariableMask, SelectionDAG &DAG,
   25022                                    TargetLowering::DAGCombinerInfo &DCI,
   25023                                    const X86Subtarget &Subtarget) {
   25024   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
   25025 
   25026   // Find the operand that enters the chain. Note that multiple uses are OK
   25027   // here; we're not going to remove the operand we find.
   25028   Input = peekThroughBitcasts(Input);
   25029 
   25030   MVT VT = Input.getSimpleValueType();
   25031   MVT RootVT = Root.getSimpleValueType();
   25032   SDLoc DL(Root);
   25033 
   25034   SDValue Res;
   25035 
   25036   unsigned NumBaseMaskElts = BaseMask.size();
   25037   if (NumBaseMaskElts == 1) {
   25038     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
   25039     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
   25040                   /*AddTo*/ true);
   25041     return true;
   25042   }
   25043 
   25044   unsigned RootSizeInBits = RootVT.getSizeInBits();
   25045   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
   25046 
   25047   // Don't combine if we are an AVX512/EVEX target and the mask element size
   25048   // is different from the root element size - this would prevent writemasks
   25049   // from being reused.
   25050   // TODO - this currently prevents all lane shuffles from occurring.
   25051   // TODO - check for writemasks usage instead of always preventing combining.
   25052   // TODO - attempt to narrow Mask back to writemask size.
   25053   if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
   25054       (RootSizeInBits == 512 ||
   25055        (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
   25056     return false;
   25057   }
   25058 
   25059   // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
   25060 
   25061   // Handle 128-bit lane shuffles of 256-bit vectors.
   25062   if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
   25063       !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
   25064     if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
   25065       return false; // Nothing to do!
   25066     MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
   25067                                                                   : MVT::v4i64);
   25068     unsigned PermMask = 0;
   25069     PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
   25070     PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
   25071 
   25072     Res = DAG.getBitcast(ShuffleVT, Input);
   25073     DCI.AddToWorklist(Res.getNode());
   25074     Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
   25075                       DAG.getUNDEF(ShuffleVT),
   25076                       DAG.getConstant(PermMask, DL, MVT::i8));
   25077     DCI.AddToWorklist(Res.getNode());
   25078     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
   25079                   /*AddTo*/ true);
   25080     return true;
   25081   }
   25082 
   25083   // For masks that have been widened to 128-bit elements or more,
   25084   // narrow back down to 64-bit elements.
   25085   SmallVector<int, 64> Mask;
   25086   if (BaseMaskEltSizeInBits > 64) {
   25087     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
   25088     int MaskScale = BaseMaskEltSizeInBits / 64;
   25089     scaleShuffleMask(MaskScale, BaseMask, Mask);
   25090   } else {
   25091     Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
   25092   }
   25093 
   25094   unsigned NumMaskElts = Mask.size();
   25095   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
   25096 
   25097   // Determine the effective mask value type.
   25098   bool FloatDomain =
   25099       (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
   25100       (32 <= MaskEltSizeInBits);
   25101   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
   25102                            : MVT::getIntegerVT(MaskEltSizeInBits);
   25103   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
   25104 
   25105   // Attempt to match the mask against known shuffle patterns.
   25106   MVT ShuffleVT;
   25107   unsigned Shuffle, PermuteImm;
   25108 
   25109   if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
   25110     if (Depth == 1 && Root.getOpcode() == Shuffle)
   25111       return false; // Nothing to do!
   25112     Res = DAG.getBitcast(ShuffleVT, Input);
   25113     DCI.AddToWorklist(Res.getNode());
   25114     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
   25115     DCI.AddToWorklist(Res.getNode());
   25116     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
   25117                   /*AddTo*/ true);
   25118     return true;
   25119   }
   25120 
   25121   if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
   25122                                 PermuteImm)) {
   25123     if (Depth == 1 && Root.getOpcode() == Shuffle)
   25124       return false; // Nothing to do!
   25125     Res = DAG.getBitcast(ShuffleVT, Input);
   25126     DCI.AddToWorklist(Res.getNode());
   25127     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
   25128                       DAG.getConstant(PermuteImm, DL, MVT::i8));
   25129     DCI.AddToWorklist(Res.getNode());
   25130     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
   25131                   /*AddTo*/ true);
   25132     return true;
   25133   }
   25134 
   25135   if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
   25136     if (Depth == 1 && Root.getOpcode() == Shuffle)
   25137       return false; // Nothing to do!
   25138     Res = DAG.getBitcast(ShuffleVT, Input);
   25139     DCI.AddToWorklist(Res.getNode());
   25140     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
   25141     DCI.AddToWorklist(Res.getNode());
   25142     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
   25143                   /*AddTo*/ true);
   25144     return true;
   25145   }
   25146 
   25147   // Attempt to blend with zero.
   25148   if (NumMaskElts <= 8 &&
   25149       ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
   25150        (Subtarget.hasAVX() && VT.is256BitVector()))) {
   25151     // Convert VT to a type compatible with X86ISD::BLENDI.
   25152     // TODO - add v16i16 support (requires lane duplication).
   25153     MVT ShuffleVT = MaskVT;
   25154     if (Subtarget.hasAVX2()) {
   25155       if (ShuffleVT == MVT::v4i64)
   25156         ShuffleVT = MVT::v8i32;
   25157       else if (ShuffleVT == MVT::v2i64)
   25158         ShuffleVT = MVT::v4i32;
   25159     } else {
   25160       if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
   25161         ShuffleVT = MVT::v8i16;
   25162       else if (ShuffleVT == MVT::v4i64)
   25163         ShuffleVT = MVT::v4f64;
   25164       else if (ShuffleVT == MVT::v8i32)
   25165         ShuffleVT = MVT::v8f32;
   25166     }
   25167 
   25168     if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
   25169                                          /*Low*/ 0) &&
   25170         NumMaskElts <= ShuffleVT.getVectorNumElements()) {
   25171       unsigned BlendMask = 0;
   25172       unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
   25173       unsigned MaskRatio = ShuffleSize / NumMaskElts;
   25174 
   25175       if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
   25176         return false;
   25177 
   25178       for (unsigned i = 0; i != ShuffleSize; ++i)
   25179         if (Mask[i / MaskRatio] < 0)
   25180           BlendMask |= 1u << i;
   25181 
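               // Each set bit in the BLENDI immediate takes that element from the
               // second operand (the zero vector below), so exactly the sentinel
               // lanes are zeroed.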
   25182       SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
   25183       Res = DAG.getBitcast(ShuffleVT, Input);
   25184       DCI.AddToWorklist(Res.getNode());
   25185       Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
   25186                         DAG.getConstant(BlendMask, DL, MVT::i8));
   25187       DCI.AddToWorklist(Res.getNode());
   25188       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
   25189                     /*AddTo*/ true);
   25190       return true;
   25191     }
   25192   }
   25193 
   25194   // Attempt to combine to INSERTPS.
   25195   if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
   25196       (VT == MVT::v2f64 || VT == MVT::v4f32)) {
   25197     SmallBitVector Zeroable(4, false);
   25198     for (unsigned i = 0; i != NumMaskElts; ++i)
   25199       if (Mask[i] < 0)
   25200         Zeroable[i] = true;
   25201 
   25202     unsigned InsertPSMask;
   25203     SDValue V1 = Input, V2 = Input;
   25204     if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
   25205                                                        Zeroable, Mask, DAG)) {
   25206       if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
   25207         return false; // Nothing to do!
   25208       V1 = DAG.getBitcast(MVT::v4f32, V1);
   25209       DCI.AddToWorklist(V1.getNode());
   25210       V2 = DAG.getBitcast(MVT::v4f32, V2);
   25211       DCI.AddToWorklist(V2.getNode());
   25212       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
   25213                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
   25214       DCI.AddToWorklist(Res.getNode());
   25215       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
   25216                     /*AddTo*/ true);
   25217       return true;
   25218     }
   25219   }
   25220 
   25221   // Don't try to re-form single instruction chains under any circumstances now
   25222   // that we've done encoding canonicalization for them.
   25223   if (Depth < 2)
   25224     return false;
   25225 
   25226   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
   25227     return false;
   25228 
   25229   bool MaskContainsZeros =
   25230       llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
   25231 
   25232   // If we have a single input shuffle with different shuffle patterns in the
   25233   // 128-bit lanes, use a variable VPERMILPS mask.
   25234   // TODO Combine other mask types at higher depths.
   25235   if (HasVariableMask && !MaskContainsZeros &&
   25236       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
   25237        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
   25238     SmallVector<SDValue, 16> VPermIdx;
   25239     for (int M : Mask) {
   25240       SDValue Idx =
   25241           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
   25242       VPermIdx.push_back(Idx);
   25243     }
   25244     MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
   25245     SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
   25246     DCI.AddToWorklist(VPermMask.getNode());
   25247     Res = DAG.getBitcast(MaskVT, Input);
   25248     DCI.AddToWorklist(Res.getNode());
   25249     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
   25250     DCI.AddToWorklist(Res.getNode());
   25251     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
   25252                   /*AddTo*/ true);
   25253     return true;
   25254   }
   25255 
   25256   // If we have 3 or more shuffle instructions or a chain involving a variable
   25257   // mask, we can replace them with a single PSHUFB instruction profitably.
   25258   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
   25259   // instructions, but in practice PSHUFB tends to be *very* fast so we're
   25260   // more aggressive.
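           // PSHUFB zeroes any byte whose control byte has its high bit set (hence
           // the 255 entries used below for zero lanes) and otherwise selects a byte
           // from within the same 128-bit lane.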
   25261   if ((Depth >= 3 || HasVariableMask) &&
   25262       ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
   25263        (VT.is256BitVector() && Subtarget.hasAVX2()) ||
   25264        (VT.is512BitVector() && Subtarget.hasBWI()))) {
   25265     SmallVector<SDValue, 16> PSHUFBMask;
   25266     int NumBytes = VT.getSizeInBits() / 8;
   25267     int Ratio = NumBytes / NumMaskElts;
   25268     for (int i = 0; i < NumBytes; ++i) {
   25269       int M = Mask[i / Ratio];
   25270       if (M == SM_SentinelUndef) {
   25271         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
   25272         continue;
   25273       }
   25274       if (M == SM_SentinelZero) {
   25275         PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
   25276         continue;
   25277       }
   25278       M = Ratio * M + i % Ratio;
   25279       assert ((M / 16) == (i / 16) && "Lane crossing detected");
   25280       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
   25281     }
   25282     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
   25283     Res = DAG.getBitcast(ByteVT, Input);
   25284     DCI.AddToWorklist(Res.getNode());
   25285     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
   25286     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
   25287     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
   25288     DCI.AddToWorklist(Res.getNode());
   25289     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
   25290                   /*AddTo*/ true);
   25291     return true;
   25292   }
   25293 
   25294   // Failed to find any combines.
   25295   return false;
   25296 }
   25297 
   25298 /// \brief Fully generic combining of x86 shuffle instructions.
   25299 ///
   25300 /// This should be the last combine run over the x86 shuffle instructions. Once
   25301 /// they have been fully optimized, this will recursively consider all chains
   25302 /// of single-use shuffle instructions, build a generic model of the cumulative
   25303 /// shuffle operation, and check for simpler instructions which implement this
   25304 /// operation. We use this primarily for two purposes:
   25305 ///
   25306 /// 1) Collapse generic shuffles to specialized single instructions when
   25307 ///    equivalent. In most cases, this is just an encoding size win, but
   25308 ///    sometimes we will collapse multiple generic shuffles into a single
   25309 ///    special-purpose shuffle.
   25310 /// 2) Look for sequences of shuffle instructions with 3 or more total
   25311 ///    instructions, and replace them with the slightly more expensive SSSE3
   25312 ///    PSHUFB instruction if available. We do this as the last combining step
   25313 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
   25314 ///    a suitable short sequence of other instructions. The PSHUFB will either
   25315 ///    use a register or have to read from memory and so is slightly (but only
   25316 ///    slightly) more expensive than the other shuffle instructions.
   25317 ///
   25318 /// Because this is inherently a quadratic operation (for each shuffle in
   25319 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
   25320 /// This should never be an issue in practice as the shuffle lowering doesn't
   25321 /// produce sequences of more than 8 instructions.
   25322 ///
   25323 /// FIXME: We will currently miss some cases where the redundant shuffling
   25324 /// would simplify under the threshold for PSHUFB formation because of
   25325 /// combine-ordering. To fix this, we should do the redundant instruction
   25326 /// combining in this recursive walk.
   25327 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
   25328                                           ArrayRef<int> RootMask,
   25329                                           int Depth, bool HasVariableMask,
   25330                                           SelectionDAG &DAG,
   25331                                           TargetLowering::DAGCombinerInfo &DCI,
   25332                                           const X86Subtarget &Subtarget) {
   25333   // Bound the depth of our recursive combine because this is ultimately
   25334   // quadratic in nature.
   25335   if (Depth > 8)
   25336     return false;
   25337 
   25338   // Directly rip through bitcasts to find the underlying operand.
   25339   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
   25340     Op = Op.getOperand(0);
   25341 
   25342   MVT VT = Op.getSimpleValueType();
   25343   if (!VT.isVector())
   25344     return false; // Bail if we hit a non-vector.
   25345 
   25346   assert(Root.getSimpleValueType().isVector() &&
   25347          "Shuffles operate on vector types!");
   25348   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
   25349          "Can only combine shuffles of the same vector register size.");
   25350 
   25351   // Extract target shuffle mask and resolve sentinels and inputs.
   25352   SDValue Input0, Input1;
   25353   SmallVector<int, 16> OpMask;
   25354   if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
   25355     return false;
   25356 
   25357   assert(VT.getVectorNumElements() == OpMask.size() &&
   25358          "Different mask size from vector size!");
   25359   assert(((RootMask.size() > OpMask.size() &&
   25360            RootMask.size() % OpMask.size() == 0) ||
   25361           (OpMask.size() > RootMask.size() &&
   25362            OpMask.size() % RootMask.size() == 0) ||
   25363           OpMask.size() == RootMask.size()) &&
   25364          "The smaller number of elements must divide the larger.");
   25365   int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
   25366   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
   25367   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
   25368   assert(((RootRatio == 1 && OpRatio == 1) ||
   25369           (RootRatio == 1) != (OpRatio == 1)) &&
   25370          "Must not have a ratio for both incoming and op masks!");
   25371 
   25372   SmallVector<int, 16> Mask;
   25373   Mask.reserve(MaskWidth);
   25374 
   25375   // Merge this shuffle operation's mask into our accumulated mask. Note that
   25376   // this shuffle's mask will be the first applied to the input, followed by the
   25377   // root mask to get us all the way to the root value arrangement. The reason
   25378   // for this order is that we are recursing up the operation chain.
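           // For example, merging a v2i64 root mask <1,0> on top of a v4i32 op mask
           // <2,3,0,1> gives MaskWidth == 4, RootRatio == 2, OpRatio == 1 and the
           // combined v4i32 mask <0,1,2,3>; the two half-swaps cancel out.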
   25379   for (int i = 0; i < MaskWidth; ++i) {
   25380     int RootIdx = i / RootRatio;
   25381     if (RootMask[RootIdx] < 0) {
   25382       // This is a zero or undef lane, we're done.
   25383       Mask.push_back(RootMask[RootIdx]);
   25384       continue;
   25385     }
   25386 
   25387     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
   25388     int OpIdx = RootMaskedIdx / OpRatio;
   25389     if (OpMask[OpIdx] < 0) {
   25390       // The incoming lanes are zero or undef; it doesn't matter which ones we
   25391       // are using.
   25392       Mask.push_back(OpMask[OpIdx]);
   25393       continue;
   25394     }
   25395 
   25396     // Ok, we have non-zero lanes, map them through.
   25397     Mask.push_back(OpMask[OpIdx] * OpRatio +
   25398                    RootMaskedIdx % OpRatio);
   25399   }
   25400 
   25401   // Handle the all undef/zero cases early.
   25402   if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
   25403     DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
   25404     return true;
   25405   }
   25406   if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
   25407     // TODO - should we handle the mixed zero/undef case as well? Just returning
   25408     // a zero mask loses information about undef elements, possibly reducing
   25409     // future combine opportunities.
   25410     DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
   25411                                                 Subtarget, DAG, SDLoc(Root)));
   25412     return true;
   25413   }
   25414 
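           // Indices in [0, MaskSize) select elements from Input0; indices in
           // [MaskSize, 2*MaskSize) select elements from Input1.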
   25415   int MaskSize = Mask.size();
   25416   bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
   25417                   [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
   25418   bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
   25419                   [MaskSize](int Idx) { return MaskSize <= Idx; });
   25420 
   25421   // At the moment we can only combine unary shuffle mask cases.
   25422   if (UseInput0 && UseInput1)
   25423     return false;
   25424   else if (UseInput1) {
   25425     std::swap(Input0, Input1);
   25426     ShuffleVectorSDNode::commuteMask(Mask);
   25427   }
   25428 
   25429   assert(Input0 && "Shuffle with no inputs detected");
   25430 
   25431   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
   25432 
   25433   // See if we can recurse into Input0 (if it's a target shuffle).
   25434   if (Op->isOnlyUserOf(Input0.getNode()) &&
   25435       combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
   25436                                     HasVariableMask, DAG, DCI, Subtarget))
   25437     return true;
   25438 
   25439   // Minor canonicalization of the accumulated shuffle mask to make it easier
   25440   // to match below. All this does is detect masks with sequential pairs of
   25441   // elements, and shrink them to the half-width mask. It does this in a loop
   25442   // so it will reduce the size of the mask to the minimal width mask which
   25443   // performs an equivalent shuffle.
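           // For example, the v4i32 mask <2,3,0,1> widens to the v2i64 mask <1,0>.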
   25444   SmallVector<int, 16> WidenedMask;
   25445   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
   25446     Mask = std::move(WidenedMask);
   25447   }
   25448 
   25449   return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
   25450                                 DCI, Subtarget);
   25451 }
   25452 
   25453 /// \brief Get the PSHUF-style mask from PSHUF node.
   25454 ///
   25455 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
   25456 /// PSHUF-style masks that can be reused with such instructions.
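         /// For example, a PSHUFHW of v8i16 whose full mask is <0,1,2,3,7,6,5,4>
         /// yields the 4-element mask <3,2,1,0>.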
   25457 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
   25458   MVT VT = N.getSimpleValueType();
   25459   SmallVector<int, 4> Mask;
   25460   SmallVector<SDValue, 2> Ops;
   25461   bool IsUnary;
   25462   bool HaveMask =
   25463       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
   25464   (void)HaveMask;
   25465   assert(HaveMask);
   25466 
   25467   // If we have more than 128-bits, only the low 128-bits of shuffle mask
   25468   // matter. Check that the upper masks are repeats and remove them.
   25469   if (VT.getSizeInBits() > 128) {
   25470     int LaneElts = 128 / VT.getScalarSizeInBits();
   25471 #ifndef NDEBUG
   25472     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
   25473       for (int j = 0; j < LaneElts; ++j)
   25474         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
   25475                "Mask doesn't repeat in high 128-bit lanes!");
   25476 #endif
   25477     Mask.resize(LaneElts);
   25478   }
   25479 
   25480   switch (N.getOpcode()) {
   25481   case X86ISD::PSHUFD:
   25482     return Mask;
   25483   case X86ISD::PSHUFLW:
   25484     Mask.resize(4);
   25485     return Mask;
   25486   case X86ISD::PSHUFHW:
   25487     Mask.erase(Mask.begin(), Mask.begin() + 4);
   25488     for (int &M : Mask)
   25489       M -= 4;
   25490     return Mask;
   25491   default:
   25492     llvm_unreachable("No valid shuffle instruction found!");
   25493   }
   25494 }
   25495 
   25496 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
   25497 ///
   25498 /// We walk up the chain and look for a combinable shuffle, skipping over
   25499 /// shuffles that we could hoist this shuffle's transformation past without
   25500 /// altering anything.
   25501 static SDValue
   25502 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
   25503                              SelectionDAG &DAG,
   25504                              TargetLowering::DAGCombinerInfo &DCI) {
   25505   assert(N.getOpcode() == X86ISD::PSHUFD &&
   25506          "Called with something other than an x86 PSHUFD (dword) shuffle!");
   25507   SDLoc DL(N);
   25508 
   25509   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
   25510   // of the shuffles in the chain so that we can form a fresh chain to replace
   25511   // this one.
   25512   SmallVector<SDValue, 8> Chain;
   25513   SDValue V = N.getOperand(0);
   25514   for (; V.hasOneUse(); V = V.getOperand(0)) {
   25515     switch (V.getOpcode()) {
   25516     default:
   25517       return SDValue(); // Nothing combined!
   25518 
   25519     case ISD::BITCAST:
   25520       // Skip bitcasts as we always know the type for the target specific
   25521       // instructions.
   25522       continue;
   25523 
   25524     case X86ISD::PSHUFD:
   25525       // Found another dword shuffle.
   25526       break;
   25527 
   25528     case X86ISD::PSHUFLW:
   25529       // Check that the low words (being shuffled) are the identity in the
   25530       // dword shuffle, and the high words are self-contained.
   25531       if (Mask[0] != 0 || Mask[1] != 1 ||
   25532           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
   25533         return SDValue();
   25534 
   25535       Chain.push_back(V);
   25536       continue;
   25537 
   25538     case X86ISD::PSHUFHW:
   25539       // Check that the high words (being shuffled) are the identity in the
   25540       // dword shuffle, and the low words are self-contained.
   25541       if (Mask[2] != 2 || Mask[3] != 3 ||
   25542           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
   25543         return SDValue();
   25544 
   25545       Chain.push_back(V);
   25546       continue;
   25547 
   25548     case X86ISD::UNPCKL:
   25549     case X86ISD::UNPCKH:
   25550       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
   25551       // shuffle into a preceding word shuffle.
   25552       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
   25553           V.getSimpleValueType().getVectorElementType() != MVT::i16)
   25554         return SDValue();
   25555 
   25556       // Search for a half-shuffle which we can combine with.
   25557       unsigned CombineOp =
   25558           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
   25559       if (V.getOperand(0) != V.getOperand(1) ||
   25560           !V->isOnlyUserOf(V.getOperand(0).getNode()))
   25561         return SDValue();
   25562       Chain.push_back(V);
   25563       V = V.getOperand(0);
   25564       do {
   25565         switch (V.getOpcode()) {
   25566         default:
   25567           return SDValue(); // Nothing to combine.
   25568 
   25569         case X86ISD::PSHUFLW:
   25570         case X86ISD::PSHUFHW:
   25571           if (V.getOpcode() == CombineOp)
   25572             break;
   25573 
   25574           Chain.push_back(V);
   25575 
   25576           // Fallthrough!
   25577         case ISD::BITCAST:
   25578           V = V.getOperand(0);
   25579           continue;
   25580         }
   25581         break;
   25582       } while (V.hasOneUse());
   25583       break;
   25584     }
   25585     // Break out of the loop if we break out of the switch.
   25586     break;
   25587   }
   25588 
   25589   if (!V.hasOneUse())
   25590     // We fell out of the loop without finding a viable combining instruction.
   25591     return SDValue();
   25592 
   25593   // Merge this node's mask and our incoming mask.
   25594   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   25595   for (int &M : Mask)
   25596     M = VMask[M];
   25597   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
   25598                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   25599 
   25600   // Rebuild the chain around this new shuffle.
   25601   while (!Chain.empty()) {
   25602     SDValue W = Chain.pop_back_val();
   25603 
   25604     if (V.getValueType() != W.getOperand(0).getValueType())
   25605       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
   25606 
   25607     switch (W.getOpcode()) {
   25608     default:
   25609       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
   25610 
   25611     case X86ISD::UNPCKL:
   25612     case X86ISD::UNPCKH:
   25613       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
   25614       break;
   25615 
   25616     case X86ISD::PSHUFD:
   25617     case X86ISD::PSHUFLW:
   25618     case X86ISD::PSHUFHW:
   25619       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
   25620       break;
   25621     }
   25622   }
   25623   if (V.getValueType() != N.getValueType())
   25624     V = DAG.getBitcast(N.getValueType(), V);
   25625 
   25626   // Return the new chain to replace N.
   25627   return V;
   25628 }
   25629 
   25630 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
   25631 /// pshufhw.
   25632 ///
   25633 /// We walk up the chain, skipping shuffles of the other half and looking
   25634 /// through shuffles which switch halves trying to find a shuffle of the same
   25635 /// pair of dwords.
   25636 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
   25637                                         SelectionDAG &DAG,
   25638                                         TargetLowering::DAGCombinerInfo &DCI) {
   25639   assert(
   25640       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
   25641       "Called with something other than an x86 128-bit half shuffle!");
   25642   SDLoc DL(N);
   25643   unsigned CombineOpcode = N.getOpcode();
   25644 
   25645   // Walk up a single-use chain looking for a combinable shuffle.
   25646   SDValue V = N.getOperand(0);
   25647   for (; V.hasOneUse(); V = V.getOperand(0)) {
   25648     switch (V.getOpcode()) {
   25649     default:
   25650       return false; // Nothing combined!
   25651 
   25652     case ISD::BITCAST:
   25653       // Skip bitcasts as we always know the type for the target specific
   25654       // instructions.
   25655       continue;
   25656 
   25657     case X86ISD::PSHUFLW:
   25658     case X86ISD::PSHUFHW:
   25659       if (V.getOpcode() == CombineOpcode)
   25660         break;
   25661 
   25662       // Other-half shuffles are no-ops.
   25663       continue;
   25664     }
   25665     // Break out of the loop if we break out of the switch.
   25666     break;
   25667   }
   25668 
   25669   if (!V.hasOneUse())
   25670     // We fell out of the loop without finding a viable combining instruction.
   25671     return false;
   25672 
   25673   // Combine away the bottom node as its shuffle will be accumulated into
   25674   // a preceding shuffle.
   25675   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
   25676 
   25677   // Record the old value.
   25678   SDValue Old = V;
   25679 
   25680   // Merge this node's mask and our incoming mask (adjusted to account for all
   25681   // the pshufd instructions encountered).
   25682   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   25683   for (int &M : Mask)
   25684     M = VMask[M];
   25685   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
   25686                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   25687 
   25688   // Check that the shuffles didn't cancel each other out. If not, we need to
   25689   // combine to the new one.
   25690   if (Old != V)
   25691     // Replace the combinable shuffle with the combined one, updating all users
   25692     // so that we re-evaluate the chain here.
   25693     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
   25694 
   25695   return true;
   25696 }
   25697 
   25698 /// \brief Try to combine x86 target specific shuffles.
   25699 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   25700                                     TargetLowering::DAGCombinerInfo &DCI,
   25701                                     const X86Subtarget &Subtarget) {
   25702   SDLoc DL(N);
   25703   MVT VT = N.getSimpleValueType();
   25704   SmallVector<int, 4> Mask;
   25705 
   25706   switch (N.getOpcode()) {
   25707   case X86ISD::PSHUFD:
   25708   case X86ISD::PSHUFLW:
   25709   case X86ISD::PSHUFHW:
   25710     Mask = getPSHUFShuffleMask(N);
   25711     assert(Mask.size() == 4);
   25712     break;
   25713   case X86ISD::UNPCKL: {
   25714     // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
   25715     // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
   25716     // moves upper half elements into the lower half part. For example:
   25717     //
   25718     // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
   25719     //     undef:v16i8
   25720     // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
   25721     //
   25722     // will be combined to:
   25723     //
   25724     // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
   25725 
   25726     // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
   25727     // happen because more advanced instructions are used instead.
   25728     if (!VT.is128BitVector())
   25729       return SDValue();
   25730 
   25731     auto Op0 = N.getOperand(0);
   25732     auto Op1 = N.getOperand(1);
   25733     if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
   25734       ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
   25735 
   25736       unsigned NumElts = VT.getVectorNumElements();
   25737       SmallVector<int, 8> ExpectedMask(NumElts, -1);
   25738       std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
   25739                 NumElts / 2);
   25740 
   25741       auto ShufOp = Op1.getOperand(0);
   25742       if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
   25743         return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
   25744     }
   25745     return SDValue();
   25746   }
   25747   case X86ISD::BLENDI: {
   25748     SDValue V0 = N->getOperand(0);
   25749     SDValue V1 = N->getOperand(1);
   25750     assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
   25751            "Unexpected input vector types");
   25752 
   25753     // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
   25754     // operands and changing the mask to 1. This saves us a bunch of
   25755     // pattern-matching possibilities related to scalar math ops in SSE/AVX.
   25756     // x86InstrInfo knows how to commute this back after instruction selection
   25757     // if it would help register allocation.
   25758 
   25759     // TODO: If optimizing for size or a processor that doesn't suffer from
   25760     // partial register update stalls, this should be transformed into a MOVSD
   25761     // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
   25762 
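             // For v2f64, (BLENDI V0, V1, 2) produces <V0[0],V1[1]>, which is exactly
             // what (BLENDI V1, V0, 1) produces with the operands swapped.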
   25763     if (VT == MVT::v2f64)
   25764       if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
   25765         if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
   25766           SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
   25767           return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
   25768         }
   25769 
   25770     // Attempt to merge blend(insertps(x,y),zero).
   25771     if (V0.getOpcode() == X86ISD::INSERTPS ||
   25772         V1.getOpcode() == X86ISD::INSERTPS) {
   25773       assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
   25774 
   25775       // Determine which elements are known to be zero.
   25776       SmallVector<int, 8> TargetMask;
   25777       SmallVector<SDValue, 2> BlendOps;
   25778       if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
   25779         return SDValue();
   25780 
   25781       // Helper function to take an inner insertps node and attempt to
   25782       // merge the blend with zero into its zero mask.
   25783       auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
   25784         if (V.getOpcode() != X86ISD::INSERTPS)
   25785           return SDValue();
   25786         SDValue Op0 = V.getOperand(0);
   25787         SDValue Op1 = V.getOperand(1);
   25788         SDValue Op2 = V.getOperand(2);
   25789         unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
   25790 
   25791         // Check each element of the blend node's target mask - must either
   25792         // be zeroable (and update the zero mask) or selects the element from
   25793         // the inner insertps node.
   25794         for (int i = 0; i != 4; ++i)
   25795           if (TargetMask[i] < 0)
   25796             InsertPSMask |= (1u << i);
   25797           else if (TargetMask[i] != (i + Offset))
   25798             return SDValue();
   25799         return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
   25800                            DAG.getConstant(InsertPSMask, DL, MVT::i8));
   25801       };
   25802 
   25803       if (SDValue V = MergeInsertPSAndBlend(V0, 0))
   25804         return V;
   25805       if (SDValue V = MergeInsertPSAndBlend(V1, 4))
   25806         return V;
   25807     }
   25808     return SDValue();
   25809   }
   25810   case X86ISD::INSERTPS: {
   25811     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
   25812     SDValue Op0 = N.getOperand(0);
   25813     SDValue Op1 = N.getOperand(1);
   25814     SDValue Op2 = N.getOperand(2);
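             // The INSERTPS immediate encodes: bits [7:6] = source element index,
             // bits [5:4] = destination element index, bits [3:0] = zero mask.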
   25815     unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
   25816     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
   25817     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
   25818     unsigned ZeroMask = InsertPSMask & 0xF;
   25819 
   25820     // If we zero out all elements from Op0 then we don't need to reference it.
   25821     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
   25822       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
   25823                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
   25824 
   25825     // If we zero out the element from Op1 then we don't need to reference it.
   25826     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
   25827       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
   25828                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
   25829 
   25830     // Attempt to merge insertps Op1 with an inner target shuffle node.
   25831     SmallVector<int, 8> TargetMask1;
   25832     SmallVector<SDValue, 2> Ops1;
   25833     if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
   25834       int M = TargetMask1[SrcIdx];
   25835       if (isUndefOrZero(M)) {
   25836         // Zero/UNDEF insertion - zero out element and remove dependency.
   25837         InsertPSMask |= (1u << DstIdx);
   25838         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
   25839                            DAG.getConstant(InsertPSMask, DL, MVT::i8));
   25840       }
   25841       // Update insertps mask srcidx and reference the source input directly.
   25842       assert(0 <= M && M < 8 && "Shuffle index out of range");
   25843       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
   25844       Op1 = Ops1[M < 4 ? 0 : 1];
   25845       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
   25846                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
   25847     }
   25848 
   25849     // Attempt to merge insertps Op0 with an inner target shuffle node.
   25850     SmallVector<int, 8> TargetMask0;
   25851     SmallVector<SDValue, 2> Ops0;
   25852     if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
   25853       return SDValue();
   25854 
   25855     bool Updated = false;
   25856     bool UseInput00 = false;
   25857     bool UseInput01 = false;
   25858     for (int i = 0; i != 4; ++i) {
   25859       int M = TargetMask0[i];
   25860       if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
   25861         // No change if element is already zero or the inserted element.
   25862         continue;
   25863       } else if (isUndefOrZero(M)) {
   25864         // If the target mask is undef/zero then we must zero the element.
   25865         InsertPSMask |= (1u << i);
   25866         Updated = true;
   25867         continue;
   25868       }
   25869 
   25870       // The input vector element must be inline.
   25871       if (M != i && M != (i + 4))
   25872         return SDValue();
   25873 
   25874       // Determine which inputs of the target shuffle we're using.
   25875       UseInput00 |= (0 <= M && M < 4);
   25876       UseInput01 |= (4 <= M);
   25877     }
   25878 
   25879     // If we're not using both inputs of the target shuffle then use the
   25880     // referenced input directly.
   25881     if (UseInput00 && !UseInput01) {
   25882       Updated = true;
   25883       Op0 = Ops0[0];
   25884     } else if (!UseInput00 && UseInput01) {
   25885       Updated = true;
   25886       Op0 = Ops0[1];
   25887     }
   25888 
   25889     if (Updated)
   25890       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
   25891                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
   25892 
   25893     return SDValue();
   25894   }
   25895   default:
   25896     return SDValue();
   25897   }
   25898 
   25899   // Nuke no-op shuffles that show up after combining.
   25900   if (isNoopShuffleMask(Mask))
   25901     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
   25902 
   25903   // Look for simplifications involving one or two shuffle instructions.
   25904   SDValue V = N.getOperand(0);
   25905   switch (N.getOpcode()) {
   25906   default:
   25907     break;
   25908   case X86ISD::PSHUFLW:
   25909   case X86ISD::PSHUFHW:
   25910     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
   25911 
   25912     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
   25913       return SDValue(); // We combined away this shuffle, so we're done.
   25914 
   25915     // See if this reduces to a PSHUFD which is no more expensive and can
   25916     // combine with more operations. Note that it has to at least flip the
   25917     // dwords as otherwise it would have been removed as a no-op.
   25918     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
   25919       int DMask[] = {0, 1, 2, 3};
   25920       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
   25921       DMask[DOffset + 0] = DOffset + 1;
   25922       DMask[DOffset + 1] = DOffset + 0;
   25923       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   25924       V = DAG.getBitcast(DVT, V);
   25925       DCI.AddToWorklist(V.getNode());
   25926       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
   25927                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
   25928       DCI.AddToWorklist(V.getNode());
   25929       return DAG.getBitcast(VT, V);
   25930     }
   25931 
   25932     // Look for shuffle patterns which can be implemented as a single unpack.
   25933     // FIXME: This doesn't handle the location of the PSHUFD generically, and
   25934     // only works when we have a PSHUFD followed by two half-shuffles.
   25935     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
   25936         (V.getOpcode() == X86ISD::PSHUFLW ||
   25937          V.getOpcode() == X86ISD::PSHUFHW) &&
   25938         V.getOpcode() != N.getOpcode() &&
   25939         V.hasOneUse()) {
   25940       SDValue D = V.getOperand(0);
   25941       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
   25942         D = D.getOperand(0);
   25943       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
   25944         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   25945         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
   25946         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   25947         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   25948         int WordMask[8];
   25949         for (int i = 0; i < 4; ++i) {
   25950           WordMask[i + NOffset] = Mask[i] + NOffset;
   25951           WordMask[i + VOffset] = VMask[i] + VOffset;
   25952         }
   25953         // Map the word mask through the DWord mask.
   25954         int MappedMask[8];
   25955         for (int i = 0; i < 8; ++i)
   25956           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
   25957         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
   25958             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
   25959           // We can replace all three shuffles with an unpack.
   25960           V = DAG.getBitcast(VT, D.getOperand(0));
   25961           DCI.AddToWorklist(V.getNode());
   25962           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
   25963                                                 : X86ISD::UNPCKH,
   25964                              DL, VT, V, V);
   25965         }
   25966       }
   25967     }
   25968 
   25969     break;
   25970 
   25971   case X86ISD::PSHUFD:
   25972     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
   25973       return NewN;
   25974 
   25975     break;
   25976   }
   25977 
   25978   return SDValue();
   25979 }
   25980 
   25981 /// \brief Try to combine a shuffle into a target-specific add-sub node.
   25982 ///
   25983 /// We combine this directly on the abstract vector shuffle nodes so it is
   25984 /// easier to generically match. We also insert dummy vector shuffle nodes for
   25985 /// the operands which explicitly discard the lanes which are unused by this
   25986 /// operation to try to flow through the rest of the combiner the fact that
   25987 /// they're unused.
   25988 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
   25989                                       SelectionDAG &DAG) {
   25990   SDLoc DL(N);
   25991   EVT VT = N->getValueType(0);
   25992   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
   25993       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
   25994     return SDValue();
   25995 
   25996   // We only handle target-independent shuffles.
   25997   // FIXME: It would be easy and harmless to use the target shuffle mask
   25998   // extraction tool to support more.
   25999   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
   26000     return SDValue();
   26001 
   26002   auto *SVN = cast<ShuffleVectorSDNode>(N);
   26003   SmallVector<int, 8> Mask;
   26004   for (int M : SVN->getMask())
   26005     Mask.push_back(M);
   26006 
   26007   SDValue V1 = N->getOperand(0);
   26008   SDValue V2 = N->getOperand(1);
   26009 
   26010   // We require the first shuffle operand to be the FSUB node, and the second to
   26011   // be the FADD node.
   26012   if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
   26013     ShuffleVectorSDNode::commuteMask(Mask);
   26014     std::swap(V1, V2);
   26015   } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
   26016     return SDValue();
   26017 
   26018   // If there are other uses of these operations we can't fold them.
   26019   if (!V1->hasOneUse() || !V2->hasOneUse())
   26020     return SDValue();
   26021 
   26022   // Ensure that both operations have the same operands. Note that we can
   26023   // commute the FADD operands.
   26024   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
   26025   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
   26026       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
   26027     return SDValue();
   26028 
   26029   // We're looking for blends between FADD and FSUB nodes. We insist on these
   26030   // nodes being lined up in a specific expected pattern.
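           // ADDSUB subtracts in the even lanes and adds in the odd lanes, so the
           // shuffle must take its even elements from the FSUB node (V1) and its odd
           // elements from the FADD node (V2), e.g. <0,5,2,7> for v4f32.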
   26031   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
   26032         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
   26033         isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
   26034     return SDValue();
   26035 
   26036   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
   26037 }
   26038 
   26039 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
   26040                               TargetLowering::DAGCombinerInfo &DCI,
   26041                               const X86Subtarget &Subtarget) {
   26042   SDLoc dl(N);
   26043   EVT VT = N->getValueType(0);
   26044 
   26045   // Don't create instructions with illegal types after legalize types has run.
   26046   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   26047   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
   26048     return SDValue();
   26049 
   26050   // If we have legalized the vector types, look for blends of FADD and FSUB
   26051   // nodes that we can fuse into an ADDSUB node.
   26052   if (TLI.isTypeLegal(VT))
   26053     if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
   26054       return AddSub;
   26055 
   26056   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
   26057   if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
   26058       N->getOpcode() == ISD::VECTOR_SHUFFLE)
   26059     return combineShuffle256(N, DAG, DCI, Subtarget);
   26060 
   26061   // During Type Legalization, when promoting illegal vector types,
   26062   // the backend might introduce new shuffle dag nodes and bitcasts.
   26063   //
   26064   // This code performs the following transformation:
   26065   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
   26066   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
   26067   //
   26068   // We do this only if both the bitcast and the BINOP dag nodes have
   26069   // one use. Also, perform this transformation only if the new binary
   26070   // operation is legal. This is to avoid introducing dag nodes that
   26071   // potentially need to be further expanded (or custom lowered) into a
   26072   // less optimal sequence of dag nodes.
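           // For example, a (v8i16 shuffle (bitcast (v4i32 add A, B)), undef,
           // <0,2,4,6,u,u,u,u>) - the kind of node promotion can introduce - is
           // rewritten to perform the add directly in v8i16 on bitcasts of A and B,
           // reusing the same shuffle mask.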
   26073   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
   26074       N->getOpcode() == ISD::VECTOR_SHUFFLE &&
   26075       N->getOperand(0).getOpcode() == ISD::BITCAST &&
   26076       N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
   26077     SDValue N0 = N->getOperand(0);
   26078     SDValue N1 = N->getOperand(1);
   26079 
   26080     SDValue BC0 = N0.getOperand(0);
   26081     EVT SVT = BC0.getValueType();
   26082     unsigned Opcode = BC0.getOpcode();
   26083     unsigned NumElts = VT.getVectorNumElements();
   26084 
   26085     if (BC0.hasOneUse() && SVT.isVector() &&
   26086         SVT.getVectorNumElements() * 2 == NumElts &&
   26087         TLI.isOperationLegal(Opcode, VT)) {
   26088       bool CanFold = false;
   26089       switch (Opcode) {
   26090       default : break;
   26091       case ISD::ADD :
   26092       case ISD::FADD :
   26093       case ISD::SUB :
   26094       case ISD::FSUB :
   26095       case ISD::MUL :
   26096       case ISD::FMUL :
   26097         CanFold = true;
   26098       }
   26099 
   26100       unsigned SVTNumElts = SVT.getVectorNumElements();
   26101       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   26102       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
   26103         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
   26104       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
   26105         CanFold = SVOp->getMaskElt(i) < 0;
   26106 
   26107       if (CanFold) {
   26108         SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
   26109         SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
   26110         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
   26111         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
   26112       }
   26113     }
   26114   }
   26115 
   26116   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
   26117   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
   26118   // consecutive, non-overlapping, and in the right order.
   26119   SmallVector<SDValue, 16> Elts;
   26120   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
   26121     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
   26122 
   26123   if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
   26124     return LD;
   26125 
   26126   if (isTargetShuffle(N->getOpcode())) {
   26127     if (SDValue Shuffle =
   26128             combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
   26129       return Shuffle;
   26130 
   26131     // Try recursively combining arbitrary sequences of x86 shuffle
   26132     // instructions into higher-order shuffles. We do this after combining
   26133     // specific PSHUF instruction sequences into their minimal form so that we
   26134     // can evaluate how many specialized shuffle instructions are involved in
   26135     // a particular chain.
   26136     SmallVector<int, 1> NonceMask; // Just a placeholder.
   26137     NonceMask.push_back(0);
   26138     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
   26139                                       /*Depth*/ 1, /*HasVariableMask*/ false, DAG,
   26140                                       DCI, Subtarget))
   26141       return SDValue(); // This routine will use CombineTo to replace N.
   26142   }
   26143 
   26144   return SDValue();
   26145 }
   26146 
   26147 /// Check if a vector extract from a target-specific shuffle of a load can be
   26148 /// folded into a single element load.
   26149 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
   26150 /// shuffles have been custom lowered so we need to handle those here.
   26151 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   26152                                          TargetLowering::DAGCombinerInfo &DCI) {
   26153   if (DCI.isBeforeLegalizeOps())
   26154     return SDValue();
   26155 
   26156   SDValue InVec = N->getOperand(0);
   26157   SDValue EltNo = N->getOperand(1);
   26158   EVT EltVT = N->getValueType(0);
   26159 
   26160   if (!isa<ConstantSDNode>(EltNo))
   26161     return SDValue();
   26162 
   26163   EVT OriginalVT = InVec.getValueType();
   26164 
   26165   if (InVec.getOpcode() == ISD::BITCAST) {
   26166     // Don't duplicate a load with other uses.
   26167     if (!InVec.hasOneUse())
   26168       return SDValue();
   26169     EVT BCVT = InVec.getOperand(0).getValueType();
   26170     if (!BCVT.isVector() ||
   26171         BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
   26172       return SDValue();
   26173     InVec = InVec.getOperand(0);
   26174   }
   26175 
   26176   EVT CurrentVT = InVec.getValueType();
   26177 
   26178   if (!isTargetShuffle(InVec.getOpcode()))
   26179     return SDValue();
   26180 
   26181   // Don't duplicate a load with other uses.
   26182   if (!InVec.hasOneUse())
   26183     return SDValue();
   26184 
   26185   SmallVector<int, 16> ShuffleMask;
   26186   SmallVector<SDValue, 2> ShuffleOps;
   26187   bool UnaryShuffle;
   26188   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
   26189                             ShuffleOps, ShuffleMask, UnaryShuffle))
   26190     return SDValue();
   26191 
   26192   // Select the input vector, guarding against out of range extract vector.
   26193   unsigned NumElems = CurrentVT.getVectorNumElements();
   26194   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   26195   int Idx = (Elt >= (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
   26196 
   26197   if (Idx == SM_SentinelZero)
   26198     return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
   26199                              : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
   26200   if (Idx == SM_SentinelUndef)
   26201     return DAG.getUNDEF(EltVT);
   26202 
   26203   assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
   26204   SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
   26205                                          : ShuffleOps[1];
   26206 
   26207   // If inputs to shuffle are the same for both ops, then allow 2 uses
   26208   unsigned AllowedUses =
   26209       (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
   26210 
   26211   if (LdNode.getOpcode() == ISD::BITCAST) {
   26212     // Don't duplicate a load with other uses.
   26213     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
   26214       return SDValue();
   26215 
   26216     AllowedUses = 1; // only allow 1 load use if we have a bitcast
   26217     LdNode = LdNode.getOperand(0);
   26218   }
   26219 
   26220   if (!ISD::isNormalLoad(LdNode.getNode()))
   26221     return SDValue();
   26222 
   26223   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
   26224 
   26225   if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
   26226     return SDValue();
   26227 
   26228   // If there's a bitcast before the shuffle, check if the load type and
   26229   // alignment is valid.
   26230   unsigned Align = LN0->getAlignment();
   26231   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   26232   unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
   26233       EltVT.getTypeForEVT(*DAG.getContext()));
   26234 
   26235   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
   26236     return SDValue();
   26237 
   26238   // All checks match so transform back to vector_shuffle so that DAG combiner
   26239   // can finish the job
   26240   SDLoc dl(N);
   26241 
   26242   // Create shuffle node taking into account the case that it's a unary shuffle
   26243   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
   26244   Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
   26245                                  ShuffleMask);
   26246   Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
   26247   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
   26248                      EltNo);
   26249 }
   26250 
   26251 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
   26252                               const X86Subtarget &Subtarget) {
   26253   SDValue N0 = N->getOperand(0);
   26254   EVT VT = N->getValueType(0);
   26255 
   26256   // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
   26257   // special and don't usually play with other vector types, it's better to
   26258   // handle them early to be sure we emit efficient code by avoiding
   26259   // store-load conversions.
   26260   if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
   26261       N0.getValueType() == MVT::v2i32 &&
   26262       isNullConstant(N0.getOperand(1))) {
   26263     SDValue N00 = N0->getOperand(0);
   26264     if (N00.getValueType() == MVT::i32)
   26265       return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
   26266   }
   26267 
   26268   // Convert a bitcasted integer logic operation that has one bitcasted
   26269   // floating-point operand and one constant operand into a floating-point
   26270   // logic operation. This may create a load of the constant, but that is
   26271   // cheaper than materializing the constant in an integer register and
   26272   // transferring it to an SSE register or transferring the SSE operand to
   26273   // integer register and back.
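           // For example, (f32 (bitcast (i32 and (bitcast f32:$x), C))) becomes
           // (X86ISD::FAND $x, (f32 (bitcast C))).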
   26274   unsigned FPOpcode;
   26275   switch (N0.getOpcode()) {
   26276     case ISD::AND: FPOpcode = X86ISD::FAND; break;
   26277     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
   26278     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
   26279     default: return SDValue();
   26280   }
   26281   if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
   26282        (Subtarget.hasSSE2() && VT == MVT::f64)) &&
   26283       isa<ConstantSDNode>(N0.getOperand(1)) &&
   26284       N0.getOperand(0).getOpcode() == ISD::BITCAST &&
   26285       N0.getOperand(0).getOperand(0).getValueType() == VT) {
   26286     SDValue N000 = N0.getOperand(0).getOperand(0);
   26287     SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
   26288     return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
   26289   }
   26290 
   26291   return SDValue();
   26292 }
   26293 
   26294 /// Detect vector gather/scatter index generation and convert it from being a
   26295 /// bunch of shuffles and extracts into a somewhat faster sequence.
   26296 /// For i686, the best sequence is apparently storing the value and loading
   26297 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
   26298 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   26299                                        TargetLowering::DAGCombinerInfo &DCI) {
   26300   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
   26301     return NewOp;
   26302 
   26303   SDValue InputVector = N->getOperand(0);
   26304   SDLoc dl(InputVector);
   26305   // Detect mmx to i32 conversion through a v2i32 elt extract.
   26306   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
   26307       N->getValueType(0) == MVT::i32 &&
   26308       InputVector.getValueType() == MVT::v2i32 &&
   26309       isa<ConstantSDNode>(N->getOperand(1)) &&
   26310       N->getConstantOperandVal(1) == 0) {
   26311     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
   26312 
   26313     // The bitcast source is a direct mmx result.
   26314     if (MMXSrc.getValueType() == MVT::x86mmx)
   26315       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
   26316   }
   26317 
   26318   EVT VT = N->getValueType(0);
   26319 
   26320   if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
   26321       InputVector.getOpcode() == ISD::BITCAST &&
   26322       isa<ConstantSDNode>(InputVector.getOperand(0))) {
   26323     uint64_t ExtractedElt =
   26324         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   26325     uint64_t InputValue =
   26326         cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
   26327     uint64_t Res = (InputValue >> ExtractedElt) & 1;
   26328     return DAG.getConstant(Res, dl, MVT::i1);
   26329   }
   26330   // Only operate on vectors of 4 elements, where the alternative shuffling
   26331   // gets to be more expensive.
   26332   if (InputVector.getValueType() != MVT::v4i32)
   26333     return SDValue();
   26334 
   26335   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
   26336   // single use which is a sign-extend or zero-extend, and all elements are
   26337   // used.
   26338   SmallVector<SDNode *, 4> Uses;
   26339   unsigned ExtractedElements = 0;
   26340   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
   26341        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
   26342     if (UI.getUse().getResNo() != InputVector.getResNo())
   26343       return SDValue();
   26344 
   26345     SDNode *Extract = *UI;
   26346     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   26347       return SDValue();
   26348 
   26349     if (Extract->getValueType(0) != MVT::i32)
   26350       return SDValue();
   26351     if (!Extract->hasOneUse())
   26352       return SDValue();
   26353     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
   26354         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
   26355       return SDValue();
   26356     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
   26357       return SDValue();
   26358 
   26359     // Record which element was extracted.
   26360     ExtractedElements |=
   26361       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
   26362 
   26363     Uses.push_back(Extract);
   26364   }
   26365 
   26366   // If not all the elements were used, this may not be worthwhile.
   26367   if (ExtractedElements != 15)
   26368     return SDValue();
   26369 
   26370   // Ok, we've now decided to do the transformation.
   26371   // If 64-bit shifts are legal, use the extract-shift sequence,
   26372   // otherwise bounce the vector off the cache.
   26373   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   26374   SDValue Vals[4];
   26375 
   26376   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
   26377     SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
   26378     auto &DL = DAG.getDataLayout();
   26379     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
   26380     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
   26381       DAG.getConstant(0, dl, VecIdxTy));
   26382     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
   26383       DAG.getConstant(1, dl, VecIdxTy));
   26384 
   26385     SDValue ShAmt = DAG.getConstant(
   26386         32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
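             // Split each 64-bit half into its low and high 32-bit lanes using a
             // truncate and an arithmetic shift by 32.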
   26387     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
   26388     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   26389       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
   26390     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
   26391     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   26392       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
   26393   } else {
   26394     // Store the value to a temporary stack slot.
   26395     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
   26396     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
   26397       MachinePointerInfo(), false, false, 0);
   26398 
   26399     EVT ElementType = InputVector.getValueType().getVectorElementType();
   26400     unsigned EltSize = ElementType.getSizeInBits() / 8;
   26401 
   26402     // Replace each use (extract) with a load of the appropriate element.
   26403     for (unsigned i = 0; i < 4; ++i) {
   26404       uint64_t Offset = EltSize * i;
   26405       auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   26406       SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
   26407 
   26408       SDValue ScalarAddr =
   26409           DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
   26410 
   26411       // Load the scalar.
   26412       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
   26413                             ScalarAddr, MachinePointerInfo(),
   26414                             false, false, false, 0);
   26415 
   26416     }
   26417   }
   26418 
   26419   // Replace the extracts
   26420   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
   26421     UE = Uses.end(); UI != UE; ++UI) {
   26422     SDNode *Extract = *UI;
   26423 
   26424     SDValue Idx = Extract->getOperand(1);
   26425     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   26426     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
   26427   }
   26428 
   26429   // The replacement was made in place; don't return anything.
   26430   return SDValue();
   26431 }
   26432 
   26433 /// Do target-specific dag combines on SELECT and VSELECT nodes.
   26434 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   26435                              TargetLowering::DAGCombinerInfo &DCI,
   26436                              const X86Subtarget &Subtarget) {
   26437   SDLoc DL(N);
   26438   SDValue Cond = N->getOperand(0);
   26439   // Get the LHS/RHS of the select.
   26440   SDValue LHS = N->getOperand(1);
   26441   SDValue RHS = N->getOperand(2);
   26442   EVT VT = LHS.getValueType();
   26443   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   26444 
   26445   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   26446   // instructions match the semantics of the common C idiom x<y?x:y but not
   26447   // x<=y?x:y, because of how they handle negative zero (which can be
   26448   // ignored in unsafe-math mode).
   26449   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
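           // For example, 'x < y ? x : y' (SETOLT) maps directly onto MINSS/MINPS
           // (X86ISD::FMIN), which, like the C expression, returns the second operand
           // whenever the comparison is false (including NaN operands and +0.0 vs -0.0).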
   26450   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
   26451       VT != MVT::f80 && VT != MVT::f128 &&
   26452       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
   26453       (Subtarget.hasSSE2() ||
   26454        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
   26455     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   26456 
   26457     unsigned Opcode = 0;
   26458     // Check for x CC y ? x : y.
   26459     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   26460         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   26461       switch (CC) {
   26462       default: break;
   26463       case ISD::SETULT:
   26464         // Converting this to a min would handle NaNs incorrectly, and swapping
   26465         // the operands would cause it to handle comparisons between positive
   26466         // and negative zero incorrectly.
   26467         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   26468           if (!DAG.getTarget().Options.UnsafeFPMath &&
   26469               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   26470             break;
   26471           std::swap(LHS, RHS);
   26472         }
   26473         Opcode = X86ISD::FMIN;
   26474         break;
   26475       case ISD::SETOLE:
   26476         // Converting this to a min would handle comparisons between positive
   26477         // and negative zero incorrectly.
   26478         if (!DAG.getTarget().Options.UnsafeFPMath &&
   26479             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   26480           break;
   26481         Opcode = X86ISD::FMIN;
   26482         break;
   26483       case ISD::SETULE:
   26484         // Converting this to a min would handle both negative zeros and NaNs
   26485         // incorrectly, but we can swap the operands to fix both.
   26486         std::swap(LHS, RHS);
   26487       case ISD::SETOLT:
   26488       case ISD::SETLT:
   26489       case ISD::SETLE:
   26490         Opcode = X86ISD::FMIN;
   26491         break;
   26492 
   26493       case ISD::SETOGE:
   26494         // Converting this to a max would handle comparisons between positive
   26495         // and negative zero incorrectly.
   26496         if (!DAG.getTarget().Options.UnsafeFPMath &&
   26497             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   26498           break;
   26499         Opcode = X86ISD::FMAX;
   26500         break;
   26501       case ISD::SETUGT:
   26502         // Converting this to a max would handle NaNs incorrectly, and swapping
   26503         // the operands would cause it to handle comparisons between positive
   26504         // and negative zero incorrectly.
   26505         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   26506           if (!DAG.getTarget().Options.UnsafeFPMath &&
   26507               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   26508             break;
   26509           std::swap(LHS, RHS);
   26510         }
   26511         Opcode = X86ISD::FMAX;
   26512         break;
   26513       case ISD::SETUGE:
   26514         // Converting this to a max would handle both negative zeros and NaNs
   26515         // incorrectly, but we can swap the operands to fix both.
   26516         std::swap(LHS, RHS);
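                 // FALL THROUGH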
   26517       case ISD::SETOGT:
   26518       case ISD::SETGT:
   26519       case ISD::SETGE:
   26520         Opcode = X86ISD::FMAX;
   26521         break;
   26522       }
   26523     // Check for x CC y ? y : x -- a min/max with reversed arms.
   26524     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
   26525                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
   26526       switch (CC) {
   26527       default: break;
   26528       case ISD::SETOGE:
   26529         // Converting this to a min would handle comparisons between positive
   26530         // and negative zero incorrectly, and swapping the operands would
   26531         // cause it to handle NaNs incorrectly.
   26532         if (!DAG.getTarget().Options.UnsafeFPMath &&
   26533             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
   26534           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   26535             break;
   26536           std::swap(LHS, RHS);
   26537         }
   26538         Opcode = X86ISD::FMIN;
   26539         break;
   26540       case ISD::SETUGT:
   26541         // Converting this to a min would handle NaNs incorrectly.
   26542         if (!DAG.getTarget().Options.UnsafeFPMath &&
   26543             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
   26544           break;
   26545         Opcode = X86ISD::FMIN;
   26546         break;
   26547       case ISD::SETUGE:
   26548         // Converting this to a min would handle both negative zeros and NaNs
   26549         // incorrectly, but we can swap the operands to fix both.
   26550         std::swap(LHS, RHS);
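                 // FALL THROUGH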
   26551       case ISD::SETOGT:
   26552       case ISD::SETGT:
   26553       case ISD::SETGE:
   26554         Opcode = X86ISD::FMIN;
   26555         break;
   26556 
   26557       case ISD::SETULT:
   26558         // Converting this to a max would handle NaNs incorrectly.
   26559         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   26560           break;
   26561         Opcode = X86ISD::FMAX;
   26562         break;
   26563       case ISD::SETOLE:
   26564         // Converting this to a max would handle comparisons between positive
   26565         // and negative zero incorrectly, and swapping the operands would
   26566         // cause it to handle NaNs incorrectly.
   26567         if (!DAG.getTarget().Options.UnsafeFPMath &&
   26568             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
   26569           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   26570             break;
   26571           std::swap(LHS, RHS);
   26572         }
   26573         Opcode = X86ISD::FMAX;
   26574         break;
   26575       case ISD::SETULE:
   26576         // Converting this to a max would handle both negative zeros and NaNs
   26577         // incorrectly, but we can swap the operands to fix both.
   26578         std::swap(LHS, RHS);
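                 // FALL THROUGH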
   26579       case ISD::SETOLT:
   26580       case ISD::SETLT:
   26581       case ISD::SETLE:
   26582         Opcode = X86ISD::FMAX;
   26583         break;
   26584       }
   26585     }
   26586 
   26587     if (Opcode)
   26588       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   26589   }
   26590 
   26591   EVT CondVT = Cond.getValueType();
   26592   if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
   26593       CondVT.getVectorElementType() == MVT::i1) {
   26594     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
   26595     // lowering on KNL. In this case we convert it to
   26596     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
   26597     // The same situation applies to all 128-bit and 256-bit vectors of i8
   26598     // and i16. Since SKX, these selects have a proper lowering.
   26599     EVT OpVT = LHS.getValueType();
   26600     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
   26601         (OpVT.getVectorElementType() == MVT::i8 ||
   26602          OpVT.getVectorElementType() == MVT::i16) &&
   26603         !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
   26604       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
   26605       DCI.AddToWorklist(Cond.getNode());
   26606       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
   26607     }
   26608   }
   26609   // If this is a select between two integer constants, try to do some
   26610   // optimizations.
   26611   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
   26612     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
   26613       // Don't do this for crazy integer types.
   26614       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
   26615         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
   26616         // so that TrueC (the true value) is larger than FalseC.
   26617         bool NeedsCondInvert = false;
   26618 
   26619         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
   26620             // Efficiently invertible.
   26621             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
   26622              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
   26623               isa<ConstantSDNode>(Cond.getOperand(1))))) {
   26624           NeedsCondInvert = true;
   26625           std::swap(TrueC, FalseC);
   26626         }
   26627 
   26628         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
   26629         if (FalseC->getAPIntValue() == 0 &&
   26630             TrueC->getAPIntValue().isPowerOf2()) {
   26631           if (NeedsCondInvert) // Invert the condition if needed.
   26632             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   26633                                DAG.getConstant(1, DL, Cond.getValueType()));
   26634 
   26635           // Zero extend the condition if needed.
   26636           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
   26637 
   26638           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   26639           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
   26640                              DAG.getConstant(ShAmt, DL, MVT::i8));
   26641         }
   26642 
   26643         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
   26644         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   26645           if (NeedsCondInvert) // Invert the condition if needed.
   26646             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   26647                                DAG.getConstant(1, DL, Cond.getValueType()));
   26648 
   26649           // Zero extend the condition if needed.
   26650           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   26651                              FalseC->getValueType(0), Cond);
   26652           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   26653                              SDValue(FalseC, 0));
   26654         }
   26655 
   26656         // Optimize cases that will turn into an LEA instruction.  This requires
   26657         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
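                 // e.g. (select Cond, 11, 3): Diff is 8, a fast multiplier, so this
                 // becomes zext(Cond)*8 + 3 and can fold into a single LEA.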
   26658         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   26659           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   26660           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   26661 
   26662           bool isFastMultiplier = false;
   26663           if (Diff < 10) {
   26664             switch ((unsigned char)Diff) {
   26665               default: break;
   26666               case 1:  // result = add base, cond
   26667               case 2:  // result = lea base(    , cond*2)
   26668               case 3:  // result = lea base(cond, cond*2)
   26669               case 4:  // result = lea base(    , cond*4)
   26670               case 5:  // result = lea base(cond, cond*4)
   26671               case 8:  // result = lea base(    , cond*8)
   26672               case 9:  // result = lea base(cond, cond*8)
   26673                 isFastMultiplier = true;
   26674                 break;
   26675             }
   26676           }
   26677 
   26678           if (isFastMultiplier) {
   26679             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   26680             if (NeedsCondInvert) // Invert the condition if needed.
   26681               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   26682                                  DAG.getConstant(1, DL, Cond.getValueType()));
   26683 
   26684             // Zero extend the condition if needed.
   26685             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   26686                                Cond);
   26687             // Scale the condition by the difference.
   26688             if (Diff != 1)
   26689               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   26690                                  DAG.getConstant(Diff, DL,
   26691                                                  Cond.getValueType()));
   26692 
   26693             // Add the base if non-zero.
   26694             if (FalseC->getAPIntValue() != 0)
   26695               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   26696                                  SDValue(FalseC, 0));
   26697             return Cond;
   26698           }
   26699         }
   26700       }
   26701   }
   26702 
   26703   // Canonicalize max and min:
   26704   // (x > y) ? x : y -> (x >= y) ? x : y
   26705   // (x < y) ? x : y -> (x <= y) ? x : y
   26706   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
   26707   // the need for an extra compare
   26708   // against zero. e.g.
   26709   // (x - y) > 0 ? (x - y) : 0  ->  (x - y) >= 0 ? (x - y) : 0
   26710   // subl   %esi, %edi
   26711   // testl  %edi, %edi
   26712   // movl   $0, %eax
   26713   // cmovgl %edi, %eax
   26714   // =>
   26715   // xorl   %eax, %eax
   26716   // subl   %esi, %edi
   26717   // cmovsl %eax, %edi
   26718   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
   26719       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   26720       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   26721     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   26722     switch (CC) {
   26723     default: break;
   26724     case ISD::SETLT:
   26725     case ISD::SETGT: {
   26726       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
   26727       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
   26728                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
   26729       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
   26730     }
   26731     }
   26732   }
   26733 
   26734   // Early exit check
   26735   if (!TLI.isTypeLegal(VT))
   26736     return SDValue();
   26737 
   26738   // Match VSELECTs into subs with unsigned saturation.
   26739   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
   26740       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
   26741       ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
   26742        (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
   26743     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   26744 
   26745     // Check if one of the arms of the VSELECT is a zero vector. If it's on
   26746     // the left side, invert the predicate to simplify the logic below.
   26747     SDValue Other;
   26748     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
   26749       Other = RHS;
   26750       CC = ISD::getSetCCInverse(CC, true);
   26751     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
   26752       Other = LHS;
   26753     }
   26754 
   26755     if (Other.getNode() && Other->getNumOperands() == 2 &&
   26756         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
   26757       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
   26758       SDValue CondRHS = Cond->getOperand(1);
   26759 
   26760       // Look for a general sub with unsigned saturation first.
   26761       // x >= y ? x-y : 0 --> subus x, y
   26762       // x >  y ? x-y : 0 --> subus x, y
   26763       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
   26764           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
   26765         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
   26766 
   26767       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
   26768         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
   26769           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
   26770             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
   26771               // If the RHS is a constant, we have to reverse the const
   26772               // canonicalization.
   26773               // x > C-1 ? x+(-C) : 0 --> subus x, C
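                       // e.g. C == 10:  x >u 9 ? x+(-10) : 0 --> subus x, 10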
   26774               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
   26775                   CondRHSConst->getAPIntValue() ==
   26776                       (-OpRHSConst->getAPIntValue() - 1))
   26777                 return DAG.getNode(
   26778                     X86ISD::SUBUS, DL, VT, OpLHS,
   26779                     DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
   26780 
   26781           // Another special case: If C was a sign bit, the sub has been
   26782           // canonicalized into a xor.
   26783           // FIXME: Would it be better to use computeKnownBits to determine
   26784           //        whether it's safe to decanonicalize the xor?
   26785           // x s< 0 ? x^C : 0 --> subus x, C
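                   // e.g. for v8i16:  x s< 0 ? x^0x8000 : 0 --> subus x, 0x8000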
   26786           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
   26787               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
   26788               OpRHSConst->getAPIntValue().isSignBit())
   26789             // Note that we have to rebuild the RHS constant here to ensure we
   26790             // don't rely on particular values of undef lanes.
   26791             return DAG.getNode(
   26792                 X86ISD::SUBUS, DL, VT, OpLHS,
   26793                 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
   26794         }
   26795     }
   26796   }
   26797 
   26798   // Simplify vector selection if condition value type matches vselect
   26799   // operand type
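           // e.g. (vselect Cond, -1, 0) --> Cond
           //      (vselect Cond, -1, X) --> (or Cond, X)
           //      (vselect Cond, X, 0)  --> (and Cond, X)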
   26800   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
   26801     assert(Cond.getValueType().isVector() &&
   26802            "vector select expects a vector selector!");
   26803 
   26804     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
   26805     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
   26806 
   26807     // Try to invert the condition if the true value is not all 1s and the
   26808     // false value is not all 0s.
   26809     if (!TValIsAllOnes && !FValIsAllZeros &&
   26810         // Check if the selector will be produced by CMPP*/PCMP*
   26811         Cond.getOpcode() == ISD::SETCC &&
   26812         // Check if SETCC has already been promoted
   26813         TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
   26814             CondVT) {
   26815       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
   26816       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
   26817 
   26818       if (TValIsAllZeros || FValIsAllOnes) {
   26819         SDValue CC = Cond.getOperand(2);
   26820         ISD::CondCode NewCC =
   26821           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
   26822                                Cond.getOperand(0).getValueType().isInteger());
   26823         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
   26824         std::swap(LHS, RHS);
   26825         TValIsAllOnes = FValIsAllOnes;
   26826         FValIsAllZeros = TValIsAllZeros;
   26827       }
   26828     }
   26829 
   26830     if (TValIsAllOnes || FValIsAllZeros) {
   26831       SDValue Ret;
   26832 
   26833       if (TValIsAllOnes && FValIsAllZeros)
   26834         Ret = Cond;
   26835       else if (TValIsAllOnes)
   26836         Ret =
   26837             DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
   26838       else if (FValIsAllZeros)
   26839         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
   26840                           DAG.getBitcast(CondVT, LHS));
   26841 
   26842       return DAG.getBitcast(VT, Ret);
   26843     }
   26844   }
   26845 
   26846   // If this is a *dynamic* select (non-constant condition) and we can match
   26847   // this node with one of the variable blend instructions, restructure the
   26848   // condition so that the blends can use the high bit of each element and use
   26849   // SimplifyDemandedBits to simplify the condition operand.
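           // e.g. PBLENDVB / BLENDVPS select on the sign bit of each condition
           // element, so only the per-element high bit of Cond is demanded here.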
   26850   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
   26851       !DCI.isBeforeLegalize() &&
   26852       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
   26853     unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
   26854 
   26855     // Don't optimize vector selects that map to mask-registers.
   26856     if (BitWidth == 1)
   26857       return SDValue();
   26858 
   26859     // We can only handle the cases where VSELECT is directly legal on the
   26860     // subtarget. We custom lower VSELECT nodes with constant conditions and
   26861     // this makes it hard to see whether a dynamic VSELECT will correctly
   26862     // lower, so we both check the operation's status and explicitly handle the
   26863     // cases where a *dynamic* blend will fail even though a constant-condition
   26864     // blend could be custom lowered.
   26865     // FIXME: We should find a better way to handle this class of problems.
   26866     // Potentially, we should combine constant-condition vselect nodes
   26867     // pre-legalization into shuffles and not mark as many types as custom
   26868     // lowered.
   26869     if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
   26870       return SDValue();
   26871     // FIXME: We don't support i16-element blends currently. We could and
   26872     // should support them by making *all* the bits in the condition be set
   26873     // rather than just the high bit and using an i8-element blend.
   26874     if (VT.getVectorElementType() == MVT::i16)
   26875       return SDValue();
   26876     // Dynamic blending was only available from SSE4.1 onward.
   26877     if (VT.is128BitVector() && !Subtarget.hasSSE41())
   26878       return SDValue();
   26879     // Byte blends are only available in AVX2.
   26880     if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
   26881       return SDValue();
   26882 
   26883     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
   26884     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
   26885 
   26886     APInt KnownZero, KnownOne;
   26887     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
   26888                                           DCI.isBeforeLegalizeOps());
   26889     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
   26890         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
   26891                                  TLO)) {
   26892       // If we changed the computation somewhere in the DAG, this change
   26893       // will affect all users of Cond.
   26894       // Make sure it is fine and update all the nodes so that we do not
   26895       // use the generic VSELECT anymore. Otherwise, we may perform
   26896       // wrong optimizations as we messed up with the actual expectation
   26897       // for the vector boolean values.
   26898       if (Cond != TLO.Old) {
   26899         // Check all uses of the condition operand to see whether it will be
   26900         // consumed by non-BLEND instructions, which may depend on all bits
   26901         // being set properly.
   26902         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
   26903              I != E; ++I)
   26904           if (I->getOpcode() != ISD::VSELECT)
   26905             // TODO: Add other opcodes eventually lowered into BLEND.
   26906             return SDValue();
   26907 
   26908         // Update all the users of the condition, before committing the change,
   26909         // so that the VSELECT optimizations that expect the correct vector
   26910         // boolean value will not be triggered.
   26911         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
   26912              I != E; ++I)
   26913           DAG.ReplaceAllUsesOfValueWith(
   26914               SDValue(*I, 0),
   26915               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
   26916                           Cond, I->getOperand(1), I->getOperand(2)));
   26917         DCI.CommitTargetLoweringOpt(TLO);
   26918         return SDValue();
   26919       }
   26920       // At this point, only Cond is changed. Change the condition just for
   26921       // N so that all other users keep the opportunity to be optimized their
   26922       // own way.
   26923       DAG.ReplaceAllUsesOfValueWith(
   26924           SDValue(N, 0),
   26925           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
   26926                       TLO.New, N->getOperand(1), N->getOperand(2)));
   26927       return SDValue();
   26928     }
   26929   }
   26930 
   26931   return SDValue();
   26932 }
   26933 
   26934 /// Combine:
   26935 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
   26936 /// to:
   26937 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
   26938 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
   26939 /// Note that this is only legal for some op/cc combinations.
   26940 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
   26941                                        SelectionDAG &DAG) {
   26942   // This combine only operates on CMP-like nodes.
   26943   if (!(Cmp.getOpcode() == X86ISD::CMP ||
   26944         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
   26945     return SDValue();
   26946 
   26947   // This only applies to variations of the common case:
   26948   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
   26949   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
   26950   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
   26951   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
   26952   // Using the proper condcodes (see below), overflow is checked for.
   26953 
   26954   // FIXME: We can generalize both constraints:
   26955   // - XOR/OR/AND (if they were made to survive AtomicExpand)
   26956   // - LHS != 1
   26957   // if the result is compared.
   26958 
   26959   SDValue CmpLHS = Cmp.getOperand(0);
   26960   SDValue CmpRHS = Cmp.getOperand(1);
   26961 
   26962   if (!CmpLHS.hasOneUse())
   26963     return SDValue();
   26964 
   26965   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
   26966   if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
   26967     return SDValue();
   26968 
   26969   const unsigned Opc = CmpLHS.getOpcode();
   26970 
   26971   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
   26972     return SDValue();
   26973 
   26974   SDValue OpRHS = CmpLHS.getOperand(2);
   26975   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
   26976   if (!OpRHSC)
   26977     return SDValue();
   26978 
   26979   APInt Addend = OpRHSC->getAPIntValue();
   26980   if (Opc == ISD::ATOMIC_LOAD_SUB)
   26981     Addend = -Addend;
   26982 
   26983   if (CC == X86::COND_S && Addend == 1)
   26984     CC = X86::COND_LE;
   26985   else if (CC == X86::COND_NS && Addend == 1)
   26986     CC = X86::COND_G;
   26987   else if (CC == X86::COND_G && Addend == -1)
   26988     CC = X86::COND_GE;
   26989   else if (CC == X86::COND_LE && Addend == -1)
   26990     CC = X86::COND_L;
   26991   else
   26992     return SDValue();
   26993 
   26994   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
   26995   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
   26996                                 DAG.getUNDEF(CmpLHS.getValueType()));
   26997   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
   26998   return LockOp;
   26999 }
   27000 
   27001 // Check whether a boolean test is testing a boolean value generated by
   27002 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
   27003 // code.
   27004 //
   27005 // Simplify the following patterns:
   27006 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
   27007 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
   27008 // to (Op EFLAGS Cond)
   27009 //
   27010 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
   27011 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
   27012 // to (Op EFLAGS !Cond)
   27013 //
   27014 // where Op could be BRCOND or CMOV.
   27015 //
   27016 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   27017   // This combine only operates on CMP-like nodes.
   27018   if (!(Cmp.getOpcode() == X86ISD::CMP ||
   27019         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
   27020     return SDValue();
   27021 
   27022   // Quit if not used as a boolean value.
   27023   if (CC != X86::COND_E && CC != X86::COND_NE)
   27024     return SDValue();
   27025 
   27026   // Check CMP operands. One of them should be 0 or 1 and the other should be
   27027   // an SetCC or extended from it.
   27028   SDValue Op1 = Cmp.getOperand(0);
   27029   SDValue Op2 = Cmp.getOperand(1);
   27030 
   27031   SDValue SetCC;
   27032   const ConstantSDNode* C = nullptr;
   27033   bool needOppositeCond = (CC == X86::COND_E);
   27034   bool checkAgainstTrue = false; // Is it a comparison against 1?
   27035 
   27036   if ((C = dyn_cast<ConstantSDNode>(Op1)))
   27037     SetCC = Op2;
   27038   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
   27039     SetCC = Op1;
   27040   else // Quit if neither operand is a constant.
   27041     return SDValue();
   27042 
   27043   if (C->getZExtValue() == 1) {
   27044     needOppositeCond = !needOppositeCond;
   27045     checkAgainstTrue = true;
   27046   } else if (C->getZExtValue() != 0)
   27047     // Quit if the constant is neither 0 nor 1.
   27048     return SDValue();
   27049 
   27050   bool truncatedToBoolWithAnd = false;
   27051   // Skip (zext $x), (trunc $x), (assertzext $x), or (and $x, 1) node.
   27052   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
   27053          SetCC.getOpcode() == ISD::TRUNCATE ||
   27054          SetCC.getOpcode() == ISD::AssertZext ||
   27055          SetCC.getOpcode() == ISD::AND) {
   27056     if (SetCC.getOpcode() == ISD::AND) {
   27057       int OpIdx = -1;
   27058       if (isOneConstant(SetCC.getOperand(0)))
   27059         OpIdx = 1;
   27060       if (isOneConstant(SetCC.getOperand(1)))
   27061         OpIdx = 0;
   27062       if (OpIdx < 0)
   27063         break;
   27064       SetCC = SetCC.getOperand(OpIdx);
   27065       truncatedToBoolWithAnd = true;
   27066     } else
   27067       SetCC = SetCC.getOperand(0);
   27068   }
   27069 
   27070   switch (SetCC.getOpcode()) {
   27071   case X86ISD::SETCC_CARRY:
   27072     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
   27073     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
   27074     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
   27075     // truncated to i1 using 'and'.
   27076     if (checkAgainstTrue && !truncatedToBoolWithAnd)
   27077       break;
   27078     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
   27079            "Invalid use of SETCC_CARRY!");
   27080     // FALL THROUGH
   27081   case X86ISD::SETCC:
   27082     // Set the condition code or opposite one if necessary.
   27083     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
   27084     if (needOppositeCond)
   27085       CC = X86::GetOppositeBranchCondition(CC);
   27086     return SetCC.getOperand(1);
   27087   case X86ISD::CMOV: {
   27088     // Check whether false/true value has canonical one, i.e. 0 or 1.
   27089     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
   27090     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
   27091     // Quit if true value is not a constant.
   27092     if (!TVal)
   27093       return SDValue();
   27094     // Quit if false value is not a constant.
   27095     if (!FVal) {
   27096       SDValue Op = SetCC.getOperand(0);
   27097       // Skip 'zext' or 'trunc' node.
   27098       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
   27099           Op.getOpcode() == ISD::TRUNCATE)
   27100         Op = Op.getOperand(0);
   27101       // A special case for rdrand/rdseed, where 0 is produced when the false
   27102       // condition is found.
   27103       if ((Op.getOpcode() != X86ISD::RDRAND &&
   27104            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
   27105         return SDValue();
   27106     }
   27107     // Quit if false value is not the constant 0 or 1.
   27108     bool FValIsFalse = true;
   27109     if (FVal && FVal->getZExtValue() != 0) {
   27110       if (FVal->getZExtValue() != 1)
   27111         return SDValue();
   27112       // If FVal is 1, opposite cond is needed.
   27113       needOppositeCond = !needOppositeCond;
   27114       FValIsFalse = false;
   27115     }
   27116     // Quit if TVal is not the constant opposite of FVal.
   27117     if (FValIsFalse && TVal->getZExtValue() != 1)
   27118       return SDValue();
   27119     if (!FValIsFalse && TVal->getZExtValue() != 0)
   27120       return SDValue();
   27121     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
   27122     if (needOppositeCond)
   27123       CC = X86::GetOppositeBranchCondition(CC);
   27124     return SetCC.getOperand(3);
   27125   }
   27126   }
   27127 
   27128   return SDValue();
   27129 }
   27130 
   27131 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
   27132 /// Match:
   27133 ///   (X86or (X86setcc) (X86setcc))
   27134 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
   27135 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
   27136                                            X86::CondCode &CC1, SDValue &Flags,
   27137                                            bool &isAnd) {
   27138   if (Cond->getOpcode() == X86ISD::CMP) {
   27139     if (!isNullConstant(Cond->getOperand(1)))
   27140       return false;
   27141 
   27142     Cond = Cond->getOperand(0);
   27143   }
   27144 
   27145   isAnd = false;
   27146 
   27147   SDValue SetCC0, SetCC1;
   27148   switch (Cond->getOpcode()) {
   27149   default: return false;
   27150   case ISD::AND:
   27151   case X86ISD::AND:
   27152     isAnd = true;
   27153     // fallthru
   27154   case ISD::OR:
   27155   case X86ISD::OR:
   27156     SetCC0 = Cond->getOperand(0);
   27157     SetCC1 = Cond->getOperand(1);
   27158     break;
   27159   }
   27160 
   27161   // Make sure we have SETCC nodes, using the same flags value.
   27162   if (SetCC0.getOpcode() != X86ISD::SETCC ||
   27163       SetCC1.getOpcode() != X86ISD::SETCC ||
   27164       SetCC0->getOperand(1) != SetCC1->getOperand(1))
   27165     return false;
   27166 
   27167   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
   27168   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
   27169   Flags = SetCC0->getOperand(1);
   27170   return true;
   27171 }
   27172 
   27173 /// Optimize an EFLAGS definition used according to the condition code \p CC
   27174 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
   27175 /// uses of chain values.
   27176 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
   27177                                   SelectionDAG &DAG) {
   27178   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
   27179     return R;
   27180   return combineSetCCAtomicArith(EFLAGS, CC, DAG);
   27181 }
   27182 
   27183 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
   27184 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
   27185                            TargetLowering::DAGCombinerInfo &DCI,
   27186                            const X86Subtarget &Subtarget) {
   27187   SDLoc DL(N);
   27188 
   27189   // If the flag operand isn't dead, don't touch this CMOV.
   27190   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
   27191     return SDValue();
   27192 
   27193   SDValue FalseOp = N->getOperand(0);
   27194   SDValue TrueOp = N->getOperand(1);
   27195   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
   27196   SDValue Cond = N->getOperand(3);
   27197 
   27198   if (CC == X86::COND_E || CC == X86::COND_NE) {
   27199     switch (Cond.getOpcode()) {
   27200     default: break;
   27201     case X86ISD::BSR:
   27202     case X86ISD::BSF:
   27203       // If the operand of BSR / BSF is proven never zero, ZF cannot be set.
   27204       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
   27205         return (CC == X86::COND_E) ? FalseOp : TrueOp;
   27206     }
   27207   }
   27208 
   27209   // Try to simplify the EFLAGS and condition code operands.
   27210   // We can't always do this as FCMOV only supports a subset of X86 cond.
   27211   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
   27212     if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
   27213       SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
   27214         Flags};
   27215       return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   27216     }
   27217   }
   27218 
   27219   // If this is a select between two integer constants, try to do some
   27220   // optimizations.  Note that the operands are ordered the opposite of SELECT
   27221   // operands.
   27222   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
   27223     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
   27224       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
   27225       // larger than FalseC (the false value).
   27226       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
   27227         CC = X86::GetOppositeBranchCondition(CC);
   27228         std::swap(TrueC, FalseC);
   27229         std::swap(TrueOp, FalseOp);
   27230       }
   27231 
   27232       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
   27233       // This is efficient for any integer data type (including i8/i16) and
   27234       // shift amount.
   27235       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
   27236         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   27237                            DAG.getConstant(CC, DL, MVT::i8), Cond);
   27238 
   27239         // Zero extend the condition if needed.
   27240         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
   27241 
   27242         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   27243         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
   27244                            DAG.getConstant(ShAmt, DL, MVT::i8));
   27245         if (N->getNumValues() == 2)  // Dead flag value?
   27246           return DCI.CombineTo(N, Cond, SDValue());
   27247         return Cond;
   27248       }
   27249 
   27250       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
   27251       // for any integer data type, including i8/i16.
   27252       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   27253         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   27254                            DAG.getConstant(CC, DL, MVT::i8), Cond);
   27255 
   27256         // Zero extend the condition if needed.
   27257         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   27258                            FalseC->getValueType(0), Cond);
   27259         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   27260                            SDValue(FalseC, 0));
   27261 
   27262         if (N->getNumValues() == 2)  // Dead flag value?
   27263           return DCI.CombineTo(N, Cond, SDValue());
   27264         return Cond;
   27265       }
   27266 
   27267       // Optimize cases that will turn into an LEA instruction.  This requires
   27268       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   27269       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   27270         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   27271         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   27272 
   27273         bool isFastMultiplier = false;
   27274         if (Diff < 10) {
   27275           switch ((unsigned char)Diff) {
   27276           default: break;
   27277           case 1:  // result = add base, cond
   27278           case 2:  // result = lea base(    , cond*2)
   27279           case 3:  // result = lea base(cond, cond*2)
   27280           case 4:  // result = lea base(    , cond*4)
   27281           case 5:  // result = lea base(cond, cond*4)
   27282           case 8:  // result = lea base(    , cond*8)
   27283           case 9:  // result = lea base(cond, cond*8)
   27284             isFastMultiplier = true;
   27285             break;
   27286           }
   27287         }
   27288 
   27289         if (isFastMultiplier) {
   27290           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   27291           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   27292                              DAG.getConstant(CC, DL, MVT::i8), Cond);
   27293           // Zero extend the condition if needed.
   27294           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   27295                              Cond);
   27296           // Scale the condition by the difference.
   27297           if (Diff != 1)
   27298             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   27299                                DAG.getConstant(Diff, DL, Cond.getValueType()));
   27300 
   27301           // Add the base if non-zero.
   27302           if (FalseC->getAPIntValue() != 0)
   27303             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   27304                                SDValue(FalseC, 0));
   27305           if (N->getNumValues() == 2)  // Dead flag value?
   27306             return DCI.CombineTo(N, Cond, SDValue());
   27307           return Cond;
   27308         }
   27309       }
   27310     }
   27311   }
   27312 
   27313   // Handle these cases:
   27314   //   (select (x != c), e, c) -> (select (x != c), e, x),
   27315   //   (select (x == c), c, e) -> (select (x == c), x, e)
   27316   // where the c is an integer constant, and the "select" is the combination
   27317   // of CMOV and CMP.
   27318   //
   27319   // The rationale for this change is that the conditional-move from a constant
   27320   // needs two instructions, however, conditional-move from a register needs
   27321   // only one instruction.
   27322   //
   27323   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
   27324   //  some instruction-combining opportunities. This opt needs to be
   27325   //  postponed as late as possible.
   27326   //
   27327   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
   27328     // the DCI.xxxx conditions are provided to postpone the optimization as
   27329     // late as possible.
   27330 
   27331     ConstantSDNode *CmpAgainst = nullptr;
   27332     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
   27333         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
   27334         !isa<ConstantSDNode>(Cond.getOperand(0))) {
   27335 
   27336       if (CC == X86::COND_NE &&
   27337           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
   27338         CC = X86::GetOppositeBranchCondition(CC);
   27339         std::swap(TrueOp, FalseOp);
   27340       }
   27341 
   27342       if (CC == X86::COND_E &&
   27343           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
   27344         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
   27345                           DAG.getConstant(CC, DL, MVT::i8), Cond };
   27346         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
   27347       }
   27348     }
   27349   }
   27350 
   27351   // Fold and/or of setcc's to double CMOV:
   27352   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
   27353   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
   27354   //
   27355   // This combine lets us generate:
   27356   //   cmovcc1 (jcc1 if we don't have CMOV)
   27357   //   cmovcc2 (same)
   27358   // instead of:
   27359   //   setcc1
   27360   //   setcc2
   27361   //   and/or
   27362   //   cmovne (jne if we don't have CMOV)
   27363   // When we can't use the CMOV instruction, it might increase branch
   27364   // mispredicts.
   27365   // When we can use CMOV, or when there is no mispredict, this improves
   27366   // throughput and reduces register pressure.
   27367   //
   27368   if (CC == X86::COND_NE) {
   27369     SDValue Flags;
   27370     X86::CondCode CC0, CC1;
   27371     bool isAndSetCC;
   27372     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
   27373       if (isAndSetCC) {
   27374         std::swap(FalseOp, TrueOp);
   27375         CC0 = X86::GetOppositeBranchCondition(CC0);
   27376         CC1 = X86::GetOppositeBranchCondition(CC1);
   27377       }
   27378 
   27379       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
   27380         Flags};
   27381       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
   27382       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
   27383       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   27384       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
   27385       return CMOV;
   27386     }
   27387   }
   27388 
   27389   return SDValue();
   27390 }
   27391 
   27392 /// Different mul shrinking modes.
   27393 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
   27394 
   27395 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
   27396   EVT VT = N->getOperand(0).getValueType();
   27397   if (VT.getScalarSizeInBits() != 32)
   27398     return false;
   27399 
   27400   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
   27401   unsigned SignBits[2] = {1, 1};
   27402   bool IsPositive[2] = {false, false};
   27403   for (unsigned i = 0; i < 2; i++) {
   27404     SDValue Opd = N->getOperand(i);
   27405 
   27406     // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
   27407     // compute sign bits for it separately.
   27408     if (Opd.getOpcode() == ISD::ANY_EXTEND) {
   27409       // For anyextend, it is safe to assume an appropriate number of leading
   27410       // sign/zero bits.
   27411       if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
   27412         SignBits[i] = 25;
   27413       else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
   27414                MVT::i16)
   27415         SignBits[i] = 17;
   27416       else
   27417         return false;
   27418       IsPositive[i] = true;
   27419     } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
   27420       // All the operands of BUILD_VECTOR need to be integer constants.
   27421       // Find the smallest value range which all the operands belong to.
   27422       SignBits[i] = 32;
   27423       IsPositive[i] = true;
   27424       for (const SDValue &SubOp : Opd.getNode()->op_values()) {
   27425         if (SubOp.isUndef())
   27426           continue;
   27427         auto *CN = dyn_cast<ConstantSDNode>(SubOp);
   27428         if (!CN)
   27429           return false;
   27430         APInt IntVal = CN->getAPIntValue();
   27431         if (IntVal.isNegative())
   27432           IsPositive[i] = false;
   27433         SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
   27434       }
   27435     } else {
   27436       SignBits[i] = DAG.ComputeNumSignBits(Opd);
   27437       if (Opd.getOpcode() == ISD::ZERO_EXTEND)
   27438         IsPositive[i] = true;
   27439     }
   27440   }
   27441 
   27442   bool AllPositive = IsPositive[0] && IsPositive[1];
   27443   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
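           // e.g. an operand sign-extended from i8 to i32 has at least 25 sign bits,
           // while one zero-extended from i8 has at least 24 and is known positive.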
   27444   // When ranges are from -128 ~ 127, use MULS8 mode.
   27445   if (MinSignBits >= 25)
   27446     Mode = MULS8;
   27447   // When ranges are from 0 ~ 255, use MULU8 mode.
   27448   else if (AllPositive && MinSignBits >= 24)
   27449     Mode = MULU8;
   27450   // When ranges are from -32768 ~ 32767, use MULS16 mode.
   27451   else if (MinSignBits >= 17)
   27452     Mode = MULS16;
   27453   // When ranges are from 0 ~ 65535, use MULU16 mode.
   27454   else if (AllPositive && MinSignBits >= 16)
   27455     Mode = MULU16;
   27456   else
   27457     return false;
   27458   return true;
   27459 }
   27460 
   27461 /// When the operands of a vector mul are extended from smaller size values,
   27462 /// like i8 and i16, the type of the mul may be shrunk to generate more
   27463 /// efficient code. Two typical patterns are handled:
   27464 /// Pattern1:
   27465 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
   27466 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
   27467 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
   27468 ///     %5 = mul <N x i32> %2, %4
   27469 ///
   27470 /// Pattern2:
   27471 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
   27472 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
   27473 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
   27474 ///     %5 = mul <N x i32> %2, %4
   27475 ///
   27476 /// There are four mul shrinking modes:
   27477 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
   27478 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
   27479 /// generate pmullw+sext32 for it (MULS8 mode).
   27480 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
   27481 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
   27482 /// generate pmullw+zext32 for it (MULU8 mode).
   27483 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
   27484 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
   27485 /// generate pmullw+pmulhw for it (MULS16 mode).
   27486 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
   27487 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
   27488 /// generate pmullw+pmulhuw for it (MULU16 mode).
   27489 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
   27490                                const X86Subtarget &Subtarget) {
   27491   // pmulld is supported since SSE4.1. It is better to use pmulld
   27492   // instead of pmullw+pmulhw.
   27493   if (Subtarget.hasSSE41())
   27494     return SDValue();
   27495 
   27496   ShrinkMode Mode;
   27497   if (!canReduceVMulWidth(N, DAG, Mode))
   27498     return SDValue();
   27499 
   27500   SDLoc DL(N);
   27501   SDValue N0 = N->getOperand(0);
   27502   SDValue N1 = N->getOperand(1);
   27503   EVT VT = N->getOperand(0).getValueType();
   27504   unsigned RegSize = 128;
   27505   MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
   27506   EVT ReducedVT =
   27507       EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
   27508   // Shrink the operands of mul.
   27509   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
   27510   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
   27511 
   27512   if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
   27513     // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
   27514     // lower part is needed.
   27515     SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
   27516     if (Mode == MULU8 || Mode == MULS8) {
   27517       return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
   27518                          DL, VT, MulLo);
   27519     } else {
   27520       MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   27521       // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
   27522       // the higher part is also needed.
   27523       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
   27524                                   ReducedVT, NewN0, NewN1);
   27525 
   27526       // Repack the lower part and higher part result of mul into a wider
   27527       // result.
   27528       // Generate shuffle functioning as punpcklwd.
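               // e.g. for VT == v8i32 the mask is {0, 8, 1, 9, 2, 10, 3, 11}.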
   27529       SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
   27530       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
   27531         ShuffleMask[2 * i] = i;
   27532         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
   27533       }
   27534       SDValue ResLo =
   27535           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
   27536       ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
   27537       // Generate shuffle functioning as punpckhwd.
   27538       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
   27539         ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
   27540         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
   27541       }
   27542       SDValue ResHi =
   27543           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
   27544       ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
   27545       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
   27546     }
   27547   } else {
   27548     // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
   27549     // to legalize the mul explicitly because implicit legalization for type
   27550     // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
   27551     // instructions which will not exist when we explicitly legalize it by
   27552     // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
   27553     // <4 x i16> undef).
   27554     //
   27555     // Legalize the operands of mul.
   27556     SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
   27557                                  DAG.getUNDEF(ReducedVT));
   27558     Ops[0] = NewN0;
   27559     NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
   27560     Ops[0] = NewN1;
   27561     NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
   27562 
   27563     if (Mode == MULU8 || Mode == MULS8) {
   27564       // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
   27565       // part is needed.
   27566       SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
   27567 
   27568       // Convert the type of the mul result to VT.
   27569       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
   27570       SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
   27571                                               : ISD::SIGN_EXTEND_VECTOR_INREG,
   27572                                 DL, ResVT, Mul);
   27573       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
   27574                          DAG.getIntPtrConstant(0, DL));
   27575     } else {
   27576       // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
   27577       // MULU16/MULS16, both parts are needed.
   27578       SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
   27579       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
   27580                                   OpsVT, NewN0, NewN1);
   27581 
   27582       // Repack the lower part and higher part result of mul into a wider
   27583       // result. Make sure the type of mul result is VT.
   27584       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
   27585       SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
   27586       Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
   27587       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
   27588                          DAG.getIntPtrConstant(0, DL));
   27589     }
   27590   }
   27591 }
   27592 
   27593 /// Optimize a single multiply with constant into two operations in order to
   27594 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
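         /// e.g. mul x, 45 becomes (mul (mul x, 9), 5), which can lower to two LEAs
         /// (roughly leal (%x,%x,8) followed by leal (%t,%t,4)).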
   27595 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
   27596                           TargetLowering::DAGCombinerInfo &DCI,
   27597                           const X86Subtarget &Subtarget) {
   27598   EVT VT = N->getValueType(0);
   27599   if (DCI.isBeforeLegalize() && VT.isVector())
   27600     return reduceVMULWidth(N, DAG, Subtarget);
   27601 
   27602   // An imul is usually smaller than the alternative sequence.
   27603   if (DAG.getMachineFunction().getFunction()->optForMinSize())
   27604     return SDValue();
   27605 
   27606   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
   27607     return SDValue();
   27608 
   27609   if (VT != MVT::i64 && VT != MVT::i32)
   27610     return SDValue();
   27611 
   27612   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   27613   if (!C)
   27614     return SDValue();
   27615   uint64_t MulAmt = C->getZExtValue();
   27616   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
   27617     return SDValue();
   27618 
   27619   uint64_t MulAmt1 = 0;
   27620   uint64_t MulAmt2 = 0;
   27621   if ((MulAmt % 9) == 0) {
   27622     MulAmt1 = 9;
   27623     MulAmt2 = MulAmt / 9;
   27624   } else if ((MulAmt % 5) == 0) {
   27625     MulAmt1 = 5;
   27626     MulAmt2 = MulAmt / 5;
   27627   } else if ((MulAmt % 3) == 0) {
   27628     MulAmt1 = 3;
   27629     MulAmt2 = MulAmt / 3;
   27630   }
   27631 
   27632   SDLoc DL(N);
   27633   SDValue NewMul;
   27634   if (MulAmt2 &&
   27635       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
   27636 
   27637     if (isPowerOf2_64(MulAmt2) &&
   27638         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
   27639       // If second multiplier is pow2, issue it first. We want the multiply by
   27640       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
   27641       // is an add.
   27642       std::swap(MulAmt1, MulAmt2);
   27643 
   27644     if (isPowerOf2_64(MulAmt1))
   27645       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   27646                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
   27647     else
   27648       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   27649                            DAG.getConstant(MulAmt1, DL, VT));
   27650 
   27651     if (isPowerOf2_64(MulAmt2))
   27652       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
   27653                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
   27654     else
   27655       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
   27656                            DAG.getConstant(MulAmt2, DL, VT));
   27657   }
   27658 
   27659   if (!NewMul) {
   27660     assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
   27661            && "Both cases that could cause potential overflows should have "
   27662               "already been handled.");
   27663     if (isPowerOf2_64(MulAmt - 1))
   27664       // (mul x, 2^N + 1) => (add (shl x, N), x)
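               // e.g. mul x, 17 --> (add (shl x, 4), x)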
   27665       NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
   27666                                 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   27667                                 DAG.getConstant(Log2_64(MulAmt - 1), DL,
   27668                                 MVT::i8)));
   27669 
   27670     else if (isPowerOf2_64(MulAmt + 1))
   27671       // (mul x, 2^N - 1) => (sub (shl x, N), x)
   27672       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
   27673                                 N->getOperand(0),
   27674                                 DAG.getConstant(Log2_64(MulAmt + 1),
   27675                                 DL, MVT::i8)), N->getOperand(0));
   27676   }
   27677 
   27678   if (NewMul)
   27679     // Do not add new nodes to DAG combiner worklist.
   27680     DCI.CombineTo(N, NewMul, false);
   27681 
   27682   return SDValue();
   27683 }
   27684 
   27685 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
   27686   SDValue N0 = N->getOperand(0);
   27687   SDValue N1 = N->getOperand(1);
   27688   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   27689   EVT VT = N0.getValueType();
   27690 
   27691   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
   27692     // since the result of setcc_c is all zeros or all ones.
   27693   if (VT.isInteger() && !VT.isVector() &&
   27694       N1C && N0.getOpcode() == ISD::AND &&
   27695       N0.getOperand(1).getOpcode() == ISD::Constant) {
   27696     SDValue N00 = N0.getOperand(0);
   27697     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
   27698     const APInt &ShAmt = N1C->getAPIntValue();
   27699     Mask = Mask.shl(ShAmt);
   27700     bool MaskOK = false;
   27701     // We can handle cases concerning bit-widening nodes containing setcc_c if
   27702     // we carefully interrogate the mask to make sure the transform is
   27703     // semantics-preserving.
   27704     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
   27705     // of the underlying setcc_c operation if the setcc_c was zero extended.
   27706     // Consider the following example:
   27707     //   zext(setcc_c)                 -> i32 0x0000FFFF
   27708     //   c1                            -> i32 0x0000FFFF
   27709     //   c2                            -> i32 0x00000001
   27710     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
   27711     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
   27712     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   27713       MaskOK = true;
   27714     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
   27715                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   27716       MaskOK = true;
   27717     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
   27718                 N00.getOpcode() == ISD::ANY_EXTEND) &&
   27719                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   27720       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
   27721     }
   27722     if (MaskOK && Mask != 0) {
   27723       SDLoc DL(N);
   27724       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
   27725     }
   27726   }
   27727 
   27728   // Hardware support for vector shifts is sparse which makes us scalarize the
   27729   // vector operations in many cases. Also, on sandybridge ADD is faster than
   27730   // shl.
   27731   // (shl V, 1) -> add V,V
   27732   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
   27733     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
   27734       assert(N0.getValueType().isVector() && "Invalid vector shift type");
   27735       // We shift all of the values by one. In many cases we do not have
   27736       // hardware support for this operation. This is better expressed as an ADD
   27737       // of two values.
   27738       if (N1SplatC->getAPIntValue() == 1)
   27739         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
   27740     }
   27741 
   27742   return SDValue();
   27743 }
   27744 
   27745 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
   27746   SDValue N0 = N->getOperand(0);
   27747   SDValue N1 = N->getOperand(1);
   27748   EVT VT = N0.getValueType();
   27749   unsigned Size = VT.getSizeInBits();
   27750 
   27751   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
   27752   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
   27753   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
   27754   // depending on sign of (SarConst - [56,48,32,24,16])
   27755 
   27756   // sexts on X86 are MOVs. The MOVs have the same code size as the
   27757   // SHIFTs above (only a SHIFT by 1 has smaller code size).
   27758   // However, the MOVs have two advantages over a SHIFT:
   27759   // 1. MOVs can write to a register that differs from their source.
   27760   // 2. MOVs accept memory operands.
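           // For example, with i32 and ShlConst == 24:
           //   (sra (shl x, 24), 24) -> (sext_inreg x, i8)
           //   (sra (shl x, 24), 27) -> (sra (sext_inreg x, i8), 3)
           //   (sra (shl x, 24), 22) -> (shl (sext_inreg x, i8), 2)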
   27761 
   27762   if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
   27763       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
   27764       N0.getOperand(1).getOpcode() != ISD::Constant)
   27765     return SDValue();
   27766 
   27767   SDValue N00 = N0.getOperand(0);
   27768   SDValue N01 = N0.getOperand(1);
   27769   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
   27770   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
   27771   EVT CVT = N1.getValueType();
   27772 
   27773   if (SarConst.isNegative())
   27774     return SDValue();
   27775 
   27776   for (MVT SVT : MVT::integer_valuetypes()) {
   27777     unsigned ShiftSize = SVT.getSizeInBits();
   27778     // Skip types that have no corresponding sext/zext and ShlConst
   27779     // values that are not one of [56,48,32,24,16].
   27780     if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
   27781       continue;
   27782     SDLoc DL(N);
   27783     SDValue NN =
   27784         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
   27785     SarConst = SarConst - (Size - ShiftSize);
   27786     if (SarConst == 0)
   27787       return NN;
   27788     else if (SarConst.isNegative())
   27789       return DAG.getNode(ISD::SHL, DL, VT, NN,
   27790                          DAG.getConstant(-SarConst, DL, CVT));
   27791     else
   27792       return DAG.getNode(ISD::SRA, DL, VT, NN,
   27793                          DAG.getConstant(SarConst, DL, CVT));
   27794   }
   27795   return SDValue();
   27796 }
   27797 
   27798 /// \brief Returns a vector of 0s if the input node is a vector logical
   27799 /// shift by a constant amount that is known to be greater than or equal
   27800 /// to the vector element size in bits.
   27801 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
   27802                                       const X86Subtarget &Subtarget) {
   27803   EVT VT = N->getValueType(0);
   27804 
   27805   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
   27806       (!Subtarget.hasInt256() ||
   27807        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
   27808     return SDValue();
   27809 
   27810   SDValue Amt = N->getOperand(1);
   27811   SDLoc DL(N);
   27812   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
   27813     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
   27814       const APInt &ShiftAmt = AmtSplat->getAPIntValue();
   27815       unsigned MaxAmount =
   27816         VT.getSimpleVT().getVectorElementType().getSizeInBits();
   27817 
   27818       // SSE2/AVX2 logical shifts always return a vector of 0s
   27819       // if the shift amount is bigger than or equal to
   27820       // the element size. The constant shift amount will be
   27821       // encoded as an 8-bit immediate.
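               // For example, (srl v4i32 X, splat(32)) and (shl v8i16 X,
               // splat(16)) both fold to a zero vector here.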
   27822       if (ShiftAmt.trunc(8).uge(MaxAmount))
   27823         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
   27824     }
   27825 
   27826   return SDValue();
   27827 }
   27828 
   27829 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
   27830                             TargetLowering::DAGCombinerInfo &DCI,
   27831                             const X86Subtarget &Subtarget) {
   27832   if (N->getOpcode() == ISD::SHL)
   27833     if (SDValue V = combineShiftLeft(N, DAG))
   27834       return V;
   27835 
   27836   if (N->getOpcode() == ISD::SRA)
   27837     if (SDValue V = combineShiftRightAlgebraic(N, DAG))
   27838       return V;
   27839 
   27840   // Try to fold this logical shift into a zero vector.
   27841   if (N->getOpcode() != ISD::SRA)
   27842     if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
   27843       return V;
   27844 
   27845   return SDValue();
   27846 }
   27847 
   27848 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
   27849 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
   27850 /// OR -> CMPNEQSS.
   27851 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
   27852                                    TargetLowering::DAGCombinerInfo &DCI,
   27853                                    const X86Subtarget &Subtarget) {
   27854   unsigned opcode;
   27855 
   27856   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
   27857   // we're requiring SSE2 for both.
   27858   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
   27859     SDValue N0 = N->getOperand(0);
   27860     SDValue N1 = N->getOperand(1);
   27861     SDValue CMP0 = N0->getOperand(1);
   27862     SDValue CMP1 = N1->getOperand(1);
   27863     SDLoc DL(N);
   27864 
   27865     // The SETCCs should both refer to the same CMP.
   27866     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
   27867       return SDValue();
   27868 
   27869     SDValue CMP00 = CMP0->getOperand(0);
   27870     SDValue CMP01 = CMP0->getOperand(1);
   27871     EVT     VT    = CMP00.getValueType();
   27872 
   27873     if (VT == MVT::f32 || VT == MVT::f64) {
   27874       bool ExpectingFlags = false;
   27875       // Check for any users that want flags:
   27876       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
   27877            !ExpectingFlags && UI != UE; ++UI)
   27878         switch (UI->getOpcode()) {
   27879         default:
   27880         case ISD::BR_CC:
   27881         case ISD::BRCOND:
   27882         case ISD::SELECT:
   27883           ExpectingFlags = true;
   27884           break;
   27885         case ISD::CopyToReg:
   27886         case ISD::SIGN_EXTEND:
   27887         case ISD::ZERO_EXTEND:
   27888         case ISD::ANY_EXTEND:
   27889           break;
   27890         }
   27891 
   27892       if (!ExpectingFlags) {
   27893         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
   27894         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
   27895 
   27896         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
   27897           X86::CondCode tmp = cc0;
   27898           cc0 = cc1;
   27899           cc1 = tmp;
   27900         }
   27901 
   27902         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
   27903             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
   27904           // FIXME: need symbolic constants for these magic numbers.
   27905           // See X86ATTInstPrinter.cpp:printSSECC().
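                   // cc0 == COND_E with cc1 == COND_NP is the ordered-equal case
                   // (SSE CC immediate 0, CMPEQSS); cc0 == COND_NE with
                   // cc1 == COND_P is the unordered-or-not-equal case
                   // (SSE CC immediate 4, CMPNEQSS).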
   27906           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
   27907           if (Subtarget.hasAVX512()) {
   27908             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
   27909                                          CMP01,
   27910                                          DAG.getConstant(x86cc, DL, MVT::i8));
   27911             if (N->getValueType(0) != MVT::i1)
   27912               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
   27913                                  FSetCC);
   27914             return FSetCC;
   27915           }
   27916           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
   27917                                               CMP00.getValueType(), CMP00, CMP01,
   27918                                               DAG.getConstant(x86cc, DL,
   27919                                                               MVT::i8));
   27920 
   27921           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
   27922           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
   27923 
   27924           if (is64BitFP && !Subtarget.is64Bit()) {
   27925             // On a 32-bit target, we cannot bitcast the 64-bit float to a
   27926             // 64-bit integer, since that's not a legal type. Since
   27927             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
   27928             // bits, but can do this little dance to extract the lowest 32 bits
   27929             // and work with those going forward.
   27930             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
   27931                                            OnesOrZeroesF);
   27932             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
   27933             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
   27934                                         Vector32, DAG.getIntPtrConstant(0, DL));
   27935             IntVT = MVT::i32;
   27936           }
   27937 
   27938           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
   27939           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
   27940                                       DAG.getConstant(1, DL, IntVT));
   27941           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
   27942                                               ANDed);
   27943           return OneBitOfTruth;
   27944         }
   27945       }
   27946     }
   27947   }
   27948   return SDValue();
   27949 }
   27950 
   27951 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
   27952 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
   27953   assert(N->getOpcode() == ISD::AND);
   27954 
   27955   EVT VT = N->getValueType(0);
   27956   SDValue N0 = N->getOperand(0);
   27957   SDValue N1 = N->getOperand(1);
   27958   SDLoc DL(N);
   27959 
   27960   if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
   27961       VT != MVT::v8i64 && VT != MVT::v16i32 &&
   27962       VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
   27963     return SDValue();
   27964 
   27965   // Canonicalize XOR to the left.
   27966   if (N1.getOpcode() == ISD::XOR)
   27967     std::swap(N0, N1);
   27968 
   27969   if (N0.getOpcode() != ISD::XOR)
   27970     return SDValue();
   27971 
   27972   SDValue N00 = N0->getOperand(0);
   27973   SDValue N01 = N0->getOperand(1);
   27974 
   27975   N01 = peekThroughBitcasts(N01);
   27976 
   27977   // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
   27978   // insert_subvector building a 256-bit AllOnes vector.
   27979   if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
   27980     if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
   27981       return SDValue();
   27982 
   27983     SDValue V1 = N01->getOperand(0);
   27984     SDValue V2 = N01->getOperand(1);
   27985     if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
   27986         !V1.getOperand(0).isUndef() ||
   27987         !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
   27988         !ISD::isBuildVectorAllOnes(V2.getNode()))
   27989       return SDValue();
   27990   }
   27991   return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
   27992 }
   27993 
   27994 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
   27995 // register. In most cases we actually compare or select YMM-sized registers
   27996 // and mixing the two types creates horrible code. This method optimizes
   27997 // some of the transition sequences.
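         // For example, (sext (and (trunc A to v8i16), (trunc B to v8i16)) to v8i32),
         // where A and B are v8i32 values, becomes
         // (sext_inreg (and A, B), v8i16), keeping the logic in YMM-sized registers.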
   27998 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   27999                                  TargetLowering::DAGCombinerInfo &DCI,
   28000                                  const X86Subtarget &Subtarget) {
   28001   EVT VT = N->getValueType(0);
   28002   if (!VT.is256BitVector())
   28003     return SDValue();
   28004 
   28005   assert((N->getOpcode() == ISD::ANY_EXTEND ||
   28006           N->getOpcode() == ISD::ZERO_EXTEND ||
   28007           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
   28008 
   28009   SDValue Narrow = N->getOperand(0);
   28010   EVT NarrowVT = Narrow->getValueType(0);
   28011   if (!NarrowVT.is128BitVector())
   28012     return SDValue();
   28013 
   28014   if (Narrow->getOpcode() != ISD::XOR &&
   28015       Narrow->getOpcode() != ISD::AND &&
   28016       Narrow->getOpcode() != ISD::OR)
   28017     return SDValue();
   28018 
   28019   SDValue N0  = Narrow->getOperand(0);
   28020   SDValue N1  = Narrow->getOperand(1);
   28021   SDLoc DL(Narrow);
   28022 
   28023   // The left side has to be a trunc.
   28024   if (N0.getOpcode() != ISD::TRUNCATE)
   28025     return SDValue();
   28026 
   28027   // The type of the truncated inputs.
   28028   EVT WideVT = N0->getOperand(0)->getValueType(0);
   28029   if (WideVT != VT)
   28030     return SDValue();
   28031 
   28032   // The right side has to be a 'trunc' or a constant vector.
   28033   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
   28034   ConstantSDNode *RHSConstSplat = nullptr;
   28035   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
   28036     RHSConstSplat = RHSBV->getConstantSplatNode();
   28037   if (!RHSTrunc && !RHSConstSplat)
   28038     return SDValue();
   28039 
   28040   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   28041 
   28042   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
   28043     return SDValue();
   28044 
   28045   // Set N0 and N1 to hold the inputs to the new wide operation.
   28046   N0 = N0->getOperand(0);
   28047   if (RHSConstSplat) {
   28048     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
   28049                      SDValue(RHSConstSplat, 0));
   28050     N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
   28051   } else if (RHSTrunc) {
   28052     N1 = N1->getOperand(0);
   28053   }
   28054 
   28055   // Generate the wide operation.
   28056   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
   28057   unsigned Opcode = N->getOpcode();
   28058   switch (Opcode) {
   28059   case ISD::ANY_EXTEND:
   28060     return Op;
   28061   case ISD::ZERO_EXTEND: {
   28062     unsigned InBits = NarrowVT.getScalarSizeInBits();
   28063     APInt Mask = APInt::getAllOnesValue(InBits);
   28064     Mask = Mask.zext(VT.getScalarSizeInBits());
   28065     return DAG.getNode(ISD::AND, DL, VT,
   28066                        Op, DAG.getConstant(Mask, DL, VT));
   28067   }
   28068   case ISD::SIGN_EXTEND:
   28069     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
   28070                        Op, DAG.getValueType(NarrowVT));
   28071   default:
   28072     llvm_unreachable("Unexpected opcode");
   28073   }
   28074 }
   28075 
   28076 static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
   28077                                  TargetLowering::DAGCombinerInfo &DCI,
   28078                                  const X86Subtarget &Subtarget) {
   28079   SDValue N0 = N->getOperand(0);
   28080   SDValue N1 = N->getOperand(1);
   28081   SDLoc DL(N);
   28082 
   28083   // A vector zext_in_reg may be represented as a shuffle,
   28084   // feeding into a bitcast (this represents anyext) feeding into
   28085   // an and with a mask.
   28086   // We'd like to try to combine that into a shuffle with zero
   28087   // plus a bitcast, removing the and.
   28088   if (N0.getOpcode() != ISD::BITCAST ||
   28089       N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
   28090     return SDValue();
   28091 
   28092   // The other side of the AND should be a splat of 2^C, where C
   28093   // is the number of bits in the source type.
   28094   N1 = peekThroughBitcasts(N1);
   28095   if (N1.getOpcode() != ISD::BUILD_VECTOR)
   28096     return SDValue();
   28097   BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
   28098 
   28099   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
   28100   EVT SrcType = Shuffle->getValueType(0);
   28101 
   28102   // We expect a single-source shuffle
   28103   if (!Shuffle->getOperand(1)->isUndef())
   28104     return SDValue();
   28105 
   28106   unsigned SrcSize = SrcType.getScalarSizeInBits();
   28107   unsigned NumElems = SrcType.getVectorNumElements();
   28108 
   28109   APInt SplatValue, SplatUndef;
   28110   unsigned SplatBitSize;
   28111   bool HasAnyUndefs;
   28112   if (!Vector->isConstantSplat(SplatValue, SplatUndef,
   28113                                 SplatBitSize, HasAnyUndefs))
   28114     return SDValue();
   28115 
   28116   unsigned ResSize = N1.getValueType().getScalarSizeInBits();
   28117   // Make sure the splat matches the mask we expect
   28118   if (SplatBitSize > ResSize ||
   28119       (SplatValue + 1).exactLogBase2() != (int)SrcSize)
   28120     return SDValue();
   28121 
   28122   // Make sure the input and output size make sense
   28123   if (SrcSize >= ResSize || ResSize % SrcSize)
   28124     return SDValue();
   28125 
   28126   // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
   28127   // The number of u's between each two values depends on the ratio between
   28128   // the source and dest type.
   28129   unsigned ZextRatio = ResSize / SrcSize;
   28130   bool IsZext = true;
   28131   for (unsigned i = 0; i != NumElems; ++i) {
   28132     if (i % ZextRatio) {
   28133       if (Shuffle->getMaskElt(i) > 0) {
   28134         // Expected undef
   28135         IsZext = false;
   28136         break;
   28137       }
   28138     } else {
   28139       if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
   28140         // Expected element number
   28141         IsZext = false;
   28142         break;
   28143       }
   28144     }
   28145   }
   28146 
   28147   if (!IsZext)
   28148     return SDValue();
   28149 
   28150   // Ok, perform the transformation - replace the shuffle with
   28151   // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
   28152   // (instead of undef) where the k elements come from the zero vector.
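           // For example, with a v16i8 source and ZextRatio == 4 the new mask is
           // <0,16,16,16, 1,16,16,16, 2,16,16,16, 3,16,16,16>, where index 16
           // selects element 0 of the zero-vector operand.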
   28153   SmallVector<int, 8> Mask;
   28154   for (unsigned i = 0; i != NumElems; ++i)
   28155     if (i % ZextRatio)
   28156       Mask.push_back(NumElems);
   28157     else
   28158       Mask.push_back(i / ZextRatio);
   28159 
   28160   SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
   28161     Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
   28162   return DAG.getBitcast(N0.getValueType(), NewShuffle);
   28163 }
   28164 
   28165 /// If both input operands of a logic op are being cast from floating point
   28166 /// types, try to convert this into a floating point logic node to avoid
   28167 /// unnecessary moves from SSE to integer registers.
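         /// For example, (and (bitcast f32 X to i32), (bitcast f32 Y to i32)) becomes
         /// (bitcast (X86ISD::FAND X, Y) to i32) when SSE1 is available.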
   28168 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
   28169                                         const X86Subtarget &Subtarget) {
   28170   unsigned FPOpcode = ISD::DELETED_NODE;
   28171   if (N->getOpcode() == ISD::AND)
   28172     FPOpcode = X86ISD::FAND;
   28173   else if (N->getOpcode() == ISD::OR)
   28174     FPOpcode = X86ISD::FOR;
   28175   else if (N->getOpcode() == ISD::XOR)
   28176     FPOpcode = X86ISD::FXOR;
   28177 
   28178   assert(FPOpcode != ISD::DELETED_NODE &&
   28179          "Unexpected input node for FP logic conversion");
   28180 
   28181   EVT VT = N->getValueType(0);
   28182   SDValue N0 = N->getOperand(0);
   28183   SDValue N1 = N->getOperand(1);
   28184   SDLoc DL(N);
   28185   if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
   28186       ((Subtarget.hasSSE1() && VT == MVT::i32) ||
   28187        (Subtarget.hasSSE2() && VT == MVT::i64))) {
   28188     SDValue N00 = N0.getOperand(0);
   28189     SDValue N10 = N1.getOperand(0);
   28190     EVT N00Type = N00.getValueType();
   28191     EVT N10Type = N10.getValueType();
   28192     if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
   28193       SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
   28194       return DAG.getBitcast(VT, FPLogic);
   28195     }
   28196   }
   28197   return SDValue();
   28198 }
   28199 
   28200 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
   28201 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
   28202 /// eliminate loading the vector constant mask value. This relies on the fact
   28203 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
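         /// For example, with v4i32: (and (pcmpgt X, Y), splat(1)) becomes
         /// (vsrli (pcmpgt X, Y), 31); shifting the all-ones/all-zeros lanes right
         /// by 31 leaves exactly 0 or 1 in each element.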
   28204 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
   28205   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
   28206   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
   28207 
   28208   // TODO: Use AssertSext to mark any nodes that have the property of producing
   28209   // all-ones or all-zeros. Then check for that node rather than particular
   28210   // opcodes.
   28211   if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
   28212     return SDValue();
   28213 
   28214   // The existence of the PCMP node guarantees that we have the required SSE2 or
   28215   // AVX2 for a shift of this vector type, but there is no vector shift by
   28216   // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
   28217   // masked compare nodes, so they should not make it here.
   28218   EVT VT0 = Op0.getValueType();
   28219   EVT VT1 = Op1.getValueType();
   28220   unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
   28221   if (VT0 != VT1 || EltBitWidth == 8)
   28222     return SDValue();
   28223 
   28224   assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
   28225 
   28226   APInt SplatVal;
   28227   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
   28228     return SDValue();
   28229 
   28230   SDLoc DL(N);
   28231   SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
   28232   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
   28233   return DAG.getBitcast(N->getValueType(0), Shift);
   28234 }
   28235 
   28236 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   28237                           TargetLowering::DAGCombinerInfo &DCI,
   28238                           const X86Subtarget &Subtarget) {
   28239   if (DCI.isBeforeLegalizeOps())
   28240     return SDValue();
   28241 
   28242   if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
   28243     return Zext;
   28244 
   28245   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
   28246     return R;
   28247 
   28248   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   28249     return FPLogic;
   28250 
   28251   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
   28252     return R;
   28253 
   28254   if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
   28255     return ShiftRight;
   28256 
   28257   EVT VT = N->getValueType(0);
   28258   SDValue N0 = N->getOperand(0);
   28259   SDValue N1 = N->getOperand(1);
   28260   SDLoc DL(N);
   28261 
   28262   // Create BEXTR instructions
   28263   // BEXTR is ((X >> imm) & (2**size-1))
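           // For example, (and (srl X, 8), 0xFFF) becomes BEXTR with control value
           // (12 << 8) | 8, i.e. extract 12 bits starting at bit 8.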
   28264   if (VT != MVT::i32 && VT != MVT::i64)
   28265     return SDValue();
   28266 
   28267   if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
   28268     return SDValue();
   28269   if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
   28270     return SDValue();
   28271 
   28272   ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
   28273   ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   28274   if (MaskNode && ShiftNode) {
   28275     uint64_t Mask = MaskNode->getZExtValue();
   28276     uint64_t Shift = ShiftNode->getZExtValue();
   28277     if (isMask_64(Mask)) {
   28278       uint64_t MaskSize = countPopulation(Mask);
   28279       if (Shift + MaskSize <= VT.getSizeInBits())
   28280         return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
   28281                            DAG.getConstant(Shift | (MaskSize << 8), DL,
   28282                                            VT));
   28283     }
   28284   }
   28285   return SDValue();
   28286 }
   28287 
   28288 // Try to fold:
   28289 //   (or (and (m, y), (pandn m, x)))
   28290 // into:
   28291 //   (vselect m, x, y)
   28292 // As a special case, try to fold:
   28293 //   (or (and (m, (sub 0, x)), (pandn m, x)))
   28294 // into:
   28295 //   (sub (xor X, M), M)
   28296 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
   28297                                             const X86Subtarget &Subtarget) {
   28298   assert(N->getOpcode() == ISD::OR);
   28299 
   28300   SDValue N0 = N->getOperand(0);
   28301   SDValue N1 = N->getOperand(1);
   28302   EVT VT = N->getValueType(0);
   28303 
   28304   if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
   28305     return SDValue();
   28306   assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
   28307 
   28308   // Canonicalize pandn to RHS
   28309   if (N0.getOpcode() == X86ISD::ANDNP)
   28310     std::swap(N0, N1);
   28311 
   28312   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
   28313     return SDValue();
   28314 
   28315   SDValue Mask = N1.getOperand(0);
   28316   SDValue X = N1.getOperand(1);
   28317   SDValue Y;
   28318   if (N0.getOperand(0) == Mask)
   28319     Y = N0.getOperand(1);
   28320   if (N0.getOperand(1) == Mask)
   28321     Y = N0.getOperand(0);
   28322 
   28323   // Check to see if the mask appeared in both the AND and ANDNP.
   28324   if (!Y.getNode())
   28325     return SDValue();
   28326 
   28327   // Validate that X, Y, and Mask are bitcasts, and see through them.
   28328   Mask = peekThroughBitcasts(Mask);
   28329   X = peekThroughBitcasts(X);
   28330   Y = peekThroughBitcasts(Y);
   28331 
   28332   EVT MaskVT = Mask.getValueType();
   28333 
   28334   // Validate that the Mask operand is a vector sra node.
   28335   // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
   28336   // there is no psrai.b
   28337   unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
   28338   unsigned SraAmt = ~0;
   28339   if (Mask.getOpcode() == ISD::SRA) {
   28340     if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
   28341       if (auto *AmtConst = AmtBV->getConstantSplatNode())
   28342         SraAmt = AmtConst->getZExtValue();
   28343   } else if (Mask.getOpcode() == X86ISD::VSRAI) {
   28344     SDValue SraC = Mask.getOperand(1);
   28345     SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
   28346   }
   28347   if ((SraAmt + 1) != EltBits)
   28348     return SDValue();
   28349 
   28350   SDLoc DL(N);
   28351 
   28352   // Try to match:
   28353   //   (or (and (M, (sub 0, X)), (pandn M, X)))
   28354   // which is a special case of vselect:
   28355   //   (vselect M, (sub 0, X), X)
   28356   // Per:
   28357   // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
   28358   // We know that, if fNegate is 0 or 1:
   28359   //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
   28360   //
   28361   // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
   28362   //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
   28363   //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
   28364   // This lets us transform our vselect to:
   28365   //   (add (xor X, M), (and M, 1))
   28366   // And further to:
   28367   //   (sub (xor X, M), M)
   28368   if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
   28369     auto IsNegV = [](SDNode *N, SDValue V) {
   28370       return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
   28371         ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
   28372     };
   28373     SDValue V;
   28374     if (IsNegV(Y.getNode(), X))
   28375       V = X;
   28376     else if (IsNegV(X.getNode(), Y))
   28377       V = Y;
   28378 
   28379     if (V) {
   28380       assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
   28381       SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
   28382       SDValue SubOp2 = Mask;
   28383 
   28384       // If the negate was on the false side of the select, then
   28385       // the operands of the SUB need to be swapped. PR 27251.
   28386       // This is because the pattern being matched above is
   28387       // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
   28388       // but if the pattern matched was
   28389       // (vselect M, X, (sub (0, X))), that is really negation of the pattern
   28390       // above, -(vselect M, (sub 0, X), X), and therefore the replacement
   28391       // pattern also needs to be a negation of the replacement pattern above.
   28392       // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
   28393       // sub accomplishes the negation of the replacement pattern.
   28394       if (V == Y)
   28395          std::swap(SubOp1, SubOp2);
   28396 
   28397       return DAG.getBitcast(VT,
   28398                             DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
   28399     }
   28400   }
   28401 
   28402   // PBLENDVB is only available on SSE 4.1.
   28403   if (!Subtarget.hasSSE41())
   28404     return SDValue();
   28405 
   28406   MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
   28407 
   28408   X = DAG.getBitcast(BlendVT, X);
   28409   Y = DAG.getBitcast(BlendVT, Y);
   28410   Mask = DAG.getBitcast(BlendVT, Mask);
   28411   Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
   28412   return DAG.getBitcast(VT, Mask);
   28413 }
   28414 
   28415 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   28416                          TargetLowering::DAGCombinerInfo &DCI,
   28417                          const X86Subtarget &Subtarget) {
   28418   if (DCI.isBeforeLegalizeOps())
   28419     return SDValue();
   28420 
   28421   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
   28422     return R;
   28423 
   28424   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   28425     return FPLogic;
   28426 
   28427   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
   28428     return R;
   28429 
   28430   SDValue N0 = N->getOperand(0);
   28431   SDValue N1 = N->getOperand(1);
   28432   EVT VT = N->getValueType(0);
   28433 
   28434   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
   28435     return SDValue();
   28436 
   28437   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
   28438   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
   28439 
   28440   // SHLD/SHRD instructions have lower register pressure, but on some
   28441   // platforms they have higher latency than the equivalent
   28442   // series of shifts/or that would otherwise be generated.
   28443   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   28444   // have higher latencies and we are not optimizing for size.
   28445   if (!OptForSize && Subtarget.isSHLDSlow())
   28446     return SDValue();
   28447 
   28448   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
   28449     std::swap(N0, N1);
   28450   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
   28451     return SDValue();
   28452   if (!N0.hasOneUse() || !N1.hasOneUse())
   28453     return SDValue();
   28454 
   28455   SDValue ShAmt0 = N0.getOperand(1);
   28456   if (ShAmt0.getValueType() != MVT::i8)
   28457     return SDValue();
   28458   SDValue ShAmt1 = N1.getOperand(1);
   28459   if (ShAmt1.getValueType() != MVT::i8)
   28460     return SDValue();
   28461   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
   28462     ShAmt0 = ShAmt0.getOperand(0);
   28463   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
   28464     ShAmt1 = ShAmt1.getOperand(0);
   28465 
   28466   SDLoc DL(N);
   28467   unsigned Opc = X86ISD::SHLD;
   28468   SDValue Op0 = N0.getOperand(0);
   28469   SDValue Op1 = N1.getOperand(0);
   28470   if (ShAmt0.getOpcode() == ISD::SUB) {
   28471     Opc = X86ISD::SHRD;
   28472     std::swap(Op0, Op1);
   28473     std::swap(ShAmt0, ShAmt1);
   28474   }
   28475 
   28476   unsigned Bits = VT.getSizeInBits();
   28477   if (ShAmt1.getOpcode() == ISD::SUB) {
   28478     SDValue Sum = ShAmt1.getOperand(0);
   28479     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
   28480       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
   28481       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
   28482         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
   28483       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
   28484         return DAG.getNode(Opc, DL, VT,
   28485                            Op0, Op1,
   28486                            DAG.getNode(ISD::TRUNCATE, DL,
   28487                                        MVT::i8, ShAmt0));
   28488     }
   28489   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
   28490     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
   28491     if (ShAmt0C &&
   28492         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
   28493       return DAG.getNode(Opc, DL, VT,
   28494                          N0.getOperand(0), N1.getOperand(0),
   28495                          DAG.getNode(ISD::TRUNCATE, DL,
   28496                                        MVT::i8, ShAmt0));
   28497   }
   28498 
   28499   return SDValue();
   28500 }
   28501 
   28502 // Generate NEG and CMOV for integer abs.
   28503 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
   28504   EVT VT = N->getValueType(0);
   28505 
   28506   // Since X86 does not have CMOV for 8-bit integer, we don't convert
   28507   // 8-bit integer abs to NEG and CMOV.
   28508   if (VT.isInteger() && VT.getSizeInBits() == 8)
   28509     return SDValue();
   28510 
   28511   SDValue N0 = N->getOperand(0);
   28512   SDValue N1 = N->getOperand(1);
   28513   SDLoc DL(N);
   28514 
   28515   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
   28516   // and change it to SUB and CMOV.
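           // With Y == X >>s (size(X)-1), (X + Y) ^ Y is X when X >= 0 and -X when
           // X < 0, i.e. abs(X), which the SUB + CMOV sequence below reproduces.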
   28517   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
   28518       N0.getOpcode() == ISD::ADD &&
   28519       N0.getOperand(1) == N1 &&
   28520       N1.getOpcode() == ISD::SRA &&
   28521       N1.getOperand(0) == N0.getOperand(0))
   28522     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
   28523       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
   28524         // Generate SUB & CMOV.
   28525         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
   28526                                   DAG.getConstant(0, DL, VT), N0.getOperand(0));
   28527 
   28528         SDValue Ops[] = { N0.getOperand(0), Neg,
   28529                           DAG.getConstant(X86::COND_GE, DL, MVT::i8),
   28530                           SDValue(Neg.getNode(), 1) };
   28531         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
   28532       }
   28533   return SDValue();
   28534 }
   28535 
   28536 /// Try to turn tests against the signbit in the form of:
   28537 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
   28538 /// into:
   28539 ///   SETGT(X, -1)
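         /// (The shifted-out bit is the sign bit of X, so xoring it with 1 yields 1
         /// exactly when X is non-negative, i.e. X > -1.)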
   28540 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
   28541   // This is only worth doing if the output type is i8 or i1.
   28542   EVT ResultType = N->getValueType(0);
   28543   if (ResultType != MVT::i8 && ResultType != MVT::i1)
   28544     return SDValue();
   28545 
   28546   SDValue N0 = N->getOperand(0);
   28547   SDValue N1 = N->getOperand(1);
   28548 
   28549   // We should be performing an xor against a truncated shift.
   28550   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
   28551     return SDValue();
   28552 
   28553   // Make sure we are performing an xor against one.
   28554   if (!isOneConstant(N1))
   28555     return SDValue();
   28556 
   28557   // SetCC on x86 zero extends so only act on this if it's a logical shift.
   28558   SDValue Shift = N0.getOperand(0);
   28559   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
   28560     return SDValue();
   28561 
   28562   // Make sure we are truncating from one of i16, i32 or i64.
   28563   EVT ShiftTy = Shift.getValueType();
   28564   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
   28565     return SDValue();
   28566 
   28567   // Make sure the shift amount extracts the sign bit.
   28568   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
   28569       Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
   28570     return SDValue();
   28571 
   28572   // Create a greater-than comparison against -1.
   28573   // N.B. Using SETGE against 0 works, but we want a canonical-looking
   28574   // comparison; using SETGT matches up with what TranslateX86CC produces.
   28575   SDLoc DL(N);
   28576   SDValue ShiftOp = Shift.getOperand(0);
   28577   EVT ShiftOpTy = ShiftOp.getValueType();
   28578   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   28579   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
   28580                                                *DAG.getContext(), ResultType);
   28581   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
   28582                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
   28583   if (SetCCResultType != ResultType)
   28584     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
   28585   return Cond;
   28586 }
   28587 
   28588 /// Turn vector tests of the signbit in the form of:
   28589 ///   xor (sra X, elt_size(X)-1), -1
   28590 /// into:
   28591 ///   pcmpgt X, -1
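         /// (The arithmetic shift smears the sign bit across each lane, so xoring
         /// with all-ones yields all-ones exactly in the non-negative lanes, which
         /// is what pcmpgt against -1 computes.)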
   28592 ///
   28593 /// This should be called before type legalization because the pattern may not
   28594 /// persist after that.
   28595 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
   28596                                          const X86Subtarget &Subtarget) {
   28597   EVT VT = N->getValueType(0);
   28598   if (!VT.isSimple())
   28599     return SDValue();
   28600 
   28601   switch (VT.getSimpleVT().SimpleTy) {
   28602   default: return SDValue();
   28603   case MVT::v16i8:
   28604   case MVT::v8i16:
   28605   case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
   28606   case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
   28607   case MVT::v32i8:
   28608   case MVT::v16i16:
   28609   case MVT::v8i32:
   28610   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
   28611   }
   28612 
   28613   // There must be a shift right algebraic before the xor, and the xor must be a
   28614   // 'not' operation.
   28615   SDValue Shift = N->getOperand(0);
   28616   SDValue Ones = N->getOperand(1);
   28617   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
   28618       !ISD::isBuildVectorAllOnes(Ones.getNode()))
   28619     return SDValue();
   28620 
   28621   // The shift should be smearing the sign bit across each vector element.
   28622   auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
   28623   if (!ShiftBV)
   28624     return SDValue();
   28625 
   28626   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
   28627   auto *ShiftAmt = ShiftBV->getConstantSplatNode();
   28628   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
   28629     return SDValue();
   28630 
   28631   // Create a greater-than comparison against -1. We don't use the more obvious
   28632   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
   28633   return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
   28634 }
   28635 
   28636 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
   28637                                  TargetLowering::DAGCombinerInfo &DCI,
   28638                                  const X86Subtarget &Subtarget) {
   28639   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
   28640     return Cmp;
   28641 
   28642   if (DCI.isBeforeLegalizeOps())
   28643     return SDValue();
   28644 
   28645   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
   28646     return RV;
   28647 
   28648   if (Subtarget.hasCMov())
   28649     if (SDValue RV = combineIntegerAbs(N, DAG))
   28650       return RV;
   28651 
   28652   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   28653     return FPLogic;
   28654 
   28655   return SDValue();
   28656 }
   28657 
   28658 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
   28659 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
   28660 /// X86ISD::AVG instruction.
   28661 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
   28662                                 const X86Subtarget &Subtarget,
   28663                                 const SDLoc &DL) {
   28664   if (!VT.isVector() || !VT.isSimple())
   28665     return SDValue();
   28666   EVT InVT = In.getValueType();
   28667   unsigned NumElems = VT.getVectorNumElements();
   28668 
   28669   EVT ScalarVT = VT.getVectorElementType();
   28670   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
   28671         isPowerOf2_32(NumElems)))
   28672     return SDValue();
   28673 
   28674   // InScalarVT is the intermediate type in AVG pattern and it should be greater
   28675   // than the original input type (i8/i16).
   28676   EVT InScalarVT = InVT.getVectorElementType();
   28677   if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
   28678     return SDValue();
   28679 
   28680   if (!Subtarget.hasSSE2())
   28681     return SDValue();
   28682   if (Subtarget.hasAVX512()) {
   28683     if (VT.getSizeInBits() > 512)
   28684       return SDValue();
   28685   } else if (Subtarget.hasAVX2()) {
   28686     if (VT.getSizeInBits() > 256)
   28687       return SDValue();
   28688   } else {
   28689     if (VT.getSizeInBits() > 128)
   28690       return SDValue();
   28691   }
   28692 
   28693   // Detect the following pattern:
   28694   //
   28695   //   %1 = zext <N x i8> %a to <N x i32>
   28696   //   %2 = zext <N x i8> %b to <N x i32>
   28697   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
   28698   //   %4 = add nuw nsw <N x i32> %3, %2
   28699   //   %5 = lshr <N x i32> %4, <i32 1 x N>
   28700   //   %6 = trunc <N x i32> %5 to <N x i8>
   28701   //
   28702   // In AVX512, the last instruction can also be a trunc store.
   28703 
   28704   if (In.getOpcode() != ISD::SRL)
   28705     return SDValue();
   28706 
   28707   // A lambda checking the given SDValue is a constant vector and each element
   28708   // is in the range [Min, Max].
   28709   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
   28710     BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
   28711     if (!BV || !BV->isConstant())
   28712       return false;
   28713     for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
   28714       ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
   28715       if (!C)
   28716         return false;
   28717       uint64_t Val = C->getZExtValue();
   28718       if (Val < Min || Val > Max)
   28719         return false;
   28720     }
   28721     return true;
   28722   };
   28723 
   28724   // Check if each element of the vector is left-shifted by one.
   28725   auto LHS = In.getOperand(0);
   28726   auto RHS = In.getOperand(1);
   28727   if (!IsConstVectorInRange(RHS, 1, 1))
   28728     return SDValue();
   28729   if (LHS.getOpcode() != ISD::ADD)
   28730     return SDValue();
   28731 
   28732   // Detect a pattern of a + b + 1 where the order doesn't matter.
   28733   SDValue Operands[3];
   28734   Operands[0] = LHS.getOperand(0);
   28735   Operands[1] = LHS.getOperand(1);
   28736 
   28737   // Take care of the case when one of the operands is a constant vector whose
   28738   // element is in the range [1, 256].
   28739   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
   28740       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
   28741       Operands[0].getOperand(0).getValueType() == VT) {
   28742     // The pattern is detected. Subtract one from the constant vector, then
   28743     // demote it and emit X86ISD::AVG instruction.
   28744     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
   28745     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
   28746     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
   28747     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
   28748                        Operands[1]);
   28749   }
   28750 
   28751   if (Operands[0].getOpcode() == ISD::ADD)
   28752     std::swap(Operands[0], Operands[1]);
   28753   else if (Operands[1].getOpcode() != ISD::ADD)
   28754     return SDValue();
   28755   Operands[2] = Operands[1].getOperand(0);
   28756   Operands[1] = Operands[1].getOperand(1);
   28757 
   28758   // Now we have three operands of two additions. Check that one of them is a
   28759   // constant vector with ones, and the other two are promoted from i8/i16.
   28760   for (int i = 0; i < 3; ++i) {
   28761     if (!IsConstVectorInRange(Operands[i], 1, 1))
   28762       continue;
   28763     std::swap(Operands[i], Operands[2]);
   28764 
   28765     // Check if Operands[0] and Operands[1] are results of type promotion.
   28766     for (int j = 0; j < 2; ++j)
   28767       if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
   28768           Operands[j].getOperand(0).getValueType() != VT)
   28769         return SDValue();
   28770 
   28771     // The pattern is detected, emit X86ISD::AVG instruction.
   28772     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
   28773                        Operands[1].getOperand(0));
   28774   }
   28775 
   28776   return SDValue();
   28777 }
   28778 
   28779 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
   28780                            TargetLowering::DAGCombinerInfo &DCI,
   28781                            const X86Subtarget &Subtarget) {
   28782   LoadSDNode *Ld = cast<LoadSDNode>(N);
   28783   EVT RegVT = Ld->getValueType(0);
   28784   EVT MemVT = Ld->getMemoryVT();
   28785   SDLoc dl(Ld);
   28786   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   28787 
   28788   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
   28789   // into two 16-byte operations.
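           // For example, on such chips an unaligned v8f32 load becomes two v4f32
           // loads whose results are recombined with insert128BitVector.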
   28790   ISD::LoadExtType Ext = Ld->getExtensionType();
   28791   bool Fast;
   28792   unsigned AddressSpace = Ld->getAddressSpace();
   28793   unsigned Alignment = Ld->getAlignment();
   28794   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
   28795       Ext == ISD::NON_EXTLOAD &&
   28796       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
   28797                              AddressSpace, Alignment, &Fast) && !Fast) {
   28798     unsigned NumElems = RegVT.getVectorNumElements();
   28799     if (NumElems < 2)
   28800       return SDValue();
   28801 
   28802     SDValue Ptr = Ld->getBasePtr();
   28803 
   28804     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   28805                                   NumElems/2);
   28806     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   28807                                 Ld->getPointerInfo(), Ld->isVolatile(),
   28808                                 Ld->isNonTemporal(), Ld->isInvariant(),
   28809                                 Alignment);
   28810 
   28811     Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
   28812     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   28813                                 Ld->getPointerInfo(), Ld->isVolatile(),
   28814                                 Ld->isNonTemporal(), Ld->isInvariant(),
   28815                                 std::min(16U, Alignment));
   28816     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   28817                              Load1.getValue(1),
   28818                              Load2.getValue(1));
   28819 
   28820     SDValue NewVec = DAG.getUNDEF(RegVT);
   28821     NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
   28822     NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
   28823     return DCI.CombineTo(N, NewVec, TF, true);
   28824   }
   28825 
   28826   return SDValue();
   28827 }
   28828 
   28829 /// If V is a build vector of boolean constants and exactly one of those
   28830 /// constants is true, return the operand index of that true element.
   28831 /// Otherwise, return -1.
   28832 static int getOneTrueElt(SDValue V) {
   28833   // This needs to be a build vector of booleans.
   28834   // TODO: Checking for the i1 type matches the IR definition for the mask,
   28835   // but the mask check could be loosened to i8 or other types. That might
   28836   // also require checking more than 'allOnesValue'; eg, the x86 HW
   28837   // instructions only require that the MSB is set for each mask element.
   28838   // The ISD::MSTORE comments/definition do not specify how the mask operand
   28839   // is formatted.
   28840   auto *BV = dyn_cast<BuildVectorSDNode>(V);
   28841   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
   28842     return -1;
   28843 
   28844   int TrueIndex = -1;
   28845   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
   28846   for (unsigned i = 0; i < NumElts; ++i) {
   28847     const SDValue &Op = BV->getOperand(i);
   28848     if (Op.isUndef())
   28849       continue;
   28850     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
   28851     if (!ConstNode)
   28852       return -1;
   28853     if (ConstNode->getAPIntValue().isAllOnesValue()) {
   28854       // If we already found a one, this is too many.
   28855       if (TrueIndex >= 0)
   28856         return -1;
   28857       TrueIndex = i;
   28858     }
   28859   }
   28860   return TrueIndex;
   28861 }
   28862 
   28863 /// Given a masked memory load/store operation, return true if it has one mask
   28864 /// bit set. If it has one mask bit set, then also return the memory address of
   28865 /// the scalar element to load/store, the vector index to insert/extract that
   28866 /// scalar element, and the alignment for the scalar memory access.
   28867 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
   28868                                          SelectionDAG &DAG, SDValue &Addr,
   28869                                          SDValue &Index, unsigned &Alignment) {
   28870   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
   28871   if (TrueMaskElt < 0)
   28872     return false;
   28873 
   28874   // Get the address of the one scalar element that is specified by the mask
   28875   // using the appropriate offset from the base pointer.
   28876   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
   28877   Addr = MaskedOp->getBasePtr();
   28878   if (TrueMaskElt != 0) {
   28879     unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
   28880     Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
   28881   }
   28882 
   28883   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
   28884   Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
   28885   return true;
   28886 }
   28887 
   28888 /// If exactly one element of the mask is set for a non-extending masked load,
   28889 /// it can be reduced to a scalar load and a vector insert.
   28890 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
   28891 /// mask have already been optimized in IR, so we don't bother with those here.
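         /// For example, a v4f32 masked load with mask <0,0,1,0> becomes an f32 load
         /// from BasePtr + 8 whose result is inserted into element 2 of the
         /// pass-through vector.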
   28892 static SDValue
   28893 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   28894                              TargetLowering::DAGCombinerInfo &DCI) {
   28895   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   28896   // However, some target hooks may need to be added to know when the transform
   28897   // is profitable. Endianness would also have to be considered.
   28898 
   28899   SDValue Addr, VecIndex;
   28900   unsigned Alignment;
   28901   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
   28902     return SDValue();
   28903 
   28904   // Load the one scalar element that is specified by the mask using the
   28905   // appropriate offset from the base pointer.
   28906   SDLoc DL(ML);
   28907   EVT VT = ML->getValueType(0);
   28908   EVT EltVT = VT.getVectorElementType();
   28909   SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
   28910                              ML->getPointerInfo(), ML->isVolatile(),
   28911                              ML->isNonTemporal(), ML->isInvariant(), Alignment);
   28912 
   28913   // Insert the loaded element into the appropriate place in the vector.
   28914   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
   28915                                Load, VecIndex);
   28916   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
   28917 }
   28918 
   28919 static SDValue
   28920 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   28921                               TargetLowering::DAGCombinerInfo &DCI) {
   28922   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
   28923     return SDValue();
   28924 
   28925   SDLoc DL(ML);
   28926   EVT VT = ML->getValueType(0);
   28927 
   28928   // If we are loading the first and last elements of a vector, it is safe and
   28929   // always faster to load the whole vector. Replace the masked load with a
   28930   // vector load and select.
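  // For example (hypothetical operands), a v8f32 masked load with constant
  // mask <1,0,0,0,0,0,0,1> would become a plain v8f32 load followed by a
  // vselect of the loaded value and the original pass-through operand.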
   28931   unsigned NumElts = VT.getVectorNumElements();
   28932   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
   28933   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
   28934   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
   28935   if (LoadFirstElt && LoadLastElt) {
   28936     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
   28937                                 ML->getMemOperand());
   28938     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
   28939     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
   28940   }
   28941 
   28942   // Convert a masked load with a constant mask into a masked load and a select.
   28943   // This allows the select operation to use a faster kind of select instruction
   28944   // (for example, vblendvps -> vblendps).
   28945 
   28946   // Don't try this if the pass-through operand is already undefined. That would
   28947   // cause an infinite loop because that's what we're about to create.
   28948   if (ML->getSrc0().isUndef())
   28949     return SDValue();
   28950 
   28951   // The new masked load has an undef pass-through operand. The select uses the
   28952   // original pass-through operand.
   28953   SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
   28954                                     ML->getMask(), DAG.getUNDEF(VT),
   28955                                     ML->getMemoryVT(), ML->getMemOperand(),
   28956                                     ML->getExtensionType());
   28957   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
   28958 
   28959   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
   28960 }
   28961 
   28962 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
   28963                                  TargetLowering::DAGCombinerInfo &DCI,
   28964                                  const X86Subtarget &Subtarget) {
   28965   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
   28966   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
   28967     if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
   28968       return ScalarLoad;
   28969     // TODO: Do some AVX512 subsets benefit from this transform?
   28970     if (!Subtarget.hasAVX512())
   28971       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
   28972         return Blend;
   28973   }
   28974 
   28975   if (Mld->getExtensionType() != ISD::SEXTLOAD)
   28976     return SDValue();
   28977 
   28978   // Resolve extending loads.
   28979   EVT VT = Mld->getValueType(0);
   28980   unsigned NumElems = VT.getVectorNumElements();
   28981   EVT LdVT = Mld->getMemoryVT();
   28982   SDLoc dl(Mld);
   28983 
   28984   assert(LdVT != VT && "Cannot extend to the same type");
   28985   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
   28986   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
   28987   // From/To sizes and ElemCount must be pow of two.
   28988   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
   28989     "Unexpected size for extending masked load");
   28990 
   28991   unsigned SizeRatio  = ToSz / FromSz;
   28992   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
   28993 
   28994   // Create a type on which we perform the shuffle.
   28995   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   28996           LdVT.getScalarType(), NumElems*SizeRatio);
   28997   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   28998 
   28999   // Convert Src0 value.
   29000   SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
   29001   if (!Mld->getSrc0().isUndef()) {
   29002     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   29003     for (unsigned i = 0; i != NumElems; ++i)
   29004       ShuffleVec[i] = i * SizeRatio;
   29005 
   29006     // Can't shuffle using an illegal type.
   29007     assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
   29008            "WideVecVT should be legal");
   29009     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
   29010                                     DAG.getUNDEF(WideVecVT), ShuffleVec);
   29011   }
   29012   // Prepare the new mask.
   29013   SDValue NewMask;
   29014   SDValue Mask = Mld->getMask();
   29015   if (Mask.getValueType() == VT) {
   29016     // Mask and original value have the same type.
   29017     NewMask = DAG.getBitcast(WideVecVT, Mask);
   29018     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   29019     for (unsigned i = 0; i != NumElems; ++i)
   29020       ShuffleVec[i] = i * SizeRatio;
   29021     for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
   29022       ShuffleVec[i] = NumElems * SizeRatio;
   29023     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
   29024                                    DAG.getConstant(0, dl, WideVecVT),
   29025                                    ShuffleVec);
   29026   } else {
   29027     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
   29028     unsigned WidenNumElts = NumElems*SizeRatio;
   29029     unsigned MaskNumElts = VT.getVectorNumElements();
   29030     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
   29031                                      WidenNumElts);
   29032 
   29033     unsigned NumConcat = WidenNumElts / MaskNumElts;
   29034     SmallVector<SDValue, 16> Ops(NumConcat);
   29035     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
   29036     Ops[0] = Mask;
   29037     for (unsigned i = 1; i != NumConcat; ++i)
   29038       Ops[i] = ZeroVal;
   29039 
   29040     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   29041   }
   29042 
   29043   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
   29044                                      Mld->getBasePtr(), NewMask, WideSrc0,
   29045                                      Mld->getMemoryVT(), Mld->getMemOperand(),
   29046                                      ISD::NON_EXTLOAD);
   29047   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
   29048   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
   29049 }
   29050 
/// If exactly one element of the mask is set for a non-truncating masked
/// store, reduce it to a vector extract and a scalar store.
   29053 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
   29054 /// mask have already been optimized in IR, so we don't bother with those here.
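/// A sketch of the transform with hypothetical operands:
///   (masked_store v4f32 %val, %p, mask <0,1,0,0>)
///     --> (store (extract_vector_elt %val, 1), %p + 4)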
   29055 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
   29056                                               SelectionDAG &DAG) {
   29057   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   29058   // However, some target hooks may need to be added to know when the transform
   29059   // is profitable. Endianness would also have to be considered.
   29060 
   29061   SDValue Addr, VecIndex;
   29062   unsigned Alignment;
   29063   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
   29064     return SDValue();
   29065 
   29066   // Extract the one scalar element that is actually being stored.
   29067   SDLoc DL(MS);
   29068   EVT VT = MS->getValue().getValueType();
   29069   EVT EltVT = VT.getVectorElementType();
   29070   SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
   29071                                 MS->getValue(), VecIndex);
   29072 
   29073   // Store that element at the appropriate offset from the base pointer.
   29074   return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
   29075                       MS->isVolatile(), MS->isNonTemporal(), Alignment);
   29076 }
   29077 
   29078 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
   29079                                   const X86Subtarget &Subtarget) {
   29080   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
   29081   if (!Mst->isTruncatingStore())
   29082     return reduceMaskedStoreToScalarStore(Mst, DAG);
   29083 
   29084   // Resolve truncating stores.
   29085   EVT VT = Mst->getValue().getValueType();
   29086   unsigned NumElems = VT.getVectorNumElements();
   29087   EVT StVT = Mst->getMemoryVT();
   29088   SDLoc dl(Mst);
   29089 
   29090   assert(StVT != VT && "Cannot truncate to the same type");
   29091   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
   29092   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
   29093 
   29094   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   29095 
  // The truncating store is legal in some cases. For example,
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
  // are designed for truncating stores.
  // In those cases we don't need any further transformations.
   29100   if (TLI.isTruncStoreLegal(VT, StVT))
   29101     return SDValue();
   29102 
   29103   // From/To sizes and ElemCount must be pow of two.
   29104   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
   29105     "Unexpected size for truncating masked store");
   29106   // We are going to use the original vector elt for storing.
   29107   // Accumulated smaller vector elements must be a multiple of the store size.
   29108   assert (((NumElems * FromSz) % ToSz) == 0 &&
   29109           "Unexpected ratio for truncating masked store");
   29110 
   29111   unsigned SizeRatio  = FromSz / ToSz;
   29112   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   29113 
   29114   // Create a type on which we perform the shuffle.
   29115   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   29116           StVT.getScalarType(), NumElems*SizeRatio);
   29117 
   29118   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   29119 
   29120   SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
   29121   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   29122   for (unsigned i = 0; i != NumElems; ++i)
   29123     ShuffleVec[i] = i * SizeRatio;
   29124 
   29125   // Can't shuffle using an illegal type.
   29126   assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
   29127          "WideVecVT should be legal");
   29128 
   29129   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   29130                                               DAG.getUNDEF(WideVecVT),
   29131                                               ShuffleVec);
   29132 
   29133   SDValue NewMask;
   29134   SDValue Mask = Mst->getMask();
   29135   if (Mask.getValueType() == VT) {
   29136     // Mask and original value have the same type.
   29137     NewMask = DAG.getBitcast(WideVecVT, Mask);
   29138     for (unsigned i = 0; i != NumElems; ++i)
   29139       ShuffleVec[i] = i * SizeRatio;
   29140     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
   29141       ShuffleVec[i] = NumElems*SizeRatio;
   29142     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
   29143                                    DAG.getConstant(0, dl, WideVecVT),
   29144                                    ShuffleVec);
   29145   } else {
   29146     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
   29147     unsigned WidenNumElts = NumElems*SizeRatio;
   29148     unsigned MaskNumElts = VT.getVectorNumElements();
   29149     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
   29150                                      WidenNumElts);
   29151 
   29152     unsigned NumConcat = WidenNumElts / MaskNumElts;
   29153     SmallVector<SDValue, 16> Ops(NumConcat);
   29154     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
   29155     Ops[0] = Mask;
   29156     for (unsigned i = 1; i != NumConcat; ++i)
   29157       Ops[i] = ZeroVal;
   29158 
   29159     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   29160   }
   29161 
   29162   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
   29163                             Mst->getBasePtr(), NewMask, StVT,
   29164                             Mst->getMemOperand(), false);
   29165 }
   29166 
   29167 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   29168                             const X86Subtarget &Subtarget) {
   29169   StoreSDNode *St = cast<StoreSDNode>(N);
   29170   EVT VT = St->getValue().getValueType();
   29171   EVT StVT = St->getMemoryVT();
   29172   SDLoc dl(St);
   29173   SDValue StoredVal = St->getOperand(1);
   29174   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   29175 
   29176   // If we are saving a concatenation of two XMM registers and 32-byte stores
   29177   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
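  // For example (hypothetical operands), a 32-byte store of a v8f32 value to
  // %p would become two 16-byte stores: the low v4f32 half to %p and the high
  // v4f32 half to %p + 16.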
   29178   bool Fast;
   29179   unsigned AddressSpace = St->getAddressSpace();
   29180   unsigned Alignment = St->getAlignment();
   29181   if (VT.is256BitVector() && StVT == VT &&
   29182       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
   29183                              AddressSpace, Alignment, &Fast) &&
   29184       !Fast) {
   29185     unsigned NumElems = VT.getVectorNumElements();
   29186     if (NumElems < 2)
   29187       return SDValue();
   29188 
   29189     SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
   29190     SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
   29191 
   29192     SDValue Ptr0 = St->getBasePtr();
   29193     SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
   29194 
   29195     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
   29196                                St->getPointerInfo(), St->isVolatile(),
   29197                                St->isNonTemporal(), Alignment);
   29198     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
   29199                                St->getPointerInfo(), St->isVolatile(),
   29200                                St->isNonTemporal(),
   29201                                std::min(16U, Alignment));
   29202     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   29203   }
   29204 
   29205   // Optimize trunc store (of multiple scalars) to shuffle and store.
   29206   // First, pack all of the elements in one place. Next, store to memory
   29207   // in fewer chunks.
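  // A rough sketch with hypothetical types: a truncating store of v8i32 to
  // v8i16 memory would bitcast the value to v16i16, shuffle the eight
  // truncated elements into the low lanes, and then write those 128 bits with
  // one or more stores of the widest legal store unit.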
   29208   if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the truncating store with a normal store of the result of the
    // X86ISD::AVG instruction.
   29212     if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
   29213                                        Subtarget, dl))
   29214       return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
   29215                           St->getPointerInfo(), St->isVolatile(),
   29216                           St->isNonTemporal(), St->getAlignment());
   29217 
   29218     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   29219     unsigned NumElems = VT.getVectorNumElements();
   29220     assert(StVT != VT && "Cannot truncate to the same type");
   29221     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
   29222     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
   29223 
    // The truncating store is legal in some cases. For example,
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
    // are designed for truncating stores.
    // In those cases we don't need any further transformations.
   29228     if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
   29229       return SDValue();
   29230 
    // From/To sizes and ElemCount must be a power of two.
   29232     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
   29233     // We are going to use the original vector elt for storing.
   29234     // Accumulated smaller vector elements must be a multiple of the store size.
   29235     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
   29236 
   29237     unsigned SizeRatio  = FromSz / ToSz;
   29238 
   29239     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   29240 
   29241     // Create a type on which we perform the shuffle
   29242     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   29243             StVT.getScalarType(), NumElems*SizeRatio);
   29244 
   29245     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   29246 
   29247     SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
   29248     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   29249     for (unsigned i = 0; i != NumElems; ++i)
   29250       ShuffleVec[i] = i * SizeRatio;
   29251 
   29252     // Can't shuffle using an illegal type.
   29253     if (!TLI.isTypeLegal(WideVecVT))
   29254       return SDValue();
   29255 
   29256     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   29257                                          DAG.getUNDEF(WideVecVT),
   29258                                          ShuffleVec);
   29259     // At this point all of the data is stored at the bottom of the
   29260     // register. We now need to save it to mem.
   29261 
   29262     // Find the largest store unit
   29263     MVT StoreType = MVT::i8;
   29264     for (MVT Tp : MVT::integer_valuetypes()) {
   29265       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
   29266         StoreType = Tp;
   29267     }
   29268 
    // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
   29270     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
   29271         (64 <= NumElems * ToSz))
   29272       StoreType = MVT::f64;
   29273 
   29274     // Bitcast the original vector into a vector of store-size units
   29275     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
   29276             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
   29277     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
   29278     SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
   29279     SmallVector<SDValue, 8> Chains;
   29280     SDValue Ptr = St->getBasePtr();
   29281 
   29282     // Perform one or more big stores into memory.
   29283     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
   29284       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   29285                                    StoreType, ShuffWide,
   29286                                    DAG.getIntPtrConstant(i, dl));
   29287       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
   29288                                 St->getPointerInfo(), St->isVolatile(),
   29289                                 St->isNonTemporal(), St->getAlignment());
   29290       Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
   29291       Chains.push_back(Ch);
   29292     }
   29293 
   29294     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   29295   }
   29296 
   29297   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   29298   // the FP state in cases where an emms may be missing.
   29299   // A preferable solution to the general problem is to figure out the right
   29300   // places to insert EMMS.  This qualifies as a quick hack.
   29301 
   29302   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
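  // A sketch of the i64 case on a 32-bit target (hypothetical operands):
  // (store i64 (load i64 %p), %q) becomes an f64 load from %p and an f64
  // store to %q when SSE2 is available, and two i32 load/store pairs
  // otherwise.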
   29303   if (VT.getSizeInBits() != 64)
   29304     return SDValue();
   29305 
   29306   const Function *F = DAG.getMachineFunction().getFunction();
   29307   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
   29308   bool F64IsLegal =
   29309       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
   29310   if ((VT.isVector() ||
   29311        (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
   29312       isa<LoadSDNode>(St->getValue()) &&
   29313       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
   29314       St->getChain().hasOneUse() && !St->isVolatile()) {
   29315     SDNode* LdVal = St->getValue().getNode();
   29316     LoadSDNode *Ld = nullptr;
   29317     int TokenFactorIndex = -1;
   29318     SmallVector<SDValue, 8> Ops;
   29319     SDNode* ChainVal = St->getChain().getNode();
   29320     // Must be a store of a load.  We currently handle two cases:  the load
   29321     // is a direct child, and it's under an intervening TokenFactor.  It is
   29322     // possible to dig deeper under nested TokenFactors.
   29323     if (ChainVal == LdVal)
   29324       Ld = cast<LoadSDNode>(St->getChain());
   29325     else if (St->getValue().hasOneUse() &&
   29326              ChainVal->getOpcode() == ISD::TokenFactor) {
   29327       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
   29328         if (ChainVal->getOperand(i).getNode() == LdVal) {
   29329           TokenFactorIndex = i;
   29330           Ld = cast<LoadSDNode>(St->getValue());
   29331         } else
   29332           Ops.push_back(ChainVal->getOperand(i));
   29333       }
   29334     }
   29335 
   29336     if (!Ld || !ISD::isNormalLoad(Ld))
   29337       return SDValue();
   29338 
   29339     // If this is not the MMX case, i.e. we are just turning i64 load/store
   29340     // into f64 load/store, avoid the transformation if there are multiple
   29341     // uses of the loaded value.
   29342     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
   29343       return SDValue();
   29344 
   29345     SDLoc LdDL(Ld);
   29346     SDLoc StDL(N);
    // If we are on a 64-bit capable x86, lower to a single movq load/store
    // pair. Otherwise, if it's legal to use f64 SSE instructions, use an f64
    // load/store pair instead.
   29350     if (Subtarget.is64Bit() || F64IsLegal) {
   29351       MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
   29352       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
   29353                                   Ld->getPointerInfo(), Ld->isVolatile(),
   29354                                   Ld->isNonTemporal(), Ld->isInvariant(),
   29355                                   Ld->getAlignment());
   29356       SDValue NewChain = NewLd.getValue(1);
   29357       if (TokenFactorIndex >= 0) {
   29358         Ops.push_back(NewChain);
   29359         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
   29360       }
   29361       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
   29362                           St->getPointerInfo(),
   29363                           St->isVolatile(), St->isNonTemporal(),
   29364                           St->getAlignment());
   29365     }
   29366 
   29367     // Otherwise, lower to two pairs of 32-bit loads / stores.
   29368     SDValue LoAddr = Ld->getBasePtr();
   29369     SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
   29370 
   29371     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
   29372                                Ld->getPointerInfo(),
   29373                                Ld->isVolatile(), Ld->isNonTemporal(),
   29374                                Ld->isInvariant(), Ld->getAlignment());
   29375     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
   29376                                Ld->getPointerInfo().getWithOffset(4),
   29377                                Ld->isVolatile(), Ld->isNonTemporal(),
   29378                                Ld->isInvariant(),
   29379                                MinAlign(Ld->getAlignment(), 4));
   29380 
   29381     SDValue NewChain = LoLd.getValue(1);
   29382     if (TokenFactorIndex >= 0) {
   29383       Ops.push_back(LoLd);
   29384       Ops.push_back(HiLd);
   29385       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
   29386     }
   29387 
   29388     LoAddr = St->getBasePtr();
   29389     HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
   29390 
   29391     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
   29392                                 St->getPointerInfo(),
   29393                                 St->isVolatile(), St->isNonTemporal(),
   29394                                 St->getAlignment());
   29395     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
   29396                                 St->getPointerInfo().getWithOffset(4),
   29397                                 St->isVolatile(),
   29398                                 St->isNonTemporal(),
   29399                                 MinAlign(St->getAlignment(), 4));
   29400     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   29401   }
   29402 
   29403   // This is similar to the above case, but here we handle a scalar 64-bit
   29404   // integer store that is extracted from a vector on a 32-bit target.
   29405   // If we have SSE2, then we can treat it like a floating-point double
   29406   // to get past legalization. The execution dependencies fixup pass will
   29407   // choose the optimal machine instruction for the store if this really is
   29408   // an integer or v2f32 rather than an f64.
   29409   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
   29410       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   29411     SDValue OldExtract = St->getOperand(1);
   29412     SDValue ExtOp0 = OldExtract.getOperand(0);
   29413     unsigned VecSize = ExtOp0.getValueSizeInBits();
   29414     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
   29415     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
   29416     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   29417                                      BitCast, OldExtract.getOperand(1));
   29418     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
   29419                         St->getPointerInfo(), St->isVolatile(),
   29420                         St->isNonTemporal(), St->getAlignment());
   29421   }
   29422 
   29423   return SDValue();
   29424 }
   29425 
   29426 /// Return 'true' if this vector operation is "horizontal"
   29427 /// and return the operands for the horizontal operation in LHS and RHS.  A
   29428 /// horizontal operation performs the binary operation on successive elements
   29429 /// of its first operand, then on successive elements of its second operand,
   29430 /// returning the resulting values in a vector.  For example, if
   29431 ///   A = < float a0, float a1, float a2, float a3 >
   29432 /// and
   29433 ///   B = < float b0, float b1, float b2, float b3 >
   29434 /// then the result of doing a horizontal operation on A and B is
   29435 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
   29436 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
   29437 /// A horizontal-op B, for some already available A and B, and if so then LHS is
   29438 /// set to A, RHS to B, and the routine returns 'true'.
   29439 /// Note that the binary operation should have the property that if one of the
   29440 /// operands is UNDEF then the result is UNDEF.
   29441 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   29442   // Look for the following pattern: if
   29443   //   A = < float a0, float a1, float a2, float a3 >
   29444   //   B = < float b0, float b1, float b2, float b3 >
   29445   // and
   29446   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
   29447   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
   29448   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   29449   // which is A horizontal-op B.
   29450 
   29451   // At least one of the operands should be a vector shuffle.
   29452   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
   29453       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
   29454     return false;
   29455 
   29456   MVT VT = LHS.getSimpleValueType();
   29457 
   29458   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   29459          "Unsupported vector type for horizontal add/sub");
   29460 
   29461   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
   29462   // operate independently on 128-bit lanes.
   29463   unsigned NumElts = VT.getVectorNumElements();
   29464   unsigned NumLanes = VT.getSizeInBits()/128;
   29465   unsigned NumLaneElts = NumElts / NumLanes;
   29466   assert((NumLaneElts % 2 == 0) &&
   29467          "Vector type should have an even number of elements in each lane");
   29468   unsigned HalfLaneElts = NumLaneElts/2;
   29469 
   29470   // View LHS in the form
   29471   //   LHS = VECTOR_SHUFFLE A, B, LMask
   29472   // If LHS is not a shuffle then pretend it is the shuffle
   29473   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
   29474   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
   29475   // type VT.
   29476   SDValue A, B;
   29477   SmallVector<int, 16> LMask(NumElts);
   29478   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   29479     if (!LHS.getOperand(0).isUndef())
   29480       A = LHS.getOperand(0);
   29481     if (!LHS.getOperand(1).isUndef())
   29482       B = LHS.getOperand(1);
   29483     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
   29484     std::copy(Mask.begin(), Mask.end(), LMask.begin());
   29485   } else {
   29486     if (!LHS.isUndef())
   29487       A = LHS;
   29488     for (unsigned i = 0; i != NumElts; ++i)
   29489       LMask[i] = i;
   29490   }
   29491 
   29492   // Likewise, view RHS in the form
   29493   //   RHS = VECTOR_SHUFFLE C, D, RMask
   29494   SDValue C, D;
   29495   SmallVector<int, 16> RMask(NumElts);
   29496   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   29497     if (!RHS.getOperand(0).isUndef())
   29498       C = RHS.getOperand(0);
   29499     if (!RHS.getOperand(1).isUndef())
   29500       D = RHS.getOperand(1);
   29501     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
   29502     std::copy(Mask.begin(), Mask.end(), RMask.begin());
   29503   } else {
   29504     if (!RHS.isUndef())
   29505       C = RHS;
   29506     for (unsigned i = 0; i != NumElts; ++i)
   29507       RMask[i] = i;
   29508   }
   29509 
   29510   // Check that the shuffles are both shuffling the same vectors.
   29511   if (!(A == C && B == D) && !(A == D && B == C))
   29512     return false;
   29513 
   29514   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
   29515   if (!A.getNode() && !B.getNode())
   29516     return false;
   29517 
   29518   // If A and B occur in reverse order in RHS, then "swap" them (which means
   29519   // rewriting the mask).
   29520   if (A != C)
   29521     ShuffleVectorSDNode::commuteMask(RMask);
   29522 
   29523   // At this point LHS and RHS are equivalent to
   29524   //   LHS = VECTOR_SHUFFLE A, B, LMask
   29525   //   RHS = VECTOR_SHUFFLE A, B, RMask
   29526   // Check that the masks correspond to performing a horizontal operation.
   29527   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   29528     for (unsigned i = 0; i != NumLaneElts; ++i) {
   29529       int LIdx = LMask[i+l], RIdx = RMask[i+l];
   29530 
   29531       // Ignore any UNDEF components.
   29532       if (LIdx < 0 || RIdx < 0 ||
   29533           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
   29534           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
   29535         continue;
   29536 
   29537       // Check that successive elements are being operated on.  If not, this is
   29538       // not a horizontal operation.
   29539       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
   29540       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
   29541       if (!(LIdx == Index && RIdx == Index + 1) &&
   29542           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
   29543         return false;
   29544     }
   29545   }
   29546 
   29547   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   29548   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
   29549   return true;
   29550 }
   29551 
   29552 /// Do target-specific dag combines on floating-point adds/subs.
   29553 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   29554                                const X86Subtarget &Subtarget) {
   29555   EVT VT = N->getValueType(0);
   29556   SDValue LHS = N->getOperand(0);
   29557   SDValue RHS = N->getOperand(1);
   29558   bool IsFadd = N->getOpcode() == ISD::FADD;
   29559   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
   29560 
   29561   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
   29562   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   29563        (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   29564       isHorizontalBinOp(LHS, RHS, IsFadd)) {
   29565     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
   29566     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
   29567   }
   29568   return SDValue();
   29569 }
   29570 
   29571 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
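/// For example (a hypothetical case, assuming the SSE4.1 PACKUSDW form is
/// available): truncating two v4i32 registers to a single v8i16 result masks
/// each register with 0xFFFF and then packs the pair with one X86ISD::PACKUS.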
   29572 static SDValue
   29573 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
   29574                                   SmallVector<SDValue, 8> &Regs) {
   29575   assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
   29576                              Regs[0].getValueType() == MVT::v2i64));
   29577   EVT OutVT = N->getValueType(0);
   29578   EVT OutSVT = OutVT.getVectorElementType();
   29579   EVT InVT = Regs[0].getValueType();
   29580   EVT InSVT = InVT.getVectorElementType();
   29581   SDLoc DL(N);
   29582 
   29583   // First, use mask to unset all bits that won't appear in the result.
   29584   assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
   29585          "OutSVT can only be either i8 or i16.");
   29586   APInt Mask =
   29587       APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
   29588   SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
   29589   for (auto &Reg : Regs)
   29590     Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
   29591 
   29592   MVT UnpackedVT, PackedVT;
   29593   if (OutSVT == MVT::i8) {
   29594     UnpackedVT = MVT::v8i16;
   29595     PackedVT = MVT::v16i8;
   29596   } else {
   29597     UnpackedVT = MVT::v4i32;
   29598     PackedVT = MVT::v8i16;
   29599   }
   29600 
  // In each iteration, halve the size of the element type.
   29602   auto RegNum = Regs.size();
   29603   for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
   29604        j < e; j *= 2, RegNum /= 2) {
   29605     for (unsigned i = 0; i < RegNum; i++)
   29606       Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
   29607     for (unsigned i = 0; i < RegNum / 2; i++)
   29608       Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
   29609                             Regs[i * 2 + 1]);
   29610   }
   29611 
  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS
  // and then extract a subvector as the result since v8i8 is not a legal type.
   29614   if (OutVT == MVT::v8i8) {
   29615     Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
   29616     Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
   29617                           DAG.getIntPtrConstant(0, DL));
   29618     return Regs[0];
   29619   } else if (RegNum > 1) {
   29620     Regs.resize(RegNum);
   29621     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
   29622   } else
   29623     return Regs[0];
   29624 }
   29625 
   29626 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
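/// For example (a hypothetical pre-SSE4.1 case): truncating two v4i32
/// registers to one v8i16 result shifts each element left and then
/// arithmetically right by 16 bits, so each element becomes a sign-extension
/// of its low 16 bits, which X86ISD::PACKSS (packssdw) preserves exactly.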
   29627 static SDValue
   29628 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
   29629                                   SmallVector<SDValue, 8> &Regs) {
   29630   assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
   29631   EVT OutVT = N->getValueType(0);
   29632   SDLoc DL(N);
   29633 
   29634   // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
   29635   SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
   29636   for (auto &Reg : Regs) {
   29637     Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
   29638     Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
   29639   }
   29640 
   29641   for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
   29642     Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
   29643                           Regs[i * 2 + 1]);
   29644 
   29645   if (Regs.size() > 2) {
   29646     Regs.resize(Regs.size() / 2);
   29647     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
   29648   } else
   29649     return Regs[0];
   29650 }
   29651 
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR whose
/// elements are each extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on that form.
   29657 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
   29658                                        const X86Subtarget &Subtarget) {
   29659   EVT OutVT = N->getValueType(0);
   29660   if (!OutVT.isVector())
   29661     return SDValue();
   29662 
   29663   SDValue In = N->getOperand(0);
   29664   if (!In.getValueType().isSimple())
   29665     return SDValue();
   29666 
   29667   EVT InVT = In.getValueType();
   29668   unsigned NumElems = OutVT.getVectorNumElements();
   29669 
   29670   // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
   29671   // SSE2, and we need to take care of it specially.
   29672   // AVX512 provides vpmovdb.
   29673   if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
   29674     return SDValue();
   29675 
   29676   EVT OutSVT = OutVT.getVectorElementType();
   29677   EVT InSVT = InVT.getVectorElementType();
   29678   if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
   29679         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
   29680         NumElems >= 8))
   29681     return SDValue();
   29682 
  // SSSE3's pshufb results in fewer instructions in the cases below.
   29684   if (Subtarget.hasSSSE3() && NumElems == 8 &&
   29685       ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
   29686        (InSVT == MVT::i32 && OutSVT == MVT::i16)))
   29687     return SDValue();
   29688 
   29689   SDLoc DL(N);
   29690 
   29691   // Split a long vector into vectors of legal type.
   29692   unsigned RegNum = InVT.getSizeInBits() / 128;
   29693   SmallVector<SDValue, 8> SubVec(RegNum);
   29694   unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
   29695   EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
   29696 
   29697   for (unsigned i = 0; i < RegNum; i++)
   29698     SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
   29699                             DAG.getIntPtrConstant(i * NumSubRegElts, DL));
   29700 
   29701   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
   29702   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
   29703   // truncate 2 x v4i32 to v8i16.
   29704   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
   29705     return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
   29706   else if (InSVT == MVT::i32)
   29707     return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
   29708   else
   29709     return SDValue();
   29710 }
   29711 
   29712 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   29713                                const X86Subtarget &Subtarget) {
   29714   EVT VT = N->getValueType(0);
   29715   SDValue Src = N->getOperand(0);
   29716   SDLoc DL(N);
   29717 
   29718   // Try to detect AVG pattern first.
   29719   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
   29720     return Avg;
   29721 
  // The bitcast source is a direct mmx result.
  // Detect a truncation to i32 of a bitcast from x86mmx.
   29724   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
   29725     SDValue BCSrc = Src.getOperand(0);
   29726     if (BCSrc.getValueType() == MVT::x86mmx)
   29727       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   29728   }
   29729 
   29730   return combineVectorTruncation(N, DAG, Subtarget);
   29731 }
   29732 
   29733 /// Do target-specific dag combines on floating point negations.
   29734 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
   29735                            const X86Subtarget &Subtarget) {
   29736   EVT VT = N->getValueType(0);
   29737   EVT SVT = VT.getScalarType();
   29738   SDValue Arg = N->getOperand(0);
   29739   SDLoc DL(N);
   29740 
   29741   // Let legalize expand this if it isn't a legal type yet.
   29742   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   29743     return SDValue();
   29744 
  // If we're negating an FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once they become available.
   29748   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
   29749       Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
   29750     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
   29751     return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
   29752                        Arg.getOperand(1), Zero);
   29753   }
   29754 
  // If we're negating an FMA node, then we can adjust the
  // instruction to include the extra negation.
   29757   if (Arg.hasOneUse()) {
   29758     switch (Arg.getOpcode()) {
   29759     case X86ISD::FMADD:
   29760       return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
   29761                          Arg.getOperand(1), Arg.getOperand(2));
   29762     case X86ISD::FMSUB:
   29763       return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
   29764                          Arg.getOperand(1), Arg.getOperand(2));
   29765     case X86ISD::FNMADD:
   29766       return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
   29767                          Arg.getOperand(1), Arg.getOperand(2));
   29768     case X86ISD::FNMSUB:
   29769       return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
   29770                          Arg.getOperand(1), Arg.getOperand(2));
   29771     }
   29772   }
   29773   return SDValue();
   29774 }
   29775 
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
   29778   EVT VT = N->getValueType(0);
   29779   if (VT.is512BitVector() && !Subtarget.hasDQI()) {
    // VXORPS, VORPS, VANDPS, VANDNPS are supported only with the DQ extension.
   29781     // These logic operations may be executed in the integer domain.
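    // For example (hypothetical operands), without DQI:
    //   (v16f32 X86ISD::FXOR %a, %b)
    //     --> (bitcast (xor (bitcast %a to v16i32), (bitcast %b to v16i32)))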
   29782     SDLoc dl(N);
   29783     MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
   29784     MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
   29785 
   29786     SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
   29787     SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
   29788     unsigned IntOpcode = 0;
   29789     switch (N->getOpcode()) {
   29790       default: llvm_unreachable("Unexpected FP logic op");
   29791       case X86ISD::FOR: IntOpcode = ISD::OR; break;
   29792       case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
   29793       case X86ISD::FAND: IntOpcode = ISD::AND; break;
   29794       case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
   29795     }
   29796     SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
   29797     return DAG.getBitcast(VT, IntOp);
   29798   }
   29799   return SDValue();
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
   29802 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
   29803                           const X86Subtarget &Subtarget) {
   29804   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
   29805 
   29806   // F[X]OR(0.0, x) -> x
   29807   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   29808     if (C->getValueAPF().isPosZero())
   29809       return N->getOperand(1);
   29810 
   29811   // F[X]OR(x, 0.0) -> x
   29812   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   29813     if (C->getValueAPF().isPosZero())
   29814       return N->getOperand(0);
   29815 
   29816   return lowerX86FPLogicOp(N, DAG, Subtarget);
   29817 }
   29818 
   29819 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
   29820 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
   29821   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
   29822 
   29823   // Only perform optimizations if UnsafeMath is used.
   29824   if (!DAG.getTarget().Options.UnsafeFPMath)
   29825     return SDValue();
   29826 
  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMAXC and FMINC, which are commutative operations.
   29829   unsigned NewOp = 0;
   29830   switch (N->getOpcode()) {
   29831     default: llvm_unreachable("unknown opcode");
   29832     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
   29833     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
   29834   }
   29835 
   29836   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
   29837                      N->getOperand(0), N->getOperand(1));
   29838 }
   29839 
   29840 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   29841                                      const X86Subtarget &Subtarget) {
   29842   if (Subtarget.useSoftFloat())
   29843     return SDValue();
   29844 
   29845   // TODO: Check for global or instruction-level "nnan". In that case, we
   29846   //       should be able to lower to FMAX/FMIN alone.
   29847   // TODO: If an operand is already known to be a NaN or not a NaN, this
   29848   //       should be an optional swap and FMAX/FMIN.
   29849 
   29850   EVT VT = N->getValueType(0);
   29851   if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
   29852         (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
   29853         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
   29854     return SDValue();
   29855 
   29856   // This takes at least 3 instructions, so favor a library call when operating
   29857   // on a scalar and minimizing code size.
   29858   if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
   29859     return SDValue();
   29860 
   29861   SDValue Op0 = N->getOperand(0);
   29862   SDValue Op1 = N->getOperand(1);
   29863   SDLoc DL(N);
   29864   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
   29865       DAG.getDataLayout(), *DAG.getContext(), VT);
   29866 
   29867   // There are 4 possibilities involving NaN inputs, and these are the required
   29868   // outputs:
   29869   //                   Op1
   29870   //               Num     NaN
   29871   //            ----------------
   29872   //       Num  |  Max  |  Op0 |
   29873   // Op0        ----------------
   29874   //       NaN  |  Op1  |  NaN |
   29875   //            ----------------
   29876   //
   29877   // The SSE FP max/min instructions were not designed for this case, but rather
   29878   // to implement:
   29879   //   Min = Op1 < Op0 ? Op1 : Op0
   29880   //   Max = Op1 > Op0 ? Op1 : Op0
   29881   //
   29882   // So they always return Op0 if either input is a NaN. However, we can still
   29883   // use those instructions for fmaxnum by selecting away a NaN input.
   29884 
   29885   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
   29886   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
   29887   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
   29888   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
   29889 
  // If Op0 is a NaN, select Op1. Otherwise, select the min/max result. If both
  // operands are NaN, the NaN value of Op1 is the result.
   29892   auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
   29893   return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
   29894 }
   29895 
   29896 /// Do target-specific dag combines on X86ISD::FAND nodes.
   29897 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
   29898                            const X86Subtarget &Subtarget) {
   29899   // FAND(0.0, x) -> 0.0
   29900   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   29901     if (C->getValueAPF().isPosZero())
   29902       return N->getOperand(0);
   29903 
   29904   // FAND(x, 0.0) -> 0.0
   29905   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   29906     if (C->getValueAPF().isPosZero())
   29907       return N->getOperand(1);
   29908 
   29909   return lowerX86FPLogicOp(N, DAG, Subtarget);
   29910 }
   29911 
/// Do target-specific dag combines on X86ISD::FANDN nodes.
   29913 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
   29914                             const X86Subtarget &Subtarget) {
   29915   // FANDN(0.0, x) -> x
   29916   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   29917     if (C->getValueAPF().isPosZero())
   29918       return N->getOperand(1);
   29919 
   29920   // FANDN(x, 0.0) -> 0.0
   29921   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   29922     if (C->getValueAPF().isPosZero())
   29923       return N->getOperand(1);
   29924 
   29925   return lowerX86FPLogicOp(N, DAG, Subtarget);
   29926 }
   29927 
   29928 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
   29929                          TargetLowering::DAGCombinerInfo &DCI) {
   29930   // BT ignores high bits in the bit index operand.
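  // For example (a hypothetical index), a 32-bit BT only demands the low five
  // bits of the index, so a mask such as (and %idx, 31) can be simplified
  // away by the demanded-bits logic below.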
   29931   SDValue Op1 = N->getOperand(1);
   29932   if (Op1.hasOneUse()) {
   29933     unsigned BitWidth = Op1.getValueSizeInBits();
   29934     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
   29935     APInt KnownZero, KnownOne;
   29936     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   29937                                           !DCI.isBeforeLegalizeOps());
   29938     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   29939     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
   29940         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
   29941       DCI.CommitTargetLoweringOpt(TLO);
   29942   }
   29943   return SDValue();
   29944 }
   29945 
   29946 static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
   29947   SDValue Op = peekThroughBitcasts(N->getOperand(0));
   29948   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
   29949   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
   29950       VT.getVectorElementType().getSizeInBits() ==
   29951       OpVT.getVectorElementType().getSizeInBits()) {
   29952     return DAG.getBitcast(VT, Op);
   29953   }
   29954   return SDValue();
   29955 }
   29956 
   29957 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
   29958                                       const X86Subtarget &Subtarget) {
   29959   EVT VT = N->getValueType(0);
   29960   if (!VT.isVector())
   29961     return SDValue();
   29962 
   29963   SDValue N0 = N->getOperand(0);
   29964   SDValue N1 = N->getOperand(1);
   29965   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
   29966   SDLoc dl(N);
   29967 
  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
  // since there is no sign-extended shift-right operation on a vector with
  // 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
   29973   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
   29974       N0.getOpcode() == ISD::SIGN_EXTEND)) {
   29975     SDValue N00 = N0.getOperand(0);
   29976 
    // EXTLOAD has a better solution on AVX2:
    // it may be replaced with an X86ISD::VSEXT node.
   29979     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
   29980       if (!ISD::isNormalLoad(N00.getNode()))
   29981         return SDValue();
   29982 
    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
   29987     }
   29988   }
   29989   return SDValue();
   29990 }
   29991 
   29992 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
   29993 /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
   29994 /// to combine math ops, use an LEA, or use a complex addressing mode. This can
   29995 /// eliminate extend, add, and shift instructions.
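/// A sketch with hypothetical values:
///   (i64 sext (i32 add_nsw %x, 12)) --> (i64 add_nsw (i64 sext %x), 12)
/// where the constant can then become the displacement of an LEA or of a
/// complex addressing mode.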
   29996 static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
   29997                                        const X86Subtarget &Subtarget) {
   29998   // TODO: This should be valid for other integer types.
   29999   EVT VT = Sext->getValueType(0);
   30000   if (VT != MVT::i64)
   30001     return SDValue();
   30002 
   30003   // We need an 'add nsw' feeding into the 'sext'.
   30004   SDValue Add = Sext->getOperand(0);
   30005   if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
   30006     return SDValue();
   30007 
   30008   // Having a constant operand to the 'add' ensures that we are not increasing
   30009   // the instruction count because the constant is extended for free below.
   30010   // A constant operand can also become the displacement field of an LEA.
   30011   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
   30012   if (!AddOp1)
   30013     return SDValue();
   30014 
   30015   // Don't make the 'add' bigger if there's no hope of combining it with some
   30016   // other 'add' or 'shl' instruction.
   30017   // TODO: It may be profitable to generate simpler LEA instructions in place
   30018   // of single 'add' instructions, but the cost model for selecting an LEA
   30019   // currently has a high threshold.
   30020   bool HasLEAPotential = false;
   30021   for (auto *User : Sext->uses()) {
   30022     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
   30023       HasLEAPotential = true;
   30024       break;
   30025     }
   30026   }
   30027   if (!HasLEAPotential)
   30028     return SDValue();
   30029 
   30030   // Everything looks good, so pull the 'sext' ahead of the 'add'.
   30031   int64_t AddConstant = AddOp1->getSExtValue();
   30032   SDValue AddOp0 = Add.getOperand(0);
   30033   SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
   30034   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
   30035 
   30036   // The wider add is guaranteed to not wrap because both operands are
   30037   // sign-extended.
   30038   SDNodeFlags Flags;
   30039   Flags.setNoSignedWrap(true);
   30040   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
   30041 }
   30042 
   30043 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
   30044 /// (i8,i32 ({s/u}divrem_{s/z}ext_hreg (i8 x, i8 y)))
   30045 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
   30046 /// extends from AH (which we otherwise need to do contortions to access).
   30047 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
   30048   SDValue N0 = N->getOperand(0);
   30049   auto OpcodeN = N->getOpcode();
   30050   auto OpcodeN0 = N0.getOpcode();
   30051   if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
   30052         (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
   30053     return SDValue();
   30054 
   30055   EVT VT = N->getValueType(0);
   30056   EVT InVT = N0.getValueType();
   30057   if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
   30058     return SDValue();
   30059 
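           // Build the hreg-extending divrem: result 0 is the i8 quotient and
           // result 1 is the remainder, already extended from AH to i32. Reroute
           // users of the old quotient and return the extended remainder.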
   30060   SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
   30061   auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
   30062                                                : X86ISD::UDIVREM8_ZEXT_HREG;
   30063   SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
   30064                           N0.getOperand(1));
   30065   DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
   30066   return R.getValue(1);
   30067 }
   30068 
   30069 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
   30070 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or
   30071 /// concatenating it with UNDEFs) into vectors of the same size as the target
   30072 /// type, which then extend only their lowest elements.
   30073 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
   30074                                           TargetLowering::DAGCombinerInfo &DCI,
   30075                                           const X86Subtarget &Subtarget) {
   30076   unsigned Opcode = N->getOpcode();
   30077   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
   30078     return SDValue();
   30079   if (!DCI.isBeforeLegalizeOps())
   30080     return SDValue();
   30081   if (!Subtarget.hasSSE2())
   30082     return SDValue();
   30083 
   30084   SDValue N0 = N->getOperand(0);
   30085   EVT VT = N->getValueType(0);
   30086   EVT SVT = VT.getScalarType();
   30087   EVT InVT = N0.getValueType();
   30088   EVT InSVT = InVT.getScalarType();
   30089 
   30090   // Input type must be a vector and we must be extending legal integer types.
   30091   if (!VT.isVector())
   30092     return SDValue();
   30093   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
   30094     return SDValue();
   30095   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
   30096     return SDValue();
   30097 
   30098   // On AVX2+ targets, if the input/output types are both legal then we will be
   30099   // able to use SIGN_EXTEND/ZERO_EXTEND directly.
   30100   if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
   30101       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
   30102     return SDValue();
   30103 
   30104   SDLoc DL(N);
   30105 
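           // Pad the vector N out to 'Size' bits by concatenating it with UNDEF
           // vectors; the original elements stay in the lowest lanes.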
   30106   auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
   30107     EVT InVT = N.getValueType();
   30108     EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
   30109                                  Size / InVT.getScalarSizeInBits());
   30110     SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
   30111                                   DAG.getUNDEF(InVT));
   30112     Opnds[0] = N;
   30113     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
   30114   };
   30115 
   30116   // If the target size is less than 128 bits, widen the input to a type that
   30117   // extends to 128 bits, extend that, and extract the original target vector.
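           // For example (illustrative): for (zext v2i16 x to v2i32), widen x to
           // v4i16 with undef high lanes, zero-extend that to v4i32, and extract
           // the low v2i32 subvector.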
   30118   if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
   30119     unsigned Scale = 128 / VT.getSizeInBits();
   30120     EVT ExVT =
   30121         EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
   30122     SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
   30123     SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
   30124     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
   30125                        DAG.getIntPtrConstant(0, DL));
   30126   }
   30127 
   30128   // If the target size is 128 bits (or 256 bits on an AVX2 target), convert
   30129   // to ISD::*_EXTEND_VECTOR_INREG, which ensures lowering to X86ISD::V*EXT.
   30130   // Also use this when SSE41 is unavailable, to let the legalizer do its job.
   30131   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
   30132       (VT.is256BitVector() && Subtarget.hasInt256())) {
   30133     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
   30134     return Opcode == ISD::SIGN_EXTEND
   30135                ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
   30136                : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
   30137   }
   30138 
   30139   // On pre-AVX2 targets, split into 128-bit nodes of
   30140   // ISD::*_EXTEND_VECTOR_INREG.
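           // For example (illustrative): on SSE4.1 without AVX2, (sext v8i16 to
           // v8i32) becomes two v4i32 SIGN_EXTEND_VECTOR_INREG nodes fed by the
           // widened low and high v4i16 halves, concatenated back into a v8i32.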
   30141   if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
   30142     unsigned NumVecs = VT.getSizeInBits() / 128;
   30143     unsigned NumSubElts = 128 / SVT.getSizeInBits();
   30144     EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
   30145     EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
   30146 
   30147     SmallVector<SDValue, 8> Opnds;
   30148     for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
   30149       SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
   30150                                    DAG.getIntPtrConstant(Offset, DL));
   30151       SrcVec = ExtendVecSize(DL, SrcVec, 128);
   30152       SrcVec = Opcode == ISD::SIGN_EXTEND
   30153                    ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
   30154                    : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
   30155       Opnds.push_back(SrcVec);
   30156     }
   30157     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
   30158   }
   30159 
   30160   return SDValue();
   30161 }
   30162 
   30163 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
   30164                            TargetLowering::DAGCombinerInfo &DCI,
   30165                            const X86Subtarget &Subtarget) {
   30166   SDValue N0 = N->getOperand(0);
   30167   EVT VT = N->getValueType(0);
   30168   EVT InVT = N0.getValueType();
   30169   SDLoc DL(N);
   30170 
   30171   if (SDValue DivRem8 = getDivRem8(N, DAG))
   30172     return DivRem8;
   30173 
   30174   if (!DCI.isBeforeLegalizeOps()) {
   30175     if (InVT == MVT::i1) {
   30176       SDValue Zero = DAG.getConstant(0, DL, VT);
   30177       SDValue AllOnes =
   30178           DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
   30179       return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
   30180     }
   30181     return SDValue();
   30182   }
   30183 
   30184   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
   30185     return V;
   30186 
   30187   if (Subtarget.hasAVX() && VT.is256BitVector())
   30188     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
   30189       return R;
   30190 
   30191   if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
   30192     return NewAdd;
   30193 
   30194   return SDValue();
   30195 }
   30196 
   30197 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
   30198                           const X86Subtarget &Subtarget) {
   30199   SDLoc dl(N);
   30200   EVT VT = N->getValueType(0);
   30201 
   30202   // Let legalize expand this if it isn't a legal type yet.
   30203   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   30204     return SDValue();
   30205 
   30206   EVT ScalarVT = VT.getScalarType();
   30207   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
   30208     return SDValue();
   30209 
   30210   SDValue A = N->getOperand(0);
   30211   SDValue B = N->getOperand(1);
   30212   SDValue C = N->getOperand(2);
   30213 
   30214   bool NegA = (A.getOpcode() == ISD::FNEG);
   30215   bool NegB = (B.getOpcode() == ISD::FNEG);
   30216   bool NegC = (C.getOpcode() == ISD::FNEG);
   30217 
   30218   // The multiplication is negated when exactly one of NegA and NegB is set.
   30219   bool NegMul = (NegA != NegB);
   30220   if (NegA)
   30221     A = A.getOperand(0);
   30222   if (NegB)
   30223     B = B.getOperand(0);
   30224   if (NegC)
   30225     C = C.getOperand(0);
   30226 
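           // For example (illustrative): (fma (fneg a) b (fneg c)) has NegMul and
           // NegC set, so it is emitted as X86ISD::FNMSUB a, b, c.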
   30227   unsigned Opcode;
   30228   if (!NegMul)
   30229     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
   30230   else
   30231     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
   30232 
   30233   return DAG.getNode(Opcode, dl, VT, A, B, C);
   30234 }
   30235 
   30236 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
   30237                            TargetLowering::DAGCombinerInfo &DCI,
   30238                            const X86Subtarget &Subtarget) {
   30239   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
   30240   //           (and (i32 x86isd::setcc_carry), 1)
   30241   // This eliminates the zext. This transformation is necessary because
   30242   // ISD::SETCC is always legalized to i8.
   30243   SDLoc dl(N);
   30244   SDValue N0 = N->getOperand(0);
   30245   EVT VT = N->getValueType(0);
   30246 
   30247   if (N0.getOpcode() == ISD::AND &&
   30248       N0.hasOneUse() &&
   30249       N0.getOperand(0).hasOneUse()) {
   30250     SDValue N00 = N0.getOperand(0);
   30251     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   30252       if (!isOneConstant(N0.getOperand(1)))
   30253         return SDValue();
   30254       return DAG.getNode(ISD::AND, dl, VT,
   30255                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   30256                                      N00.getOperand(0), N00.getOperand(1)),
   30257                          DAG.getConstant(1, dl, VT));
   30258     }
   30259   }
   30260 
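           // Likewise for (zext (trunc (setcc_carry))): rematerialize the carry
           // directly in the wider type and mask it down to the low bit.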
   30261   if (N0.getOpcode() == ISD::TRUNCATE &&
   30262       N0.hasOneUse() &&
   30263       N0.getOperand(0).hasOneUse()) {
   30264     SDValue N00 = N0.getOperand(0);
   30265     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   30266       return DAG.getNode(ISD::AND, dl, VT,
   30267                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   30268                                      N00.getOperand(0), N00.getOperand(1)),
   30269                          DAG.getConstant(1, dl, VT));
   30270     }
   30271   }
   30272 
   30273   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
   30274     return V;
   30275 
   30276   if (VT.is256BitVector())
   30277     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
   30278       return R;
   30279 
   30280   if (SDValue DivRem8 = getDivRem8(N, DAG))
   30281     return DivRem8;
   30282 
   30283   return SDValue();
   30284 }
   30285 
   30286 /// Optimize x == -y --> x+y == 0
   30287 ///          x != -y --> x+y != 0
   30288 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
   30289                             const X86Subtarget &Subtarget) {
   30290   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
   30291   SDValue LHS = N->getOperand(0);
   30292   SDValue RHS = N->getOperand(1);
   30293   EVT VT = N->getValueType(0);
   30294   SDLoc DL(N);
   30295 
   30296   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
   30297     if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
   30298       SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
   30299                                  LHS.getOperand(1));
   30300       return DAG.getSetCC(DL, N->getValueType(0), addV,
   30301                           DAG.getConstant(0, DL, addV.getValueType()), CC);
   30302     }
   30303   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
   30304     if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
   30305       SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
   30306                                  RHS.getOperand(1));
   30307       return DAG.getSetCC(DL, N->getValueType(0), addV,
   30308                           DAG.getConstant(0, DL, addV.getValueType()), CC);
   30309     }
   30310 
   30311   if (VT.getScalarType() == MVT::i1 &&
   30312       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
   30313     bool IsSEXT0 =
   30314         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
   30315         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
   30316     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
   30317 
   30318     if (!IsSEXT0 || !IsVZero1) {
   30319       // Swap the operands and update the condition code.
   30320       std::swap(LHS, RHS);
   30321       CC = ISD::getSetCCSwappedOperands(CC);
   30322 
   30323       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
   30324                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
   30325       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
   30326     }
   30327 
   30328     if (IsSEXT0 && IsVZero1) {
   30329       assert(VT == LHS.getOperand(0).getValueType() &&
   30330              "Uexpected operand type");
   30331       if (CC == ISD::SETGT)
   30332         return DAG.getConstant(0, DL, VT);
   30333       if (CC == ISD::SETLE)
   30334         return DAG.getConstant(1, DL, VT);
   30335       if (CC == ISD::SETEQ || CC == ISD::SETGE)
   30336         return DAG.getNOT(DL, LHS.getOperand(0), VT);
   30337 
   30338       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
   30339              "Unexpected condition code!");
   30340       return LHS.getOperand(0);
   30341     }
   30342   }
   30343 
   30344   // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
   30345   // via legalization because v4i32 is not a legal type.
   30346   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
   30347     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
   30348 
   30349   return SDValue();
   30350 }
   30351 
   30352 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
   30353   SDLoc DL(N);
   30354   // Gather and Scatter instructions use k-registers for masks. The type of
   30355   // the masks is v*i1. So the mask will be truncated anyway.
   30356   // The SIGN_EXTEND_INREG may be dropped.
   30357   SDValue Mask = N->getOperand(2);
   30358   if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
   30359     SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
   30360     NewOps[2] = Mask.getOperand(0);
   30361     DAG.UpdateNodeOperands(N, NewOps);
   30362   }
   30363   return SDValue();
   30364 }
   30365 
   30366 // Helper function of combineX86SetCC. It materializes "setb reg" as
   30367 // "sbb reg,reg", since SBB can be extended without a zext and produces
   30368 // an all-ones bit, which is more useful than 0/1 in some cases.
   30369 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
   30370                                SelectionDAG &DAG, MVT VT) {
   30371   if (VT == MVT::i8)
   30372     return DAG.getNode(ISD::AND, DL, VT,
   30373                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   30374                                    DAG.getConstant(X86::COND_B, DL, MVT::i8),
   30375                                    EFLAGS),
   30376                        DAG.getConstant(1, DL, VT));
   30377   assert(VT == MVT::i1 && "Unexpected type for SETCC node");
   30378   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
   30379                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   30380                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
   30381                                  EFLAGS));
   30382 }
   30383 
   30384 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
   30385 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
   30386                                TargetLowering::DAGCombinerInfo &DCI,
   30387                                const X86Subtarget &Subtarget) {
   30388   SDLoc DL(N);
   30389   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
   30390   SDValue EFLAGS = N->getOperand(1);
   30391 
   30392   if (CC == X86::COND_A) {
   30393     // Try to convert COND_A into COND_B in an attempt to facilitate
   30394     // materializing "setb reg".
   30395     //
   30396     // Do not flip "e > c", where "c" is a constant, because the CMP
   30397     // instruction cannot take an immediate as its first operand.
   30398     //
   30399     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
   30400         EFLAGS.getValueType().isInteger() &&
   30401         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
   30402       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
   30403                                    EFLAGS.getNode()->getVTList(),
   30404                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
   30405       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
   30406       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
   30407     }
   30408   }
   30409 
   30410   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
   30411   // a zext and produces an all-ones bit which is more useful than 0/1 in some
   30412   // cases.
   30413   if (CC == X86::COND_B)
   30414     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
   30415 
   30416   // Try to simplify the EFLAGS and condition code operands.
   30417   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
   30418     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
   30419     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
   30420   }
   30421 
   30422   return SDValue();
   30423 }
   30424 
   30425 /// Optimize branch condition evaluation.
   30426 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
   30427                              TargetLowering::DAGCombinerInfo &DCI,
   30428                              const X86Subtarget &Subtarget) {
   30429   SDLoc DL(N);
   30430   SDValue EFLAGS = N->getOperand(3);
   30431   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
   30432 
   30433   // Try to simplify the EFLAGS and condition code operands.
   30434   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
   30435   // RAUW them under us.
   30436   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
   30437     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
   30438     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
   30439                        N->getOperand(1), Cond, Flags);
   30440   }
   30441 
   30442   return SDValue();
   30443 }
   30444 
   30445 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
   30446                                                   SelectionDAG &DAG) {
   30447   // Take advantage of vector comparisons producing 0 or -1 in each lane to
   30448   // optimize away operation when it's from a constant.
   30449   //
   30450   // The general transformation is:
   30451   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
   30452   //       AND(VECTOR_CMP(x,y), constant2)
   30453   //    constant2 = UNARYOP(constant)
   30454 
   30455   // Early exit if this isn't a vector operation, the operand of the
   30456   // unary operation isn't a bitwise AND, or if the sizes of the operations
   30457   // aren't the same.
   30458   EVT VT = N->getValueType(0);
   30459   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
   30460       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
   30461       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
   30462     return SDValue();
   30463 
   30464   // Now check that the other operand of the AND is a constant. We could
   30465   // make the transformation for non-constant splats as well, but it's unclear
   30466   // that would be a benefit as it would not eliminate any operations, just
   30467   // perform one more step in scalar code before moving to the vector unit.
   30468   if (BuildVectorSDNode *BV =
   30469           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
   30470     // Bail out if the vector isn't a constant.
   30471     if (!BV->isConstant())
   30472       return SDValue();
   30473 
   30474     // Everything checks out. Build up the new and improved node.
   30475     SDLoc DL(N);
   30476     EVT IntVT = BV->getValueType(0);
   30477     // Create a new constant of the appropriate type for the transformed
   30478     // DAG.
   30479     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
   30480     // The AND node needs bitcasts to/from an integer vector type around it.
   30481     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
   30482     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
   30483                                  N->getOperand(0)->getOperand(0), MaskConst);
   30484     SDValue Res = DAG.getBitcast(VT, NewAnd);
   30485     return Res;
   30486   }
   30487 
   30488   return SDValue();
   30489 }
   30490 
   30491 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
   30492                                const X86Subtarget &Subtarget) {
   30493   SDValue Op0 = N->getOperand(0);
   30494   EVT VT = N->getValueType(0);
   30495   EVT InVT = Op0.getValueType();
   30496   EVT InSVT = InVT.getScalarType();
   30497   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   30498 
   30499   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
   30500   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
   30501   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
   30502     SDLoc dl(N);
   30503     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   30504                                  InVT.getVectorNumElements());
   30505     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
   30506 
   30507     if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
   30508       return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
   30509 
   30510     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
   30511   }
   30512 
   30513   return SDValue();
   30514 }
   30515 
   30516 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
   30517                                const X86Subtarget &Subtarget) {
   30518   // First try to optimize away the conversion entirely when it's
   30519   // conditionally from a constant. Vectors only.
   30520   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
   30521     return Res;
   30522 
   30523   // Now move on to more general possibilities.
   30524   SDValue Op0 = N->getOperand(0);
   30525   EVT VT = N->getValueType(0);
   30526   EVT InVT = Op0.getValueType();
   30527   EVT InSVT = InVT.getScalarType();
   30528 
   30529   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
   30530   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
   30531   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
   30532     SDLoc dl(N);
   30533     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   30534                                  InVT.getVectorNumElements());
   30535     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
   30536     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
   30537   }
   30538 
   30539   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
   30540   // a 32-bit target where SSE doesn't support i64->FP operations.
   30541   if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
   30542     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
   30543     EVT LdVT = Ld->getValueType(0);
   30544 
   30545     // This transformation is not supported if the result type is f16 or f128.
   30546     if (VT == MVT::f16 || VT == MVT::f128)
   30547       return SDValue();
   30548 
   30549     if (!Ld->isVolatile() && !VT.isVector() &&
   30550         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
   30551         !Subtarget.is64Bit() && LdVT == MVT::i64) {
   30552       SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
   30553           SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
   30554       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
   30555       return FILDChain;
   30556     }
   30557   }
   30558   return SDValue();
   30559 }
   30560 
   30561 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
   30562 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
   30563                           X86TargetLowering::DAGCombinerInfo &DCI) {
   30564   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
   30565   // the result is either zero or one (depending on the input carry bit).
   30566   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
   30567   if (X86::isZeroNode(N->getOperand(0)) &&
   30568       X86::isZeroNode(N->getOperand(1)) &&
   30569       // We don't have a good way to replace an EFLAGS use, so only do this when
   30570       // dead right now.
   30571       SDValue(N, 1).use_empty()) {
   30572     SDLoc DL(N);
   30573     EVT VT = N->getValueType(0);
   30574     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
   30575     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
   30576                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   30577                                            DAG.getConstant(X86::COND_B, DL,
   30578                                                            MVT::i8),
   30579                                            N->getOperand(2)),
   30580                                DAG.getConstant(1, DL, VT));
   30581     return DCI.CombineTo(N, Res1, CarryOut);
   30582   }
   30583 
   30584   return SDValue();
   30585 }
   30586 
   30587 /// fold (add Y, (sete  X, 0)) -> adc  0, Y
   30588 ///      (add Y, (setne X, 0)) -> sbb -1, Y
   30589 ///      (sub (sete  X, 0), Y) -> sbb  0, Y
   30590 ///      (sub (setne X, 0), Y) -> adc -1, Y
   30591 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
   30592   SDLoc DL(N);
   30593 
   30594   // Look through ZExts.
   30595   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
   30596   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
   30597     return SDValue();
   30598 
   30599   SDValue SetCC = Ext.getOperand(0);
   30600   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
   30601     return SDValue();
   30602 
   30603   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
   30604   if (CC != X86::COND_E && CC != X86::COND_NE)
   30605     return SDValue();
   30606 
   30607   SDValue Cmp = SetCC.getOperand(1);
   30608   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
   30609       !X86::isZeroNode(Cmp.getOperand(1)) ||
   30610       !Cmp.getOperand(0).getValueType().isInteger())
   30611     return SDValue();
   30612 
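           // Compare the operand against 1 rather than 0: the subtraction borrows
           // (sets CF) exactly when the operand is zero, producing the carry that
           // the ADC/SBB built below consumes.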
   30613   SDValue CmpOp0 = Cmp.getOperand(0);
   30614   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
   30615                                DAG.getConstant(1, DL, CmpOp0.getValueType()));
   30616 
   30617   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
   30618   if (CC == X86::COND_NE)
   30619     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
   30620                        DL, OtherVal.getValueType(), OtherVal,
   30621                        DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
   30622                        NewCmp);
   30623   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
   30624                      DL, OtherVal.getValueType(), OtherVal,
   30625                      DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
   30626 }
   30627 
   30628 static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
   30629                                 const X86Subtarget &Subtarget) {
   30630   SDLoc DL(N);
   30631   EVT VT = N->getValueType(0);
   30632   SDValue Op0 = N->getOperand(0);
   30633   SDValue Op1 = N->getOperand(1);
   30634 
   30635   if (!VT.isVector() || !VT.isSimple() ||
   30636       !(VT.getVectorElementType() == MVT::i32))
   30637     return SDValue();
   30638 
   30639   unsigned RegSize = 128;
   30640   if (Subtarget.hasBWI())
   30641     RegSize = 512;
   30642   else if (Subtarget.hasAVX2())
   30643     RegSize = 256;
   30644 
   30645   // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
   30646   if (VT.getSizeInBits() / 4 > RegSize)
   30647     return SDValue();
   30648 
   30649   // Detect the following pattern:
   30650   //
   30651   // 1:    %2 = zext <N x i8> %0 to <N x i32>
   30652   // 2:    %3 = zext <N x i8> %1 to <N x i32>
   30653   // 3:    %4 = sub nsw <N x i32> %2, %3
   30654   // 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
   30655   // 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
   30656   // 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
   30657   // 7:    %8 = add nsw <N x i32> %7, %vec.phi
   30658   //
   30659   // The last instruction must be a reduction add. The instructions 3-6 form an
   30660   // ABSDIFF pattern.
   30661 
   30662   // The two operands of reduction add are from PHI and a select-op as in line 7
   30663   // above.
   30664   SDValue SelectOp, Phi;
   30665   if (Op0.getOpcode() == ISD::VSELECT) {
   30666     SelectOp = Op0;
   30667     Phi = Op1;
   30668   } else if (Op1.getOpcode() == ISD::VSELECT) {
   30669     SelectOp = Op1;
   30670     Phi = Op0;
   30671   } else
   30672     return SDValue();
   30673 
   30674   // Check that the condition of the select instruction is greater-than.
   30675   SDValue SetCC = SelectOp->getOperand(0);
   30676   if (SetCC.getOpcode() != ISD::SETCC)
   30677     return SDValue();
   30678   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
   30679   if (CC != ISD::SETGT)
   30680     return SDValue();
   30681 
   30682   Op0 = SelectOp->getOperand(1);
   30683   Op1 = SelectOp->getOperand(2);
   30684 
   30685   // The second operand of SelectOp Op1 is the negation of the first operand
   30686   // Op0, which is implemented as 0 - Op0.
   30687   if (!(Op1.getOpcode() == ISD::SUB &&
   30688         ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
   30689         Op1.getOperand(1) == Op0))
   30690     return SDValue();
   30691 
   30692   // The first operand of SetCC is the first operand of SelectOp, which is the
   30693   // difference between two input vectors.
   30694   if (SetCC.getOperand(0) != Op0)
   30695     return SDValue();
   30696 
   30697   // The second operand of > comparison can be either -1 or 0.
   30698   if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
   30699         ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
   30700     return SDValue();
   30701 
   30702   // The first operand of SelectOp is the difference between two input vectors.
   30703   if (Op0.getOpcode() != ISD::SUB)
   30704     return SDValue();
   30705 
   30706   Op1 = Op0.getOperand(1);
   30707   Op0 = Op0.getOperand(0);
   30708 
   30709   // Check if the operands of the diff are zero-extended from vectors of i8.
   30710   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
   30711       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
   30712       Op1.getOpcode() != ISD::ZERO_EXTEND ||
   30713       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
   30714     return SDValue();
   30715 
   30716   // SAD pattern detected. Now build a SAD instruction and an addition for
   30717   // reduction. Note that the number of elements of the result of SAD is less
   30718   // than the number of elements of its input. Therefore, we can only update
   30719   // part of the elements in the reduction vector.
   30720 
   30721   // Legalize the type of the inputs of PSADBW.
   30722   EVT InVT = Op0.getOperand(0).getValueType();
   30723   if (InVT.getSizeInBits() <= 128)
   30724     RegSize = 128;
   30725   else if (InVT.getSizeInBits() <= 256)
   30726     RegSize = 256;
   30727 
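           // Pad the narrow inputs with zero vectors up to the chosen register
           // width; the extra lanes contribute |0 - 0| = 0 to the PSADBW sums, so
           // the result is unchanged.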
   30728   unsigned NumConcat = RegSize / InVT.getSizeInBits();
   30729   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
   30730   Ops[0] = Op0.getOperand(0);
   30731   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
   30732   Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
   30733   Ops[0] = Op1.getOperand(0);
   30734   Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
   30735 
   30736   // The output of PSADBW is a vector of i64.
   30737   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
   30738   SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
   30739 
   30740   // We need to turn the vector of i64 into a vector of i32.
   30741   // If the reduction vector is at least as wide as the psadbw result, just
   30742   // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
   30743   // anyway.
   30744   MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
   30745   if (VT.getSizeInBits() >= ResVT.getSizeInBits())
   30746     Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
   30747   else
   30748     Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
   30749 
   30750   if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
   30751     // Update part of the elements of the reduction vector. This is done by
   30752     // first extracting a sub-vector from it, updating this sub-vector, and
   30753     // inserting it back.
   30754     SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
   30755                                  DAG.getIntPtrConstant(0, DL));
   30756     SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
   30757     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
   30758                        DAG.getIntPtrConstant(0, DL));
   30759   } else
   30760     return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
   30761 }
   30762 
   30763 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   30764                           const X86Subtarget &Subtarget) {
   30765   const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
   30766   if (Flags->hasVectorReduction()) {
   30767     if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
   30768       return Sad;
   30769   }
   30770   EVT VT = N->getValueType(0);
   30771   SDValue Op0 = N->getOperand(0);
   30772   SDValue Op1 = N->getOperand(1);
   30773 
   30774   // Try to synthesize horizontal adds from adds of shuffles.
   30775   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   30776        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   30777       isHorizontalBinOp(Op0, Op1, true))
   30778     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
   30779 
   30780   return OptimizeConditionalInDecrement(N, DAG);
   30781 }
   30782 
   30783 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   30784                           const X86Subtarget &Subtarget) {
   30785   SDValue Op0 = N->getOperand(0);
   30786   SDValue Op1 = N->getOperand(1);
   30787 
   30788   // X86 can't encode an immediate LHS of a sub. See if we can push the
   30789   // negation into a preceding instruction.
   30790   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
   30791     // If the RHS of the sub is a XOR with one use and a constant, invert the
   30792     // immediate. Then add one to the LHS of the sub so we can turn
   30793     // X-Y -> X+~Y+1, saving one register.
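             // For example (illustrative): 5 - (x ^ 3) becomes (x ^ ~3) + 6.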
   30794     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
   30795         isa<ConstantSDNode>(Op1.getOperand(1))) {
   30796       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
   30797       EVT VT = Op0.getValueType();
   30798       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
   30799                                    Op1.getOperand(0),
   30800                                    DAG.getConstant(~XorC, SDLoc(Op1), VT));
   30801       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
   30802                          DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
   30803     }
   30804   }
   30805 
   30806   // Try to synthesize horizontal adds from adds of shuffles.
   30807   EVT VT = N->getValueType(0);
   30808   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   30809        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   30810       isHorizontalBinOp(Op0, Op1, true))
   30811     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
   30812 
   30813   return OptimizeConditionalInDecrement(N, DAG);
   30814 }
   30815 
   30816 static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
   30817                             TargetLowering::DAGCombinerInfo &DCI,
   30818                             const X86Subtarget &Subtarget) {
   30819   SDLoc DL(N);
   30820   MVT VT = N->getSimpleValueType(0);
   30821   MVT SVT = VT.getVectorElementType();
   30822   SDValue Op = N->getOperand(0);
   30823   MVT OpVT = Op.getSimpleValueType();
   30824   MVT OpEltVT = OpVT.getVectorElementType();
   30825   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
   30826 
   30827   // Perform any constant folding.
   30828   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
   30829     SmallVector<SDValue, 4> Vals;
   30830     for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
   30831       SDValue OpElt = Op.getOperand(i);
   30832       if (OpElt.getOpcode() == ISD::UNDEF) {
   30833         Vals.push_back(DAG.getUNDEF(SVT));
   30834         continue;
   30835       }
   30836       APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
   30837       assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
   30838       Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
   30839       Vals.push_back(DAG.getConstant(Cst, DL, SVT));
   30840     }
   30841     return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
   30842   }
   30843 
   30844   // (vzext (bitcast (vzext x))) -> (vzext x)
   30845   SDValue V = peekThroughBitcasts(Op);
   30846   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
   30847     MVT InnerVT = V.getSimpleValueType();
   30848     MVT InnerEltVT = InnerVT.getVectorElementType();
   30849 
   30850     // If the element sizes match exactly, we can just do one larger vzext. This
   30851     // is always an exact type match as vzext operates on integer types.
   30852     if (OpEltVT == InnerEltVT) {
   30853       assert(OpVT == InnerVT && "Types must match for vzext!");
   30854       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
   30855     }
   30856 
   30857     // The only other way we can combine them is if only a single element of the
   30858     // inner vzext is used in the input to the outer vzext.
   30859     if (InnerEltVT.getSizeInBits() < InputBits)
   30860       return SDValue();
   30861 
   30862     // In this case, the inner vzext is completely dead because we're going to
   30863     // only look at bits inside of the low element. Just do the outer vzext on
   30864     // a bitcast of the input to the inner.
   30865     return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
   30866   }
   30867 
   30868   // Check if we can bypass extracting and re-inserting an element of an input
   30869   // vector. Essentially:
   30870   // (vzext (scalar_to_vector (extract_vector_elt x, 0))) -> (vzext (bitcast x))
   30871   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   30872       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   30873       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
   30874     SDValue ExtractedV = V.getOperand(0);
   30875     SDValue OrigV = ExtractedV.getOperand(0);
   30876     if (isNullConstant(ExtractedV.getOperand(1))) {
   30877         MVT OrigVT = OrigV.getSimpleValueType();
   30878         // Extract a subvector if necessary...
   30879         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
   30880           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
   30881           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
   30882                                     OrigVT.getVectorNumElements() / Ratio);
   30883           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
   30884                               DAG.getIntPtrConstant(0, DL));
   30885         }
   30886         Op = DAG.getBitcast(OpVT, OrigV);
   30887         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
   30888       }
   30889   }
   30890 
   30891   return SDValue();
   30892 }
   30893 
   30894 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
   30895 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
   30896                                   const X86Subtarget &Subtarget) {
   30897   SDValue Chain = N->getOperand(0);
   30898   SDValue LHS = N->getOperand(1);
   30899   SDValue RHS = N->getOperand(2);
   30900   MVT VT = RHS.getSimpleValueType();
   30901   SDLoc DL(N);
   30902 
   30903   auto *C = dyn_cast<ConstantSDNode>(RHS);
   30904   if (!C || C->getZExtValue() != 1)
   30905     return SDValue();
   30906 
   30907   RHS = DAG.getConstant(-1, DL, VT);
   30908   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
   30909   return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
   30910                                  DAG.getVTList(MVT::i32, MVT::Other),
   30911                                  {Chain, LHS, RHS}, VT, MMO);
   30912 }
   30913 
   30914 // TEST (AND a, b), (AND a, b) -> TEST a, b
   30915 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
   30916   SDValue Op0 = N->getOperand(0);
   30917   SDValue Op1 = N->getOperand(1);
   30918 
   30919   if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
   30920     return SDValue();
   30921 
   30922   EVT VT = N->getValueType(0);
   30923   SDLoc DL(N);
   30924 
   30925   return DAG.getNode(X86ISD::TESTM, DL, VT,
   30926                      Op0->getOperand(0), Op0->getOperand(1));
   30927 }
   30928 
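         // Fold a vector compare of an operand with itself: PCMPEQ x, x is
         // all-ones and PCMPGT x, x is all-zeros.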
   30929 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
   30930                                     const X86Subtarget &Subtarget) {
   30931   MVT VT = N->getSimpleValueType(0);
   30932   SDLoc DL(N);
   30933 
   30934   if (N->getOperand(0) == N->getOperand(1)) {
   30935     if (N->getOpcode() == X86ISD::PCMPEQ)
   30936       return getOnesVector(VT, Subtarget, DAG, DL);
   30937     if (N->getOpcode() == X86ISD::PCMPGT)
   30938       return getZeroVector(VT, Subtarget, DAG, DL);
   30939   }
   30940 
   30941   return SDValue();
   30942 }
   30943 
   30944 
   30945 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   30946                                              DAGCombinerInfo &DCI) const {
   30947   SelectionDAG &DAG = DCI.DAG;
   30948   switch (N->getOpcode()) {
   30949   default: break;
   30950   case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
   30951   case ISD::VSELECT:
   30952   case ISD::SELECT:
   30953   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
   30954   case ISD::BITCAST:        return combineBitcast(N, DAG, Subtarget);
   30955   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
   30956   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
   30957   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
   30958   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
   30959   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
   30960   case ISD::SHL:
   30961   case ISD::SRA:
   30962   case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
   30963   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
   30964   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
   30965   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
   30966   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
   30967   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
   30968   case ISD::STORE:          return combineStore(N, DAG, Subtarget);
   30969   case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
   30970   case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
   30971   case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
   30972   case ISD::FADD:
   30973   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
   30974   case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
   30975   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
   30976   case X86ISD::FXOR:
   30977   case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
   30978   case X86ISD::FMIN:
   30979   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
   30980   case ISD::FMINNUM:
   30981   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
   30982   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
   30983   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
   30984   case X86ISD::BT:          return combineBT(N, DAG, DCI);
   30985   case X86ISD::VZEXT_MOVL:  return combineVZextMovl(N, DAG);
   30986   case ISD::ANY_EXTEND:
   30987   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
   30988   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
   30989   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
   30990   case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
   30991   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, DCI, Subtarget);
   30992   case X86ISD::BRCOND:      return combineBrCond(N, DAG, DCI, Subtarget);
   30993   case X86ISD::VZEXT:       return combineVZext(N, DAG, DCI, Subtarget);
   30994   case X86ISD::SHUFP:       // Handle all target specific shuffles
   30995   case X86ISD::INSERTPS:
   30996   case X86ISD::PALIGNR:
   30997   case X86ISD::VSHLDQ:
   30998   case X86ISD::VSRLDQ:
   30999   case X86ISD::BLENDI:
   31000   case X86ISD::UNPCKH:
   31001   case X86ISD::UNPCKL:
   31002   case X86ISD::MOVHLPS:
   31003   case X86ISD::MOVLHPS:
   31004   case X86ISD::PSHUFB:
   31005   case X86ISD::PSHUFD:
   31006   case X86ISD::PSHUFHW:
   31007   case X86ISD::PSHUFLW:
   31008   case X86ISD::MOVSHDUP:
   31009   case X86ISD::MOVSLDUP:
   31010   case X86ISD::MOVDDUP:
   31011   case X86ISD::MOVSS:
   31012   case X86ISD::MOVSD:
   31013   case X86ISD::VPPERM:
   31014   case X86ISD::VPERMV:
   31015   case X86ISD::VPERMV3:
   31016   case X86ISD::VPERMIL2:
   31017   case X86ISD::VPERMILPI:
   31018   case X86ISD::VPERMILPV:
   31019   case X86ISD::VPERM2X128:
   31020   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
   31021   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
   31022   case ISD::MGATHER:
   31023   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
   31024   case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
   31025   case X86ISD::TESTM:       return combineTestM(N, DAG);
   31026   case X86ISD::PCMPEQ:
   31027   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
   31028   }
   31029 
   31030   return SDValue();
   31031 }
   31032 
   31033 /// Return true if the target has native support for the specified value type
   31034 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
   31035 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
   31036 /// some i16 instructions are slow.
   31037 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   31038   if (!isTypeLegal(VT))
   31039     return false;
   31040   if (VT != MVT::i16)
   31041     return true;
   31042 
   31043   switch (Opc) {
   31044   default:
   31045     return true;
   31046   case ISD::LOAD:
   31047   case ISD::SIGN_EXTEND:
   31048   case ISD::ZERO_EXTEND:
   31049   case ISD::ANY_EXTEND:
   31050   case ISD::SHL:
   31051   case ISD::SRL:
   31052   case ISD::SUB:
   31053   case ISD::ADD:
   31054   case ISD::MUL:
   31055   case ISD::AND:
   31056   case ISD::OR:
   31057   case ISD::XOR:
   31058     return false;
   31059   }
   31060 }
   31061 
   31062 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
   31063 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
   31064 /// we don't adjust the stack we clobber the first frame index.
   31065 /// See X86InstrInfo::copyPhysReg.
   31066 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
   31067     MachineFunction *MF) const {
   31068   const MachineRegisterInfo &MRI = MF->getRegInfo();
   31069 
   31070   return any_of(MRI.reg_instructions(X86::EFLAGS),
   31071                 [](const MachineInstr &RI) { return RI.isCopy(); });
   31072 }
   31073 
   31074 /// This method queries the target whether it is beneficial for the DAG
   31075 /// combiner to promote the specified node. If true, it should return the
   31076 /// desired promotion type by reference.
   31077 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   31078   EVT VT = Op.getValueType();
   31079   if (VT != MVT::i16)
   31080     return false;
   31081 
   31082   bool Promote = false;
   31083   bool Commute = false;
   31084   switch (Op.getOpcode()) {
   31085   default: break;
   31086   case ISD::SIGN_EXTEND:
   31087   case ISD::ZERO_EXTEND:
   31088   case ISD::ANY_EXTEND:
   31089     Promote = true;
   31090     break;
   31091   case ISD::SHL:
   31092   case ISD::SRL: {
   31093     SDValue N0 = Op.getOperand(0);
   31094     // Look out for (store (shl (load), x)).
   31095     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
   31096       return false;
   31097     Promote = true;
   31098     break;
   31099   }
   31100   case ISD::ADD:
   31101   case ISD::MUL:
   31102   case ISD::AND:
   31103   case ISD::OR:
   31104   case ISD::XOR:
   31105     Commute = true;
   31106     // fallthrough
   31107   case ISD::SUB: {
   31108     SDValue N0 = Op.getOperand(0);
   31109     SDValue N1 = Op.getOperand(1);
   31110     if (!Commute && MayFoldLoad(N1))
   31111       return false;
   31112     // Avoid disabling potential load folding opportunities.
   31113     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
   31114       return false;
   31115     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
   31116       return false;
   31117     Promote = true;
   31118   }
   31119   }
   31120 
   31121   PVT = MVT::i32;
   31122   return Promote;
   31123 }
   31124 
   31125 //===----------------------------------------------------------------------===//
   31126 //                           X86 Inline Assembly Support
   31127 //===----------------------------------------------------------------------===//
   31128 
   31129 // Helper to match an asm string against whitespace-separated pieces.
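         // For example (illustrative): matchAsm("bswap $0", {"bswap", "$0"})
         // matches, while matchAsm("bswapper $0", {"bswap", "$0"}) does not.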
   31130 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
   31131   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
   31132 
   31133   for (StringRef Piece : Pieces) {
   31134     if (!S.startswith(Piece)) // Check if the piece matches.
   31135       return false;
   31136 
   31137     S = S.substr(Piece.size());
   31138     StringRef::size_type Pos = S.find_first_not_of(" \t");
   31139     if (Pos == 0) // We matched a prefix.
   31140       return false;
   31141 
   31142     S = S.substr(Pos);
   31143   }
   31144 
   31145   return S.empty();
   31146 }
   31147 
   31148 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
   31149 
   31150   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
   31151     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
   31152         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
   31153         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
   31154 
   31155       if (AsmPieces.size() == 3)
   31156         return true;
   31157       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
   31158         return true;
   31159     }
   31160   }
   31161   return false;
   31162 }
   31163 
   31164 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   31165   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
   31166 
   31167   const std::string &AsmStr = IA->getAsmString();
   31168 
   31169   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
   31170   if (!Ty || Ty->getBitWidth() % 16 != 0)
   31171     return false;
   31172 
   31173   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
   31174   SmallVector<StringRef, 4> AsmPieces;
   31175   SplitString(AsmStr, AsmPieces, ";\n");
   31176 
   31177   switch (AsmPieces.size()) {
   31178   default: return false;
   31179   case 1:
   31180     // FIXME: this should verify that we are targeting a 486 or better.  If not,
   31181     // we will turn this bswap into something that will be lowered to logical
   31182     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
   31183     // lower so don't worry about this.
   31184     // bswap $0
   31185     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
   31186         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
   31187         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
   31188         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
   31189         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
   31190         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
   31191       // No need to check constraints, nothing other than the equivalent of
   31192       // "=r,0" would be valid here.
   31193       return IntrinsicLowering::LowerToByteSwap(CI);
   31194     }
   31195 
   31196     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
   31197     if (CI->getType()->isIntegerTy(16) &&
   31198         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   31199         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
   31200          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
   31201       AsmPieces.clear();
   31202       StringRef ConstraintsStr = IA->getConstraintString();
   31203       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   31204       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   31205       if (clobbersFlagRegisters(AsmPieces))
   31206         return IntrinsicLowering::LowerToByteSwap(CI);
   31207     }
   31208     break;
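           // rorw $$8, ${0:w}; rorl $$16, $0; rorw $$8, ${0:w}  -->  llvm.bswap.i32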
   31209   case 3:
   31210     if (CI->getType()->isIntegerTy(32) &&
   31211         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   31212         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
   31213         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
   31214         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
   31215       AsmPieces.clear();
   31216       StringRef ConstraintsStr = IA->getConstraintString();
   31217       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   31218       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   31219       if (clobbersFlagRegisters(AsmPieces))
   31220         return IntrinsicLowering::LowerToByteSwap(CI);
   31221     }
   31222 
   31223     if (CI->getType()->isIntegerTy(64)) {
   31224       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
   31225       if (Constraints.size() >= 2 &&
   31226           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
   31227           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
   31228         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
   31229         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
   31230             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
   31231             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
   31232           return IntrinsicLowering::LowerToByteSwap(CI);
   31233       }
   31234     }
   31235     break;
   31236   }
   31237   return false;
   31238 }
   31239 
   31240 /// Given a constraint letter, return the type of constraint for this target.
   31241 X86TargetLowering::ConstraintType
   31242 X86TargetLowering::getConstraintType(StringRef Constraint) const {
   31243   if (Constraint.size() == 1) {
   31244     switch (Constraint[0]) {
   31245     case 'R':
   31246     case 'q':
   31247     case 'Q':
   31248     case 'f':
   31249     case 't':
   31250     case 'u':
   31251     case 'y':
   31252     case 'x':
   31253     case 'Y':
   31254     case 'l':
   31255       return C_RegisterClass;
   31256     case 'a':
   31257     case 'b':
   31258     case 'c':
   31259     case 'd':
   31260     case 'S':
   31261     case 'D':
   31262     case 'A':
   31263       return C_Register;
   31264     case 'I':
   31265     case 'J':
   31266     case 'K':
   31267     case 'L':
   31268     case 'M':
   31269     case 'N':
   31270     case 'G':
   31271     case 'C':
   31272     case 'e':
   31273     case 'Z':
   31274       return C_Other;
   31275     default:
   31276       break;
   31277     }
   31278   }
   31279   return TargetLowering::getConstraintType(Constraint);
   31280 }
   31281 
   31282 /// Examine constraint type and operand type and determine a weight value.
   31283 /// This object must already have been set up with the operand type
   31284 /// and the current alternative constraint selected.
   31285 TargetLowering::ConstraintWeight
   31286   X86TargetLowering::getSingleConstraintMatchWeight(
   31287     AsmOperandInfo &info, const char *constraint) const {
   31288   ConstraintWeight weight = CW_Invalid;
   31289   Value *CallOperandVal = info.CallOperandVal;
   31290   // If we don't have a value, we can't do a match,
   31291   // but allow it at the lowest weight.
   31292   if (!CallOperandVal)
   31293     return CW_Default;
   31294   Type *type = CallOperandVal->getType();
   31295   // Look at the constraint type.
   31296   switch (*constraint) {
   31297   default:
   31298     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
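             // Note: falls through to the cases below, which may override the weight.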
   31299   case 'R':
   31300   case 'q':
   31301   case 'Q':
   31302   case 'a':
   31303   case 'b':
   31304   case 'c':
   31305   case 'd':
   31306   case 'S':
   31307   case 'D':
   31308   case 'A':
   31309     if (type->isIntegerTy())
   31310       weight = CW_SpecificReg;
   31311     break;
   31312   case 'f':
   31313   case 't':
   31314   case 'u':
   31315     if (type->isFloatingPointTy())
   31316       weight = CW_SpecificReg;
   31317     break;
   31318   case 'y':
   31319     if (type->isX86_MMXTy() && Subtarget.hasMMX())
   31320       weight = CW_SpecificReg;
   31321     break;
   31322   case 'x':
   31323   case 'Y':
   31324     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
   31325         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
   31326       weight = CW_Register;
   31327     break;
   31328   case 'I':
   31329     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   31330       if (C->getZExtValue() <= 31)
   31331         weight = CW_Constant;
   31332     }
   31333     break;
   31334   case 'J':
   31335     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   31336       if (C->getZExtValue() <= 63)
   31337         weight = CW_Constant;
   31338     }
   31339     break;
   31340   case 'K':
   31341     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   31342       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
   31343         weight = CW_Constant;
   31344     }
   31345     break;
   31346   case 'L':
   31347     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   31348       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
   31349         weight = CW_Constant;
   31350     }
   31351     break;
   31352   case 'M':
   31353     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   31354       if (C->getZExtValue() <= 3)
   31355         weight = CW_Constant;
   31356     }
   31357     break;
   31358   case 'N':
   31359     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   31360       if (C->getZExtValue() <= 0xff)
   31361         weight = CW_Constant;
   31362     }
   31363     break;
   31364   case 'G':
   31365   case 'C':
   31366     if (isa<ConstantFP>(CallOperandVal)) {
   31367       weight = CW_Constant;
   31368     }
   31369     break;
   31370   case 'e':
   31371     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   31372       if ((C->getSExtValue() >= -0x80000000LL) &&
   31373           (C->getSExtValue() <= 0x7fffffffLL))
   31374         weight = CW_Constant;
   31375     }
   31376     break;
   31377   case 'Z':
   31378     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   31379       if (C->getZExtValue() <= 0xffffffff)
   31380         weight = CW_Constant;
   31381     }
   31382     break;
   31383   }
   31384   return weight;
   31385 }
   31386 
   31387 /// Try to replace an X constraint, which matches anything, with another that
   31388 /// has more specific requirements based on the type of the corresponding
   31389 /// operand.
   31390 const char *X86TargetLowering::
   31391 LowerXConstraint(EVT ConstraintVT) const {
   31392   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   31393   // 'f' like normal targets.
   31394   if (ConstraintVT.isFloatingPoint()) {
   31395     if (Subtarget.hasSSE2())
   31396       return "Y";
   31397     if (Subtarget.hasSSE1())
   31398       return "x";
   31399   }
   31400 
   31401   return TargetLowering::LowerXConstraint(ConstraintVT);
   31402 }
   31403 
   31404 /// Lower the specified operand into the Ops vector.
   31405 /// If it is invalid, don't add anything to Ops.
   31406 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   31407                                                      std::string &Constraint,
   31408                                                      std::vector<SDValue>&Ops,
   31409                                                      SelectionDAG &DAG) const {
   31410   SDValue Result;
   31411 
   31412   // Only support length 1 constraints for now.
   31413   if (Constraint.length() > 1) return;
   31414 
   31415   char ConstraintLetter = Constraint[0];
   31416   switch (ConstraintLetter) {
   31417   default: break;
   31418   case 'I':
   31419     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31420       if (C->getZExtValue() <= 31) {
   31421         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   31422                                        Op.getValueType());
   31423         break;
   31424       }
   31425     }
   31426     return;
   31427   case 'J':
   31428     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31429       if (C->getZExtValue() <= 63) {
   31430         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   31431                                        Op.getValueType());
   31432         break;
   31433       }
   31434     }
   31435     return;
   31436   case 'K':
   31437     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31438       if (isInt<8>(C->getSExtValue())) {
   31439         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   31440                                        Op.getValueType());
   31441         break;
   31442       }
   31443     }
   31444     return;
   31445   case 'L':
   31446     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31447       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
   31448           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
   31449         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
   31450                                        Op.getValueType());
   31451         break;
   31452       }
   31453     }
   31454     return;
   31455   case 'M':
   31456     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31457       if (C->getZExtValue() <= 3) {
   31458         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   31459                                        Op.getValueType());
   31460         break;
   31461       }
   31462     }
   31463     return;
   31464   case 'N':
   31465     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31466       if (C->getZExtValue() <= 255) {
   31467         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   31468                                        Op.getValueType());
   31469         break;
   31470       }
   31471     }
   31472     return;
   31473   case 'O':
   31474     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31475       if (C->getZExtValue() <= 127) {
   31476         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   31477                                        Op.getValueType());
   31478         break;
   31479       }
   31480     }
   31481     return;
   31482   case 'e': {
   31483     // 32-bit signed value
   31484     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31485       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   31486                                            C->getSExtValue())) {
   31487         // Widen to 64 bits here to get it sign extended.
   31488         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
   31489         break;
   31490       }
   31491     }
   31492     // FIXME gcc accepts some relocatable values here too, but only in certain
   31493     // memory models; it's complicated.
   31494     return;
   31495   }
   31496   case 'Z': {
   31497     // 32-bit unsigned value
   31498     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   31499       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   31500                                            C->getZExtValue())) {
   31501         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   31502                                        Op.getValueType());
   31503         break;
   31504       }
   31505     }
   31506     // FIXME gcc accepts some relocatable values here too, but only in certain
   31507     // memory models; it's complicated.
   31508     return;
   31509   }
   31510   case 'i': {
   31511     // Literal immediates are always ok.
   31512     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
   31513       // Widen to 64 bits here to get it sign extended.
   31514       Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
   31515       break;
   31516     }
   31517 
   31518     // In any sort of PIC mode addresses need to be computed at runtime by
   31519     // adding in a register or some sort of table lookup.  These can't
   31520     // be used as immediates.
   31521     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
   31522       return;
   31523 
   31524     // If we are in non-pic codegen mode, we allow the address of a global (with
   31525     // an optional displacement) to be used with 'i'.
   31526     GlobalAddressSDNode *GA = nullptr;
   31527     int64_t Offset = 0;
   31528 
   31529     // Match either (GA), (GA+C), (GA+C1+C2), etc.
   31530     while (1) {
   31531       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
   31532         Offset += GA->getOffset();
   31533         break;
   31534       } else if (Op.getOpcode() == ISD::ADD) {
   31535         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   31536           Offset += C->getZExtValue();
   31537           Op = Op.getOperand(0);
   31538           continue;
   31539         }
   31540       } else if (Op.getOpcode() == ISD::SUB) {
   31541         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   31542           Offset += -C->getZExtValue();
   31543           Op = Op.getOperand(0);
   31544           continue;
   31545         }
   31546       }
   31547 
   31548       // Otherwise, this isn't something we can handle, reject it.
   31549       return;
   31550     }
   31551 
   31552     const GlobalValue *GV = GA->getGlobal();
   31553     // If we require an extra load to get this address, as in PIC mode, we
   31554     // can't accept it.
   31555     if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
   31556       return;
   31557 
   31558     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
   31559                                         GA->getValueType(0), Offset);
   31560     break;
   31561   }
   31562   }
   31563 
   31564   if (Result.getNode()) {
   31565     Ops.push_back(Result);
   31566     return;
   31567   }
   31568   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   31569 }
   31570 
   31571 /// Check if \p RC is a general purpose register class.
   31572 /// I.e., GR* or one of their variant.
   31573 static bool isGRClass(const TargetRegisterClass &RC) {
   31574   switch (RC.getID()) {
   31575   case X86::GR8RegClassID:
   31576   case X86::GR8_ABCD_LRegClassID:
   31577   case X86::GR8_ABCD_HRegClassID:
   31578   case X86::GR8_NOREXRegClassID:
   31579   case X86::GR16RegClassID:
   31580   case X86::GR16_ABCDRegClassID:
   31581   case X86::GR16_NOREXRegClassID:
   31582   case X86::GR32RegClassID:
   31583   case X86::GR32_ABCDRegClassID:
   31584   case X86::GR32_TCRegClassID:
   31585   case X86::GR32_NOREXRegClassID:
   31586   case X86::GR32_NOAXRegClassID:
   31587   case X86::GR32_NOSPRegClassID:
   31588   case X86::GR32_NOREX_NOSPRegClassID:
   31589   case X86::GR32_ADRegClassID:
   31590   case X86::GR64RegClassID:
   31591   case X86::GR64_ABCDRegClassID:
   31592   case X86::GR64_TCRegClassID:
   31593   case X86::GR64_TCW64RegClassID:
   31594   case X86::GR64_NOREXRegClassID:
   31595   case X86::GR64_NOSPRegClassID:
   31596   case X86::GR64_NOREX_NOSPRegClassID:
   31597   case X86::LOW32_ADDR_ACCESSRegClassID:
   31598   case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
   31599     return true;
   31600   default:
   31601     return false;
   31602   }
   31603 }
   31604 
   31605 /// Check if \p RC is a vector register class.
   31606 /// I.e., FR* / VR* or one of their variant.
   31607 static bool isFRClass(const TargetRegisterClass &RC) {
   31608   switch (RC.getID()) {
   31609   case X86::FR32RegClassID:
   31610   case X86::FR32XRegClassID:
   31611   case X86::FR64RegClassID:
   31612   case X86::FR64XRegClassID:
   31613   case X86::FR128RegClassID:
   31614   case X86::VR64RegClassID:
   31615   case X86::VR128RegClassID:
   31616   case X86::VR128LRegClassID:
   31617   case X86::VR128HRegClassID:
   31618   case X86::VR128XRegClassID:
   31619   case X86::VR256RegClassID:
   31620   case X86::VR256LRegClassID:
   31621   case X86::VR256HRegClassID:
   31622   case X86::VR256XRegClassID:
   31623   case X86::VR512RegClassID:
   31624     return true;
   31625   default:
   31626     return false;
   31627   }
   31628 }
   31629 
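         /// Map an inline-asm register constraint to a (register, register class)
         /// pair for the given type. Single-letter constraints are handled directly
         /// (e.g. 'r' with MVT::i32 yields GR32); explicit registers such as "{ax}"
         /// are resolved by the generic code and fixed up below, so "{ax}" with
         /// MVT::i32 becomes EAX in GR32.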
   31630 std::pair<unsigned, const TargetRegisterClass *>
   31631 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   31632                                                 StringRef Constraint,
   31633                                                 MVT VT) const {
   31634   // First, see if this is a constraint that directly corresponds to an LLVM
   31635   // register class.
   31636   if (Constraint.size() == 1) {
   31637     // GCC Constraint Letters
   31638     switch (Constraint[0]) {
   31639     default: break;
   31640       // TODO: Slight differences here in allocation order and leaving
   31641       // RIP in the class. Do they matter any more here than they do
   31642       // in the normal allocation?
   31643     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
   31644       if (Subtarget.is64Bit()) {
   31645         if (VT == MVT::i32 || VT == MVT::f32)
   31646           return std::make_pair(0U, &X86::GR32RegClass);
   31647         if (VT == MVT::i16)
   31648           return std::make_pair(0U, &X86::GR16RegClass);
   31649         if (VT == MVT::i8 || VT == MVT::i1)
   31650           return std::make_pair(0U, &X86::GR8RegClass);
   31651         if (VT == MVT::i64 || VT == MVT::f64)
   31652           return std::make_pair(0U, &X86::GR64RegClass);
   31653         break;
   31654       }
   31655       // 32-bit fallthrough
   31656     case 'Q':   // Q_REGS
   31657       if (VT == MVT::i32 || VT == MVT::f32)
   31658         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
   31659       if (VT == MVT::i16)
   31660         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
   31661       if (VT == MVT::i8 || VT == MVT::i1)
   31662         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
   31663       if (VT == MVT::i64)
   31664         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
   31665       break;
   31666     case 'r':   // GENERAL_REGS
   31667     case 'l':   // INDEX_REGS
   31668       if (VT == MVT::i8 || VT == MVT::i1)
   31669         return std::make_pair(0U, &X86::GR8RegClass);
   31670       if (VT == MVT::i16)
   31671         return std::make_pair(0U, &X86::GR16RegClass);
   31672       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
   31673         return std::make_pair(0U, &X86::GR32RegClass);
   31674       return std::make_pair(0U, &X86::GR64RegClass);
   31675     case 'R':   // LEGACY_REGS
   31676       if (VT == MVT::i8 || VT == MVT::i1)
   31677         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
   31678       if (VT == MVT::i16)
   31679         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
   31680       if (VT == MVT::i32 || !Subtarget.is64Bit())
   31681         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
   31682       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
   31683     case 'f':  // FP Stack registers.
   31684       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
   31685       // value to the correct fpstack register class.
   31686       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
   31687         return std::make_pair(0U, &X86::RFP32RegClass);
   31688       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
   31689         return std::make_pair(0U, &X86::RFP64RegClass);
   31690       return std::make_pair(0U, &X86::RFP80RegClass);
   31691     case 'y':   // MMX_REGS if MMX allowed.
   31692       if (!Subtarget.hasMMX()) break;
   31693       return std::make_pair(0U, &X86::VR64RegClass);
   31694     case 'Y':   // SSE_REGS if SSE2 allowed
   31695       if (!Subtarget.hasSSE2()) break;
   31696       // FALL THROUGH.
   31697     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
   31698       if (!Subtarget.hasSSE1()) break;
   31699 
   31700       switch (VT.SimpleTy) {
   31701       default: break;
   31702       // Scalar SSE types.
   31703       case MVT::f32:
   31704       case MVT::i32:
   31705         return std::make_pair(0U, &X86::FR32RegClass);
   31706       case MVT::f64:
   31707       case MVT::i64:
   31708         return std::make_pair(0U, &X86::FR64RegClass);
   31709       // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
   31710       // Vector types.
   31711       case MVT::v16i8:
   31712       case MVT::v8i16:
   31713       case MVT::v4i32:
   31714       case MVT::v2i64:
   31715       case MVT::v4f32:
   31716       case MVT::v2f64:
   31717         return std::make_pair(0U, &X86::VR128RegClass);
   31718       // AVX types.
   31719       case MVT::v32i8:
   31720       case MVT::v16i16:
   31721       case MVT::v8i32:
   31722       case MVT::v4i64:
   31723       case MVT::v8f32:
   31724       case MVT::v4f64:
   31725         return std::make_pair(0U, &X86::VR256RegClass);
   31726       case MVT::v8f64:
   31727       case MVT::v16f32:
   31728       case MVT::v16i32:
   31729       case MVT::v8i64:
   31730         return std::make_pair(0U, &X86::VR512RegClass);
   31731       }
   31732       break;
   31733     }
   31734   }
   31735 
   31736   // Use the default implementation in TargetLowering to convert the register
   31737   // constraint into a member of a register class.
   31738   std::pair<unsigned, const TargetRegisterClass*> Res;
   31739   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
   31740 
   31741   // Not found as a standard register?
   31742   if (!Res.second) {
   31743     // Map "st(N)" (N = 0..7) onto the corresponding FP-stack register.
   31744     if (Constraint.size() == 7 && Constraint[0] == '{' &&
   31745         tolower(Constraint[1]) == 's' &&
   31746         tolower(Constraint[2]) == 't' &&
   31747         Constraint[3] == '(' &&
   31748         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
   31749         Constraint[5] == ')' &&
   31750         Constraint[6] == '}') {
   31751 
   31752       Res.first = X86::FP0+Constraint[4]-'0';
   31753       Res.second = &X86::RFP80RegClass;
   31754       return Res;
   31755     }
   31756 
   31757     // GCC allows "st(0)" to be called just plain "st".
   31758     if (StringRef("{st}").equals_lower(Constraint)) {
   31759       Res.first = X86::FP0;
   31760       Res.second = &X86::RFP80RegClass;
   31761       return Res;
   31762     }
   31763 
   31764     // flags -> EFLAGS
   31765     if (StringRef("{flags}").equals_lower(Constraint)) {
   31766       Res.first = X86::EFLAGS;
   31767       Res.second = &X86::CCRRegClass;
   31768       return Res;
   31769     }
   31770 
   31771     // 'A' means EAX + EDX.
   31772     if (Constraint == "A") {
   31773       Res.first = X86::EAX;
   31774       Res.second = &X86::GR32_ADRegClass;
   31775       return Res;
   31776     }
   31777     return Res;
   31778   }
   31779 
   31780   // Otherwise, check to see if this is a register class of the wrong value
   31781   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
   31782   // turn into {ax},{dx}.
   31783   // MVT::Other is used to specify clobber names.
   31784   if (Res.second->hasType(VT) || VT == MVT::Other)
   31785     return Res;   // Correct type already, nothing to do.
   31786 
   31787   // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
   31788   // return "eax". This should even work for things like getting 64bit integer
   31789   // registers when given an f64 type.
   31790   const TargetRegisterClass *Class = Res.second;
   31791   // The generic code will match the first register class that contains the
   31792   // given register. Thus, based on the ordering of the tablegened file,
   31793   // the "plain" GR classes might not come first.
   31794   // Therefore, use a helper method.
   31795   if (isGRClass(*Class)) {
   31796     unsigned Size = VT.getSizeInBits();
   31797     if (Size == 1) Size = 8;
   31798     unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
   31799     if (DestReg > 0) {
   31800       Res.first = DestReg;
   31801       Res.second = Size == 8 ? &X86::GR8RegClass
   31802                  : Size == 16 ? &X86::GR16RegClass
   31803                  : Size == 32 ? &X86::GR32RegClass
   31804                  : &X86::GR64RegClass;
   31805       assert(Res.second->contains(Res.first) && "Register in register class");
   31806     } else {
   31807       // No register found/type mismatch.
   31808       Res.first = 0;
   31809       Res.second = nullptr;
   31810     }
   31811   } else if (isFRClass(*Class)) {
   31812     // Handle references to XMM physical registers that got mapped into the
   31813     // wrong class.  This can happen with constraints like {xmm0} where the
   31814     // target independent register mapper will just pick the first match it can
   31815     // find, ignoring the required type.
   31816 
   31817     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
   31818     if (VT == MVT::f32 || VT == MVT::i32)
   31819       Res.second = &X86::FR32RegClass;
   31820     else if (VT == MVT::f64 || VT == MVT::i64)
   31821       Res.second = &X86::FR64RegClass;
   31822     else if (X86::VR128RegClass.hasType(VT))
   31823       Res.second = &X86::VR128RegClass;
   31824     else if (X86::VR256RegClass.hasType(VT))
   31825       Res.second = &X86::VR256RegClass;
   31826     else if (X86::VR512RegClass.hasType(VT))
   31827       Res.second = &X86::VR512RegClass;
   31828     else {
   31829       // Type mismatch and not a clobber: return an error.
   31830       Res.first = 0;
   31831       Res.second = nullptr;
   31832     }
   31833   }
   31834 
   31835   return Res;
   31836 }
   31837 
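         /// Return the cost of using addressing mode AM for a memory access of type
         /// Ty: 0 if it is free, 1 if it needs a second (index) register, and -1 if
         /// AM is not legal at all.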
   31838 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
   31839                                             const AddrMode &AM, Type *Ty,
   31840                                             unsigned AS) const {
   31841   // Scaling factors are not free at all.
   31842   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
   31843   // will take 2 allocations in the out of order engine instead of 1
   31844   // for plain addressing mode, i.e. inst (reg1).
   31845   // E.g.,
   31846   // vaddps (%rsi,%rdx), %ymm0, %ymm1
   31847   // Requires two allocations (one for the load, one for the computation)
   31848   // whereas:
   31849   // vaddps (%rsi), %ymm0, %ymm1
   31850   // Requires just 1 allocation, i.e., freeing allocations for other operations
   31851   // and having less micro operations to execute.
   31852   //
   31853   // For some X86 architectures, this is even worse because for instance for
   31854   // stores, the complex addressing mode forces the instruction to use the
   31855   // "load" ports instead of the dedicated "store" port.
   31856   // E.g., on Haswell:
   31857   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
   31858   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
   31859   if (isLegalAddressingMode(DL, AM, Ty, AS))
   31860     // AM.Scale is non-zero only when a second (index) register is used, so
   31861     // account for one extra allocation in that case.
   31862     return AM.Scale != 0;
   31863   return -1;
   31864 }
   31865 
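         /// Return true if an integer div/rem should be kept as a div instruction
         /// rather than expanded (e.g. into a multiply/shift sequence for constant
         /// divisors); see the size-vs-speed reasoning below.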
   31866 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
   31867   // Integer division on x86 is expensive. However, when aggressively optimizing
   31868   // for code size, we prefer to use a div instruction, as it is usually smaller
   31869   // than the alternative sequence.
   31870   // The exception to this is vector division. Since x86 doesn't have vector
   31871   // integer division, leaving the division as-is is a loss even in terms of
   31872   // size, because it will have to be scalarized, while the alternative code
   31873   // sequence can be performed in vector form.
   31874   bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
   31875                                    Attribute::MinSize);
   31876   return OptSize && !VT.isVector();
   31877 }
   31878 
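         /// Mark this function as using "split CSR": its callee-saved registers are
         /// preserved via the explicit copies inserted by insertCopiesSplitCSR below
         /// rather than the usual prologue/epilogue spills. Only done in 64-bit mode.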
   31879 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
   31880   if (!Subtarget.is64Bit())
   31881     return;
   31882 
   31883   // Update IsSplitCSR in X86MachineFunctionInfo.
   31884   X86MachineFunctionInfo *AFI =
   31885     Entry->getParent()->getInfo<X86MachineFunctionInfo>();
   31886   AFI->setIsSplitCSR(true);
   31887 }
   31888 
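         /// For split-CSR functions (e.g. under the CXX_FAST_TLS calling convention),
         /// copy each callee-saved register returned by getCalleeSavedRegsViaCopy into
         /// a fresh virtual register at the function entry and copy it back right
         /// before every return.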
   31889 void X86TargetLowering::insertCopiesSplitCSR(
   31890     MachineBasicBlock *Entry,
   31891     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
   31892   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
   31893   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
   31894   if (!IStart)
   31895     return;
   31896 
   31897   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   31898   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
   31899   MachineBasicBlock::iterator MBBI = Entry->begin();
   31900   for (const MCPhysReg *I = IStart; *I; ++I) {
   31901     const TargetRegisterClass *RC = nullptr;
   31902     if (X86::GR64RegClass.contains(*I))
   31903       RC = &X86::GR64RegClass;
   31904     else
   31905       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
   31906 
   31907     unsigned NewVR = MRI->createVirtualRegister(RC);
   31908     // Create copy from CSR to a virtual register.
   31909     // FIXME: this currently does not emit CFI pseudo-instructions; it works
   31910     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
   31911     // nounwind. If we want to generalize this later, we may need to emit
   31912     // CFI pseudo-instructions.
   31913     assert(Entry->getParent()->getFunction()->hasFnAttribute(
   31914                Attribute::NoUnwind) &&
   31915            "Function should be nounwind in insertCopiesSplitCSR!");
   31916     Entry->addLiveIn(*I);
   31917     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
   31918         .addReg(*I);
   31919 
   31920     // Insert the copy-back instructions right before the terminator.
   31921     for (auto *Exit : Exits)
   31922       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
   31923               TII->get(TargetOpcode::COPY), *I)
   31924           .addReg(NewVR);
   31925   }
   31926 }
   31927