//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<bool> ExperimentalVectorShuffleLowering(
    "x86-experimental-vector-shuffle-lowering", cl::init(false),
    cl::desc("Enable an experimental vector shuffle lowering code path."),
    cl::Hidden);

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                SelectionDAG &DAG, SDLoc dl,
                                unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant vectorWidth bits and generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
                               * ElemsPerChunk);
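  // For example, extracting a 128-bit chunk around element 5 of a v8i32
  // gives ElemsPerChunk == 128 / 32 == 4 and
  // NormalizedIdxVal == ((5 * 32) / 128) * 4 == 4: the index is rounded
  // down to the start of the 128-bit chunk containing element 5.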

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
                                    ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                     VecIdx);
}

/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That
/// makes lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
}

static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                               unsigned IdxVal, SelectionDAG &DAG,
                               SDLoc dl, unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF leaves Result unchanged.
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                     VecIdx);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  SDLoc dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  SDLoc dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}

/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTORS nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
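
// For example, a lowering routine that has built the two v4i32 halves of a
// v8i32 value separately can glue them together like this (illustrative use
// only; Lo and Hi are placeholders):
//
//   SDValue Res = Concat128BitVectors(Lo, Hi, MVT::v8i32, 8, DAG, dl);
//
// which matches a VINSERTF128 of Hi into the upper half of Lo without
// creating a CONCAT_VECTORS of two BUILD_VECTORs.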

static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}

static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
  if (TT.isOSBinFormatMachO()) {
    if (TT.getArch() == Triple::x86_64)
      return new X86_64MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (TT.isOSLinux())
    return new X86LinuxTargetObjectFile();
  if (TT.isOSBinFormatELF())
    return new TargetLoweringObjectFileELF();
  if (TT.isKnownWindowsMSVCEnvironment())
    return new X86WindowsTargetObjectFile();
  if (TT.isOSBinFormatCOFF())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

// FIXME: This should stop caching the target machine as soon as
// we can remove resetOperationActions et al.
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  TD = getDataLayout();

  resetOperationActions();
}

void X86TargetLowering::resetOperationActions() {
  const TargetMachine &TM = getTargetMachine();
  static bool FirstTimeThrough = true;

  // If none of the target options have changed, then we don't need to reset
  // the operation actions.
  if (!FirstTimeThrough && TO == TM.Options) return;

  if (!FirstTimeThrough) {
    // Reinitialize the actions.
    initActions();
  }
  // Clear the flag unconditionally; leaving it inside the branch above would
  // mean it never flips, so the actions would never be reinitialized.
  FirstTimeThrough = false;

  TO = TM.Options;

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird; it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
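  // Concretely: scalar compares materialize as a SETcc writing 0 or 1 into
  // an i8 register, while vector compares (PCMPEQ/PCMPGT and friends)
  // produce all-zeros or all-ones lanes, which is exactly the
  // ZeroOrNegativeOneBooleanContent convention declared above.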

  // For 64-bit targets use the ILP scheduler, since we have so many
  // registers; for 32-bit code use register-pressure-specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2
  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
    addBypassSlowDiv(32, 8);
    if (Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }
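  // The bypass above tells codegen-prepare to guard each 32-bit (and, on
  // 64-bit targets, each 64-bit) divide with a cheap runtime test; when both
  // operands fit in 8 (resp. 16) bits, it issues the narrower and much
  // faster DIV instead, which pays off on Atom's slow divider.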

  if (Subtarget->isTargetKnownWindowsMSVC()) {
    // Set up Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetWindowsGNU()) {
    // The MS runtime is weird: it exports _setjmp, but plain longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
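  // This is because UCOMISS/UCOMISD signal "unordered" by setting ZF, PF and
  // CF together, so "ordered equal" needs ZF == 1 && PF == 0 and "unordered
  // not equal" needs ZF == 0 || PF == 1; neither maps onto a single SETcc,
  // so each gets split into two flag tests plus an AND/OR.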

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
  }

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
  }

  // TODO: When we have SSE, these could be more efficient by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
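  // For example, given
  //
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  //
  // both operations legalize to SDIVREM nodes that CSE into one node, and
  // that single node selects to one IDIV, which already yields the quotient
  // in EAX and the remainder in EDX.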

  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);

  // Promote the i8 variants and force them up to i32, which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
  }
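  // The split above mirrors the hardware: BSF leaves its destination
  // undefined when the source is zero, so without BMI, CTTZ is custom
  // lowered to a BSF guarded by a CMOV supplying the operand width for zero
  // inputs. With BMI, TZCNT is defined to return the operand width at zero,
  // so plain CTTZ is fine and CTTZ_ZERO_UNDEF simply expands to it.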

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
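  // Same story for leading zeros: BSR is undefined at zero and counts from
  // the other end (needing an extra XOR to turn a bit index into a zero
  // count), so targets without LZCNT get custom lowering here as well.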

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
    setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);

  if (!Subtarget->hasMOVBE())
    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);

  // These should be promoted to a larger select, which is supported.
  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
  }
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
  // handling; they are a lightweight setjmp/longjmp replacement to support
  // continuations, user-level threading, and so on. As a result, no other
  // SjLj exception interfaces are implemented, so please don't build your
  // own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
  }
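  // On 32-bit targets an i64 shift legalizes to these *_PARTS nodes, which
  // shift a lo/hi register pair as a unit; custom lowering lets them match
  // the double-shift instructions, e.g. for a left shift by a small
  // constant c (with lo in EAX and hi in EDX):
  //
  //   shld $c, %eax, %edx   // hi = (hi << c) | (lo >> (32 - c))
  //   shl  $c, %eax         // lo <<= c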

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);

  // Expand certain atomics
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }
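  // Notes on the custom atomics: ATOMIC_LOAD_SUB has no LOCK-prefixed
  // fetch-and-sub form, so it can be lowered by negating the operand and
  // reusing the atomic-add path (LOCK XADD when the old value is needed),
  // and a seq_cst ATOMIC_STORE is implemented with XCHG to get the implicit
  // full fence.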

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    // TargetInfo::X86_64ABIBuiltinVaList
    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
  } else {
    // TargetInfo::CharPtrBuiltinVaList
    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                     MVT::i64 : MVT::i32, Custom);

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT VT = (MVT::SimpleValueType)i;
    setOperationAction(ISD::ADD , VT, Expand);
    setOperationAction(ISD::SUB , VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL , VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA,  VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction(VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx are supported; everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean that XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND,    VT, Promote);
      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
      setOperationAction(ISD::OR,     VT, Promote);
      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
      setOperationAction(ISD::XOR,    VT, Promote);
      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   VT, Promote);
      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    }
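    // Net effect: a v4i32 AND, for instance, is bitcast to v2i64, performed
    // there, and bitcast back, so only v2i64 patterns are needed even though
    // a single PAND serves every 128-bit integer element type.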

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);

    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);

    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
   1089     // There is no BLENDI for byte vectors, so there is nothing to be gained
   1090     // from custom lowering v16i8 vselects; leave them Legal for now.
   1091     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
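            // Left Legal, such a vselect can be matched directly to SSE4.1's
            // variable byte blend, e.g. (vselect m:v16i8, a, b) -> PBLENDVB,
            // which picks each byte by the top bit of the corresponding mask byte.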
   1092 
   1093     // i8 and i16 vectors are custom because the source register and source
   1094     // memory operand types are not the same width.  f32 vectors are custom
   1095     // since the immediate controlling the insert encodes additional
   1096     // information.
   1097     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
   1098     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
   1099     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
   1100     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
   1101 
   1102     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
   1103     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
   1104     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
   1105     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
   1106 
    1107     // FIXME: these should be Legal, but that's only for the case where
    1108     // the index is constant.  For now custom expand to deal with that.
   1109     if (Subtarget->is64Bit()) {
   1110       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
   1111       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
   1112     }
   1113   }
   1114 
   1115   if (Subtarget->hasSSE2()) {
   1116     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
   1117     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
   1118 
   1119     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
   1120     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
   1121 
   1122     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
   1123     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
   1124 
   1125     // In the custom shift lowering, cases that are legal on AVX2 will be
   1126     // recognized.
   1127     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
   1128     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
   1129 
   1130     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
   1131     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
   1132 
   1133     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
   1134   }
   1135 
   1136   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
   1137     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
   1138     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
   1139     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
   1140     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
   1141     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
   1142     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
   1143 
   1144     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
   1145     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
   1146     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
   1147 
   1148     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
   1149     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
   1150     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
   1151     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
   1152     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
   1153     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
   1154     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
   1155     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
   1156     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
   1157     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
   1158     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
   1159     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
   1160 
   1161     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
   1162     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
   1163     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
   1164     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
   1165     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
   1166     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
   1167     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
   1168     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
   1169     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
   1170     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
   1171     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
   1172     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
   1173 
   1174     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
   1175     // even though v8i16 is a legal type.
   1176     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
   1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
   1178     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
   1179 
   1180     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
   1181     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1182     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
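            // Sketch of the fp_to_int promotion above (illustrative): the
            // legalizer widens the result and truncates back, roughly
            //   (v8i16 (fp_to_sint x:v8f32))
            //     -> (truncate (v8i32 (fp_to_sint x:v8f32)))
            // where the v8i32 form is Legal (a single AVX cvttps2dq).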
   1183 
   1184     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
   1185     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
   1186 
   1187     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
   1188 
   1189     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
   1190     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
   1191 
   1192     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
   1193     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
   1194 
   1195     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
   1196     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
   1197 
   1198     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
   1199     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
   1200     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
   1201     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
   1202 
   1203     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1204     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1205     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1206 
   1207     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
   1208     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
   1209     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
   1210     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
   1211 
   1212     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
   1213     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
   1214     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
   1215     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
   1216     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
   1217     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
   1218     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
   1219     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
   1220     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
   1221     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
   1222     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
   1223     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
   1224 
   1225     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
   1226       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
   1227       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
   1228       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
   1229       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
   1230       setOperationAction(ISD::FMA,             MVT::f32, Legal);
   1231       setOperationAction(ISD::FMA,             MVT::f64, Legal);
   1232     }
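            // With FMA Legal, a mul+add pair contracted into (fma a, b, c)
            // selects straight to a fused instruction, typically e.g.
            // VFMADD213PS for v8f32 (the exact form is picked during selection).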
   1233 
   1234     if (Subtarget->hasInt256()) {
   1235       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
   1236       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
   1237       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
   1238       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
   1239 
   1240       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
   1241       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
   1242       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
   1243       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
   1244 
   1245       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1246       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
   1247       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
   1248       // Don't lower v32i8 because there is no 128-bit byte mul
   1249 
   1250       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
   1251       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
   1252       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
   1253       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
   1254 
   1255       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
   1256       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1257     } else {
   1258       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
   1259       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
   1260       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
   1261       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
   1262 
   1263       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
   1264       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
   1265       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
   1266       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
   1267 
   1268       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1269       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
   1270       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
   1271       // Don't lower v32i8 because there is no 128-bit byte mul
   1272     }
   1273 
   1274     // In the custom shift lowering, cases that are legal on AVX2 will be
   1275     // recognized.
   1276     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
   1277     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
   1278 
   1279     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
   1280     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
   1281 
   1282     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
   1283 
   1284     // Custom lower several nodes for 256-bit types.
   1285     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
   1286              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1287       MVT VT = (MVT::SimpleValueType)i;
   1288 
   1289       // Extract subvector is special because the value type
   1290       // (result) is 128-bit but the source is 256-bit wide.
   1291       if (VT.is128BitVector())
   1292         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1293 
   1294       // Do not attempt to custom lower other non-256-bit vectors
   1295       if (!VT.is256BitVector())
   1296         continue;
   1297 
   1298       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1299       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1300       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1301       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1302       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
   1303       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
   1304       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
   1305     }
   1306 
   1307     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
   1308     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
   1309       MVT VT = (MVT::SimpleValueType)i;
   1310 
   1311       // Do not attempt to promote non-256-bit vectors
   1312       if (!VT.is256BitVector())
   1313         continue;
   1314 
   1315       setOperationAction(ISD::AND,    VT, Promote);
   1316       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
   1317       setOperationAction(ISD::OR,     VT, Promote);
   1318       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
   1319       setOperationAction(ISD::XOR,    VT, Promote);
   1320       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
   1321       setOperationAction(ISD::LOAD,   VT, Promote);
   1322       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
   1323       setOperationAction(ISD::SELECT, VT, Promote);
   1324       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
   1325     }
   1326   }
   1327 
   1328   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
   1329     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
   1330     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
   1331     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
   1332     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
   1333 
   1334     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
   1335     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
   1336     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
   1337 
   1338     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
   1339     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
   1340     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
   1341     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
   1342     setOperationAction(ISD::AND,                MVT::i1,    Legal);
   1343     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
   1344     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
   1345     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
   1346     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
   1347     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
   1348     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
   1349 
   1350     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
   1351     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
   1352     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
   1353     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
   1354     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
   1355     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
   1356 
   1357     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
   1358     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
   1359     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
   1360     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
   1361     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
   1362     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
   1363     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
   1364     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
   1365 
   1366     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
   1367     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
   1368     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
   1369     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
   1370     if (Subtarget->is64Bit()) {
   1371       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
   1372       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
   1373       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
   1374       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
   1375     }
   1376     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
   1377     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
   1378     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
   1379     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
   1380     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
   1381     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
   1382     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
   1383     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
   1384     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
   1385     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
   1386 
   1387     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
   1388     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
   1389     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
   1390     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
   1391     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
   1392     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
   1393     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
   1394     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
   1395     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
   1396     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
   1397     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
   1398     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
   1399     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
   1400 
   1401     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
   1402     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
   1403     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
   1404     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
   1405     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
   1406     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
   1407 
   1408     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
   1409     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
   1410 
   1411     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
   1412 
   1413     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
   1414     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
   1415     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
   1416     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
   1417     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
   1418     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
   1419     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
   1420     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
   1421     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
   1422 
   1423     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
   1424     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
   1425 
   1426     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
   1427     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
   1428 
   1429     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
   1430 
   1431     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
   1432     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
   1433 
   1434     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
   1435     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
   1436 
   1437     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
   1438     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
   1439 
   1440     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
   1441     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
   1442     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
   1443     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
   1444     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
   1445     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
   1446 
   1447     if (Subtarget->hasCDI()) {
   1448       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
   1449       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
   1450     }
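            // With CDI, e.g. (ctlz x:v16i32) selects directly to the native
            // leading-zero count VPLZCNTD rather than a scalar expansion.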
   1451 
   1452     // Custom lower several nodes.
   1453     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
   1454              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1455       MVT VT = (MVT::SimpleValueType)i;
   1456 
   1457       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   1458       // Extract subvector is special because the value type
   1459       // (result) is 256/128-bit but the source is 512-bit wide.
   1460       if (VT.is128BitVector() || VT.is256BitVector())
   1461         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1462 
   1463       if (VT.getVectorElementType() == MVT::i1)
   1464         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
   1465 
   1466       // Do not attempt to custom lower other non-512-bit vectors
   1467       if (!VT.is512BitVector())
   1468         continue;
   1469 
   1470       if (EltSize >= 32) {
   1471         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
   1472         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
   1473         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1474         setOperationAction(ISD::VSELECT,             VT, Legal);
   1475         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
   1476         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
   1477         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
   1478       }
   1479     }
   1480     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
   1481       MVT VT = (MVT::SimpleValueType)i;
   1482 
   1483       // Do not attempt to promote non-512-bit vectors
   1484       if (!VT.is512BitVector())
   1485         continue;
   1486 
   1487       setOperationAction(ISD::SELECT, VT, Promote);
   1488       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
   1489     }
   1490   } // has AVX-512
   1491 
   1492   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
   1493   // of these nodes with custom code.
   1494   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
   1495            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
   1496     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
   1497                        Custom);
   1498   }
   1499 
   1500   // We want to custom lower some of our intrinsics.
   1501   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1502   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   1503   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   1504   if (!Subtarget->is64Bit())
   1505     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
   1506 
   1507   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1508   // handle type legalization for these operations here.
   1509   //
   1510   // FIXME: We really should do custom legalization for addition and
   1511   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1512   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1513   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
   1514     // Add/Sub/Mul with overflow operations are custom lowered.
   1515     MVT VT = IntVTs[i];
   1516     setOperationAction(ISD::SADDO, VT, Custom);
   1517     setOperationAction(ISD::UADDO, VT, Custom);
   1518     setOperationAction(ISD::SSUBO, VT, Custom);
   1519     setOperationAction(ISD::USUBO, VT, Custom);
   1520     setOperationAction(ISD::SMULO, VT, Custom);
   1521     setOperationAction(ISD::UMULO, VT, Custom);
   1522   }
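          // Rough sketch of the custom lowering: (saddo i32 %a, %b) becomes an
          // X86 add that also defines EFLAGS, with the overflow result read back
          // from the O flag, e.g.
          //   addl %ecx, %eax
          //   seto %dl
          // rather than the generic expanded sign-check sequence.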
   1523 
   1524   // There are no 8-bit 3-address imul/mul instructions
   1525   setOperationAction(ISD::SMULO, MVT::i8, Expand);
   1526   setOperationAction(ISD::UMULO, MVT::i8, Expand);
   1527 
   1528   if (!Subtarget->is64Bit()) {
   1529     // These libcalls are not available in 32-bit mode.
   1530     setLibcallName(RTLIB::SHL_I128, nullptr);
   1531     setLibcallName(RTLIB::SRL_I128, nullptr);
   1532     setLibcallName(RTLIB::SRA_I128, nullptr);
   1533   }
   1534 
   1535   // Combine sin / cos into one node or libcall if possible.
   1536   if (Subtarget->hasSinCos()) {
   1537     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
   1538     setLibcallName(RTLIB::SINCOS_F64, "sincos");
   1539     if (Subtarget->isTargetDarwin()) {
   1540       // For MacOSX, we don't want the normal expansion of a libcall to
   1541       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
   1542       // traffic.
   1543       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
   1544       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   1545     }
   1546   }
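          // __sincos_stret computes sine and cosine in one call and returns both
          // results together (e.g. packed in registers), avoiding the stores and
          // reloads implied by the sincos(x, &s, &c) interface.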
   1547 
   1548   if (Subtarget->isTargetWin64()) {
   1549     setOperationAction(ISD::SDIV, MVT::i128, Custom);
   1550     setOperationAction(ISD::UDIV, MVT::i128, Custom);
   1551     setOperationAction(ISD::SREM, MVT::i128, Custom);
   1552     setOperationAction(ISD::UREM, MVT::i128, Custom);
   1553     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
   1554     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
   1555   }
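          // Sketch of the intent (an assumption, not spelled out in this file):
          // e.g. an i128 sdiv becomes a call to the usual libcall __divti3,
          // with the i128 operands passed by reference as the Win64 ABI requires.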
   1556 
   1557   // We have target-specific DAG combine patterns for the following nodes:
   1558   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1559   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1560   setTargetDAGCombine(ISD::VSELECT);
   1561   setTargetDAGCombine(ISD::SELECT);
   1562   setTargetDAGCombine(ISD::SHL);
   1563   setTargetDAGCombine(ISD::SRA);
   1564   setTargetDAGCombine(ISD::SRL);
   1565   setTargetDAGCombine(ISD::OR);
   1566   setTargetDAGCombine(ISD::AND);
   1567   setTargetDAGCombine(ISD::ADD);
   1568   setTargetDAGCombine(ISD::FADD);
   1569   setTargetDAGCombine(ISD::FSUB);
   1570   setTargetDAGCombine(ISD::FMA);
   1571   setTargetDAGCombine(ISD::SUB);
   1572   setTargetDAGCombine(ISD::LOAD);
   1573   setTargetDAGCombine(ISD::STORE);
   1574   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1575   setTargetDAGCombine(ISD::ANY_EXTEND);
   1576   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1577   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   1578   setTargetDAGCombine(ISD::TRUNCATE);
   1579   setTargetDAGCombine(ISD::SINT_TO_FP);
   1580   setTargetDAGCombine(ISD::SETCC);
   1581   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   1582   setTargetDAGCombine(ISD::BUILD_VECTOR);
   1583   if (Subtarget->is64Bit())
   1584     setTargetDAGCombine(ISD::MUL);
   1585   setTargetDAGCombine(ISD::XOR);
   1586 
   1587   computeRegisterProperties();
   1588 
   1589   // On Darwin, -Os means optimize for size without hurting performance, so
   1590   // do not reduce the limit.
   1591   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1592   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
   1593   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1594   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1595   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1596   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1597   setPrefLoopAlignment(4); // 2^4 bytes.
   1598 
   1599   // Predictable cmovs don't hurt on Atom because it's in-order.
   1600   PredictableSelectIsExpensive = !Subtarget->isAtom();
   1601 
   1602   setPrefFunctionAlignment(4); // 2^4 bytes.
   1603 }
   1604 
   1605 TargetLoweringBase::LegalizeTypeAction
   1606 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
   1607   if (ExperimentalVectorWideningLegalization &&
   1608       VT.getVectorNumElements() != 1 &&
   1609       VT.getVectorElementType().getSimpleVT() != MVT::i1)
   1610     return TypeWidenVector;
   1611 
   1612   return TargetLoweringBase::getPreferredVectorAction(VT);
   1613 }
   1614 
   1615 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   1616   if (!VT.isVector())
   1617     return Subtarget->hasAVX512() ? MVT::i1 : MVT::i8;
   1618 
   1619   if (Subtarget->hasAVX512())
   1620     switch (VT.getVectorNumElements()) {
   1621     case  8: return MVT::v8i1;
   1622     case 16: return MVT::v16i1;
   1623     }
   1624 
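          // e.g. without AVX-512, a v4f32 compare yields a v4i32 all-ones /
          // all-zeros mask, matching what CMPPS produces.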
   1625   return VT.changeVectorElementTypeToInteger();
   1626 }
   1627 
   1628 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
   1629 /// the desired ByVal argument alignment.
   1630 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1631   if (MaxAlign == 16)
   1632     return;
   1633   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1634     if (VTy->getBitWidth() == 128)
   1635       MaxAlign = 16;
   1636   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1637     unsigned EltAlign = 0;
   1638     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1639     if (EltAlign > MaxAlign)
   1640       MaxAlign = EltAlign;
   1641   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1642     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
   1643       unsigned EltAlign = 0;
   1644       getMaxByValAlign(STy->getElementType(i), EltAlign);
   1645       if (EltAlign > MaxAlign)
   1646         MaxAlign = EltAlign;
   1647       if (MaxAlign == 16)
   1648         break;
   1649     }
   1650   }
   1651 }
   1652 
   1653 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
   1654 /// function arguments in the caller parameter area. For X86, aggregates
   1655 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1656 /// are at 4-byte boundaries.
   1657 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   1658   if (Subtarget->is64Bit()) {
   1659     // Max of 8 and alignment of type.
   1660     unsigned TyAlign = TD->getABITypeAlignment(Ty);
   1661     if (TyAlign > 8)
   1662       return TyAlign;
   1663     return 8;
   1664   }
   1665 
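          // e.g. on x86-32 with SSE, a struct containing a <4 x float> member is
          // placed at a 16-byte boundary via getMaxByValAlign; everything else
          // stays at 4 bytes.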
   1666   unsigned Align = 4;
   1667   if (Subtarget->hasSSE1())
   1668     getMaxByValAlign(Ty, Align);
   1669   return Align;
   1670 }
   1671 
   1672 /// getOptimalMemOpType - Returns the target specific optimal type for load
   1673 /// and store operations as a result of memset, memcpy, and memmove
   1674 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
   1675 /// constraint, so there is no need to check it. Similarly, if SrcAlign is
   1676 /// zero, there is no need to check it against an alignment requirement,
   1677 /// probably because the source does not need to be loaded. If 'IsMemset' is
   1678 /// true, this is expanding a memset. If 'ZeroMemset' is true, it is a memset
   1679 /// of zero. 'MemcpyStrSrc' indicates whether the memcpy source is constant,
   1680 /// so it does not need to be loaded.
   1681 /// It returns EVT::Other if the type should be determined using generic
   1682 /// target-independent logic.
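        /// For example (illustrative): under these rules, a 32-byte memcpy on an
        /// AVX2 target can use v8i32 stores, while an 8-byte memcpy on 32-bit x86
        /// with SSE2 may use f64 when the source is not a constant string.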
   1683 EVT
   1684 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1685                                        unsigned DstAlign, unsigned SrcAlign,
   1686                                        bool IsMemset, bool ZeroMemset,
   1687                                        bool MemcpyStrSrc,
   1688                                        MachineFunction &MF) const {
   1689   const Function *F = MF.getFunction();
   1690   if ((!IsMemset || ZeroMemset) &&
   1691       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
   1692                                        Attribute::NoImplicitFloat)) {
   1693     if (Size >= 16 &&
   1694         (Subtarget->isUnalignedMemAccessFast() ||
   1695          ((DstAlign == 0 || DstAlign >= 16) &&
   1696           (SrcAlign == 0 || SrcAlign >= 16)))) {
   1697       if (Size >= 32) {
   1698         if (Subtarget->hasInt256())
   1699           return MVT::v8i32;
   1700         if (Subtarget->hasFp256())
   1701           return MVT::v8f32;
   1702       }
   1703       if (Subtarget->hasSSE2())
   1704         return MVT::v4i32;
   1705       if (Subtarget->hasSSE1())
   1706         return MVT::v4f32;
   1707     } else if (!MemcpyStrSrc && Size >= 8 &&
   1708                !Subtarget->is64Bit() &&
   1709                Subtarget->hasSSE2()) {
   1710       // Do not use f64 to lower memcpy if the source is a string constant.
   1711       // It's better to use i32 to avoid the loads.
   1712       return MVT::f64;
   1713     }
   1714   }
   1715   if (Subtarget->is64Bit() && Size >= 8)
   1716     return MVT::i64;
   1717   return MVT::i32;
   1718 }
   1719 
   1720 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   1721   if (VT == MVT::f32)
   1722     return X86ScalarSSEf32;
   1723   else if (VT == MVT::f64)
   1724     return X86ScalarSSEf64;
   1725   return true;
   1726 }
   1727 
   1728 bool
   1729 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
   1730                                                  unsigned,
   1731                                                  bool *Fast) const {
   1732   if (Fast)
   1733     *Fast = Subtarget->isUnalignedMemAccessFast();
   1734   return true;
   1735 }
   1736 
   1737 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
   1738 /// current function.  The returned value is a member of the
   1739 /// MachineJumpTableInfo::JTEntryKind enum.
   1740 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1741   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1742   // symbol.
   1743   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1744       Subtarget->isPICStyleGOT())
   1745     return MachineJumpTableInfo::EK_Custom32;
   1746 
   1747   // Otherwise, use the normal jump table encoding heuristics.
   1748   return TargetLowering::getJumpTableEncoding();
   1749 }
   1750 
   1751 const MCExpr *
   1752 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   1753                                              const MachineBasicBlock *MBB,
   1754                                              unsigned uid, MCContext &Ctx) const {
   1755   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
   1756          Subtarget->isPICStyleGOT());
   1757   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   1758   // entries.
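          // e.g. each table slot then comes out along the lines of
          //   .long .LBB0_3@GOTOFF
          // (the basic-block label name here is illustrative).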
   1759   return MCSymbolRefExpr::Create(MBB->getSymbol(),
   1760                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   1761 }
   1762 
   1763 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
   1764 /// jumptable.
   1765 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1766                                                     SelectionDAG &DAG) const {
   1767   if (!Subtarget->is64Bit())
   1768     // This node doesn't have an SDLoc associated with it, but it is not
   1769     // really the same as a Register.
   1770     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
   1771   return Table;
   1772 }
   1773 
   1774 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
   1775 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
   1776 /// MCExpr.
   1777 const MCExpr *X86TargetLowering::
   1778 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   1779                              MCContext &Ctx) const {
   1780   // X86-64 uses RIP relative addressing based on the jump table label.
   1781   if (Subtarget->isPICStyleRIPRel())
   1782     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   1783 
   1784   // Otherwise, the reference is relative to the PIC base.
   1785   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
   1786 }
   1787 
   1788 // FIXME: Why is this routine here? Move it to RegInfo!
   1789 std::pair<const TargetRegisterClass*, uint8_t>
   1790 X86TargetLowering::findRepresentativeClass(MVT VT) const {
   1791   const TargetRegisterClass *RRC = nullptr;
   1792   uint8_t Cost = 1;
   1793   switch (VT.SimpleTy) {
   1794   default:
   1795     return TargetLowering::findRepresentativeClass(VT);
   1796   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   1797     RRC = Subtarget->is64Bit() ?
   1798       (const TargetRegisterClass*)&X86::GR64RegClass :
   1799       (const TargetRegisterClass*)&X86::GR32RegClass;
   1800     break;
   1801   case MVT::x86mmx:
   1802     RRC = &X86::VR64RegClass;
   1803     break;
   1804   case MVT::f32: case MVT::f64:
   1805   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   1806   case MVT::v4f32: case MVT::v2f64:
   1807   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   1808   case MVT::v4f64:
   1809     RRC = &X86::VR128RegClass;
   1810     break;
   1811   }
   1812   return std::make_pair(RRC, Cost);
   1813 }
   1814 
   1815 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   1816                                                unsigned &Offset) const {
   1817   if (!Subtarget->isTargetLinux())
   1818     return false;
   1819 
   1820   if (Subtarget->is64Bit()) {
   1821     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
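            // The canary load described here typically materializes as, e.g.,
            //   mov %fs:0x28, %rax
            // with the segment expressed through address space 257 (or 256).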
   1822     Offset = 0x28;
   1823     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   1824       AddressSpace = 256;
   1825     else
   1826       AddressSpace = 257;
   1827   } else {
   1828     // %gs:0x14 on i386
   1829     Offset = 0x14;
   1830     AddressSpace = 256;
   1831   }
   1832   return true;
   1833 }
   1834 
   1835 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
   1836                                             unsigned DestAS) const {
   1837   assert(SrcAS != DestAS && "Expected different address spaces!");
   1838 
   1839   return SrcAS < 256 && DestAS < 256;
   1840 }
   1841 
   1842 //===----------------------------------------------------------------------===//
   1843 //               Return Value Calling Convention Implementation
   1844 //===----------------------------------------------------------------------===//
   1845 
   1846 #include "X86GenCallingConv.inc"
   1847 
   1848 bool
   1849 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   1850                                   MachineFunction &MF, bool isVarArg,
   1851                         const SmallVectorImpl<ISD::OutputArg> &Outs,
   1852                         LLVMContext &Context) const {
   1853   SmallVector<CCValAssign, 16> RVLocs;
   1854   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
   1855                  RVLocs, Context);
   1856   return CCInfo.CheckReturn(Outs, RetCC_X86);
   1857 }
   1858 
   1859 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
   1860   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   1861   return ScratchRegs;
   1862 }
   1863 
   1864 SDValue
   1865 X86TargetLowering::LowerReturn(SDValue Chain,
   1866                                CallingConv::ID CallConv, bool isVarArg,
   1867                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1868                                const SmallVectorImpl<SDValue> &OutVals,
   1869                                SDLoc dl, SelectionDAG &DAG) const {
   1870   MachineFunction &MF = DAG.getMachineFunction();
   1871   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1872 
   1873   SmallVector<CCValAssign, 16> RVLocs;
   1874   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
   1875                  RVLocs, *DAG.getContext());
   1876   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   1877 
   1878   SDValue Flag;
   1879   SmallVector<SDValue, 6> RetOps;
   1880   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   1881   // Operand #1 = Bytes To Pop
   1882   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
   1883                    MVT::i16));
   1884 
   1885   // Copy the result values into the output registers.
   1886   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1887     CCValAssign &VA = RVLocs[i];
   1888     assert(VA.isRegLoc() && "Can only return in registers!");
   1889     SDValue ValToCopy = OutVals[i];
   1890     EVT ValVT = ValToCopy.getValueType();
   1891 
   1892     // Promote values to the appropriate types
   1893     if (VA.getLocInfo() == CCValAssign::SExt)
   1894       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1895     else if (VA.getLocInfo() == CCValAssign::ZExt)
   1896       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1897     else if (VA.getLocInfo() == CCValAssign::AExt)
   1898       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1899     else if (VA.getLocInfo() == CCValAssign::BCvt)
   1900       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
   1901 
   1902     assert(VA.getLocInfo() != CCValAssign::FPExt &&
   1903            "Unexpected FP-extend for return value.");
   1904 
   1905     // If this is x86-64, and we disabled SSE, we can't return FP values,
   1906     // or SSE or MMX vectors.
   1907     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   1908          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   1909           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
   1910       report_fatal_error("SSE register return with SSE disabled");
   1911     }
   1912     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   1913     // llvm-gcc has never done it right and no one has noticed, so this
   1914     // should be OK for now.
   1915     if (ValVT == MVT::f64 &&
   1916         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
   1917       report_fatal_error("SSE2 register return with SSE2 disabled");
   1918 
   1919     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   1920     // the RET instruction and handled by the FP Stackifier.
   1921     if (VA.getLocReg() == X86::ST0 ||
   1922         VA.getLocReg() == X86::ST1) {
   1923       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   1924       // change the value to the FP stack register class.
   1925       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   1926         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   1927       RetOps.push_back(ValToCopy);
   1928       // Don't emit a copytoreg.
   1929       continue;
   1930     }
   1931 
   1932     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   1933     // which is returned in RAX / RDX.
   1934     if (Subtarget->is64Bit()) {
   1935       if (ValVT == MVT::x86mmx) {
   1936         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   1937           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
   1938           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   1939                                   ValToCopy);
   1940           // If we don't have SSE2 available, convert to v4f32 so the generated
   1941           // register is legal.
   1942           if (!Subtarget->hasSSE2())
   1943             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
   1944         }
   1945       }
   1946     }
   1947 
   1948     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   1949     Flag = Chain.getValue(1);
   1950     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   1951   }
   1952 
   1953   // The x86-64 ABIs require that for returning structs by value we copy
   1954   // the sret argument into %rax/%eax (depending on ABI) for the return.
   1955   // Win32 requires us to put the sret argument to %eax as well.
   1956   // We saved the argument into a virtual register in the entry block,
   1957   // so now we copy the value out and into %rax/%eax.
   1958   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
   1959       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
   1960     MachineFunction &MF = DAG.getMachineFunction();
   1961     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1962     unsigned Reg = FuncInfo->getSRetReturnReg();
   1963     assert(Reg &&
   1964            "SRetReturnReg should have been set in LowerFormalArguments().");
   1965     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
   1966 
   1967     unsigned RetValReg
   1968         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
   1969           X86::RAX : X86::EAX;
   1970     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
   1971     Flag = Chain.getValue(1);
   1972 
   1973     // RAX/EAX now acts like a return value.
   1974     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
   1975   }
   1976 
   1977   RetOps[0] = Chain;  // Update chain.
   1978 
   1979   // Add the flag if we have it.
   1980   if (Flag.getNode())
   1981     RetOps.push_back(Flag);
   1982 
   1983   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
   1984 }
   1985 
   1986 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   1987   if (N->getNumValues() != 1)
   1988     return false;
   1989   if (!N->hasNUsesOfValue(1, 0))
   1990     return false;
   1991 
   1992   SDValue TCChain = Chain;
   1993   SDNode *Copy = *N->use_begin();
   1994   if (Copy->getOpcode() == ISD::CopyToReg) {
   1995     // If the copy has a glue operand, we conservatively assume it isn't safe to
   1996     // perform a tail call.
   1997     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   1998       return false;
   1999     TCChain = Copy->getOperand(0);
   2000   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   2001     return false;
   2002 
   2003   bool HasRet = false;
   2004   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   2005        UI != UE; ++UI) {
   2006     if (UI->getOpcode() != X86ISD::RET_FLAG)
   2007       return false;
   2008     HasRet = true;
   2009   }
   2010 
   2011   if (!HasRet)
   2012     return false;
   2013 
   2014   Chain = TCChain;
   2015   return true;
   2016 }
   2017 
   2018 MVT
   2019 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
   2020                                             ISD::NodeType ExtendKind) const {
   2021   MVT ReturnMVT;
   2022   // TODO: Is this also valid on 32-bit?
   2023   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
   2024     ReturnMVT = MVT::i8;
   2025   else
   2026     ReturnMVT = MVT::i32;
   2027 
   2028   MVT MinVT = getRegisterType(ReturnMVT);
   2029   return VT.bitsLT(MinVT) ? MinVT : VT;
   2030 }
   2031 
   2032 /// LowerCallResult - Lower the result values of a call into the
   2033 /// appropriate copies out of appropriate physical registers.
   2034 ///
   2035 SDValue
   2036 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   2037                                    CallingConv::ID CallConv, bool isVarArg,
   2038                                    const SmallVectorImpl<ISD::InputArg> &Ins,
   2039                                    SDLoc dl, SelectionDAG &DAG,
   2040                                    SmallVectorImpl<SDValue> &InVals) const {
   2041 
   2042   // Assign locations to each value returned by this call.
   2043   SmallVector<CCValAssign, 16> RVLocs;
   2044   bool Is64Bit = Subtarget->is64Bit();
   2045   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   2046                  DAG.getTarget(), RVLocs, *DAG.getContext());
   2047   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2048 
   2049   // Copy all of the result registers out of their specified physreg.
   2050   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2051     CCValAssign &VA = RVLocs[i];
   2052     EVT CopyVT = VA.getValVT();
   2053 
   2054     // If this is x86-64, and we disabled SSE, we can't return FP values
   2055     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
   2056         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
   2057       report_fatal_error("SSE register return with SSE disabled");
   2058     }
   2059 
   2060     SDValue Val;
   2061 
   2062     // If this is a call to a function that returns an fp value on the floating
   2063     // point stack, we must guarantee the value is popped from the stack, so
   2064     // a CopyFromReg is not good enough - the copy instruction may be eliminated
   2065     // if the return value is not used. We use the FpPOP_RETVAL instruction
   2066     // instead.
   2067     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
   2068       // If we prefer to use the value in xmm registers, copy it out as f80 and
   2069       // use a truncate to move it from fp stack reg to xmm reg.
   2070       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
   2071       SDValue Ops[] = { Chain, InFlag };
   2072       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
   2073                                          MVT::Other, MVT::Glue, Ops), 1);
   2074       Val = Chain.getValue(0);
   2075 
   2076       // Round the f80 to the right size, which also moves it to the appropriate
   2077       // xmm register.
   2078       if (CopyVT != VA.getValVT())
   2079         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   2080                           // This truncation won't change the value.
   2081                           DAG.getIntPtrConstant(1));
   2082     } else {
   2083       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   2084                                  CopyVT, InFlag).getValue(1);
   2085       Val = Chain.getValue(0);
   2086     }
   2087     InFlag = Chain.getValue(2);
   2088     InVals.push_back(Val);
   2089   }
   2090 
   2091   return Chain;
   2092 }
   2093 
   2094 //===----------------------------------------------------------------------===//
   2095 //                C & StdCall & Fast Calling Convention implementation
   2096 //===----------------------------------------------------------------------===//
   2097 //  The StdCall calling convention is the standard for many Windows API
   2098 //  routines. It differs from the C calling convention only slightly: the
   2099 //  callee cleans up the stack instead of the caller, and symbols are
   2100 //  decorated in a particular way. It doesn't support any vector arguments.
   2101 //  For info on fast calling convention see Fast Calling Convention (tail call)
   2102 //  implementation LowerX86_32FastCCCallTo.
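        //  For example, a stdcall function taking 8 bytes of arguments is
        //  decorated as _func@8 (the function name here is illustrative).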
   2103 /// callIsStructReturn - Determines whether a call uses struct return
   2104 /// semantics.
   2105 /// semantics.
   2106 enum StructReturnType {
   2107   NotStructReturn,
   2108   RegStructReturn,
   2109   StackStructReturn
   2110 };
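        // e.g. (IR sketch) a call to 'void @f(%struct.S* sret %out)' classifies
        // as StackStructReturn, while marking the sret argument 'inreg' yields
        // RegStructReturn.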
   2111 static StructReturnType
   2112 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   2113   if (Outs.empty())
   2114     return NotStructReturn;
   2115 
   2116   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   2117   if (!Flags.isSRet())
   2118     return NotStructReturn;
   2119   if (Flags.isInReg())
   2120     return RegStructReturn;
   2121   return StackStructReturn;
   2122 }
   2123 
   2124 /// argsAreStructReturn - Determines whether a function uses struct
   2125 /// return semantics.
   2126 static StructReturnType
   2127 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   2128   if (Ins.empty())
   2129     return NotStructReturn;
   2130 
   2131   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   2132   if (!Flags.isSRet())
   2133     return NotStructReturn;
   2134   if (Flags.isInReg())
   2135     return RegStructReturn;
   2136   return StackStructReturn;
   2137 }
   2138 
   2139 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
   2140 /// specified by "Src" to the address "Dst", with the size and alignment given
   2141 /// by the parameter attribute. The copy will be passed as a byval
   2142 /// function parameter.
   2143 static SDValue
   2144 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   2145                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
   2146                           SDLoc dl) {
   2147   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   2148 
   2149   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   2150                        /*isVolatile*/false, /*AlwaysInline=*/true,
   2151                        MachinePointerInfo(), MachinePointerInfo());
   2152 }
   2153 
   2154 /// IsTailCallConvention - Return true if the calling convention is one that
   2155 /// supports tail call optimization.
   2156 static bool IsTailCallConvention(CallingConv::ID CC) {
   2157   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
   2158           CC == CallingConv::HiPE);
   2159 }
   2160 
   2161 /// \brief Return true if the calling convention is a C calling convention.
   2162 static bool IsCCallConvention(CallingConv::ID CC) {
   2163   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
   2164           CC == CallingConv::X86_64_SysV);
   2165 }
   2166 
   2167 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   2168   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
   2169     return false;
   2170 
   2171   CallSite CS(CI);
   2172   CallingConv::ID CalleeCC = CS.getCallingConv();
   2173   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
   2174     return false;
   2175 
   2176   return true;
   2177 }
   2178 
   2179 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
   2180 /// a tailcall target by changing its ABI.
   2181 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
   2182                                    bool GuaranteedTailCallOpt) {
   2183   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
   2184 }
   2185 
   2186 SDValue
   2187 X86TargetLowering::LowerMemArgument(SDValue Chain,
   2188                                     CallingConv::ID CallConv,
   2189                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2190                                     SDLoc dl, SelectionDAG &DAG,
   2191                                     const CCValAssign &VA,
   2192                                     MachineFrameInfo *MFI,
   2193                                     unsigned i) const {
   2194   // Create the nodes corresponding to a load from this parameter slot.
   2195   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   2196   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
   2197       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   2198   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   2199   EVT ValVT;
   2200 
   2201 // If the value is passed by pointer, we have the address passed instead of
   2202 // the value itself.
   2203   if (VA.getLocInfo() == CCValAssign::Indirect)
   2204     ValVT = VA.getLocVT();
   2205   else
   2206     ValVT = VA.getValVT();
   2207 
   2208 // FIXME: For now, all byval parameter objects are marked mutable. This can
   2209 // be changed with more analysis.
   2210 // In case of tail call optimization, mark all arguments mutable, since they
   2211 // could be overwritten by the lowering of the arguments of a tail call.
   2212   if (Flags.isByVal()) {
   2213     unsigned Bytes = Flags.getByValSize();
   2214     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   2215     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   2216     return DAG.getFrameIndex(FI, getPointerTy());
   2217   } else {
   2218     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   2219                                     VA.getLocMemOffset(), isImmutable);
   2220     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
   2221     return DAG.getLoad(ValVT, dl, Chain, FIN,
   2222                        MachinePointerInfo::getFixedStack(FI),
   2223                        false, false, false, 0);
   2224   }
   2225 }
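        // For example, a non-byval i32 argument at stack offset 4 becomes a fixed
        // 4-byte frame object plus a load from that frame index, whereas a byval
        // argument yields just the frame index (its address) with no load.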
   2226 
   2227 SDValue
   2228 X86TargetLowering::LowerFormalArguments(SDValue Chain,
   2229                                         CallingConv::ID CallConv,
   2230                                         bool isVarArg,
   2231                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   2232                                         SDLoc dl,
   2233                                         SelectionDAG &DAG,
   2234                                         SmallVectorImpl<SDValue> &InVals)
   2235                                           const {
   2236   MachineFunction &MF = DAG.getMachineFunction();
   2237   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2238 
   2239   const Function* Fn = MF.getFunction();
   2240   if (Fn->hasExternalLinkage() &&
   2241       Subtarget->isTargetCygMing() &&
   2242       Fn->getName() == "main")
   2243     FuncInfo->setForceFramePointer(true);
   2244 
   2245   MachineFrameInfo *MFI = MF.getFrameInfo();
   2246   bool Is64Bit = Subtarget->is64Bit();
   2247   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
   2248 
   2249   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2250          "Var args not supported with calling convention fastcc, ghc or hipe");
   2251 
   2252   // Assign locations to all of the incoming arguments.
   2253   SmallVector<CCValAssign, 16> ArgLocs;
   2254   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
   2255                  ArgLocs, *DAG.getContext());
   2256 
   2257   // Allocate shadow area for Win64
   2258   if (IsWin64)
   2259     CCInfo.AllocateStack(32, 8);
   2260 
   2261   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   2262 
   2263   unsigned LastVal = ~0U;
   2264   SDValue ArgValue;
   2265   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2266     CCValAssign &VA = ArgLocs[i];
   2267     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   2268     // places.
   2269     assert(VA.getValNo() != LastVal &&
   2270            "Don't support value assigned to multiple locs yet");
   2271     (void)LastVal;
   2272     LastVal = VA.getValNo();
   2273 
   2274     if (VA.isRegLoc()) {
   2275       EVT RegVT = VA.getLocVT();
   2276       const TargetRegisterClass *RC;
   2277       if (RegVT == MVT::i32)
   2278         RC = &X86::GR32RegClass;
   2279       else if (Is64Bit && RegVT == MVT::i64)
   2280         RC = &X86::GR64RegClass;
   2281       else if (RegVT == MVT::f32)
   2282         RC = &X86::FR32RegClass;
   2283       else if (RegVT == MVT::f64)
   2284         RC = &X86::FR64RegClass;
   2285       else if (RegVT.is512BitVector())
   2286         RC = &X86::VR512RegClass;
   2287       else if (RegVT.is256BitVector())
   2288         RC = &X86::VR256RegClass;
   2289       else if (RegVT.is128BitVector())
   2290         RC = &X86::VR128RegClass;
   2291       else if (RegVT == MVT::x86mmx)
   2292         RC = &X86::VR64RegClass;
   2293       else if (RegVT == MVT::i1)
   2294         RC = &X86::VK1RegClass;
   2295       else if (RegVT == MVT::v8i1)
   2296         RC = &X86::VK8RegClass;
   2297       else if (RegVT == MVT::v16i1)
   2298         RC = &X86::VK16RegClass;
   2299       else
   2300         llvm_unreachable("Unknown argument type!");
   2301 
   2302       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   2303       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   2304 
   2305       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   2306       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   2307       // right size.
   2308       if (VA.getLocInfo() == CCValAssign::SExt)
   2309         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   2310                                DAG.getValueType(VA.getValVT()));
   2311       else if (VA.getLocInfo() == CCValAssign::ZExt)
   2312         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   2313                                DAG.getValueType(VA.getValVT()));
   2314       else if (VA.getLocInfo() == CCValAssign::BCvt)
   2315         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
   2316 
   2317       if (VA.isExtInLoc()) {
   2318         // Handle MMX values passed in XMM regs.
   2319         if (RegVT.isVector())
   2320           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
   2321         else
   2322           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   2323       }
   2324     } else {
   2325       assert(VA.isMemLoc());
   2326       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   2327     }
   2328 
   2329     // If the value is passed via a pointer, do a load.
   2330     if (VA.getLocInfo() == CCValAssign::Indirect)
   2331       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   2332                              MachinePointerInfo(), false, false, false, 0);
   2333 
   2334     InVals.push_back(ArgValue);
   2335   }
   2336 
   2337   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
   2338     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2339       // The x86-64 ABIs require that, when returning a struct by value, we
   2340       // copy the sret argument into %rax/%eax (depending on ABI) for the
   2341       // return. Win32 requires us to put the sret argument in %eax as well.
   2342       // Save the argument into a virtual register so that we can access it
   2343       // from the return points.
   2344       if (Ins[i].Flags.isSRet()) {
   2345         unsigned Reg = FuncInfo->getSRetReturnReg();
   2346         if (!Reg) {
   2347           MVT PtrTy = getPointerTy();
   2348           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
   2349           FuncInfo->setSRetReturnReg(Reg);
   2350         }
   2351         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
   2352         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   2353         break;
   2354       }
   2355     }
   2356   }
   2357 
   2358   unsigned StackSize = CCInfo.getNextStackOffset();
   2359   // Align stack specially for tail calls.
   2360   if (FuncIsMadeTailCallSafe(CallConv,
   2361                              MF.getTarget().Options.GuaranteedTailCallOpt))
   2362     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   2363 
   2364   // If the function takes a variable number of arguments, make a frame index
   2365   // for the start of the first vararg value... for expansion of llvm.va_start.
   2366   if (isVarArg) {
   2367     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   2368                     CallConv != CallingConv::X86_ThisCall)) {
   2369       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
   2370     }
   2371     if (Is64Bit) {
   2372       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
   2373 
   2374       // FIXME: We should really autogenerate these arrays
   2375       static const MCPhysReg GPR64ArgRegsWin64[] = {
   2376         X86::RCX, X86::RDX, X86::R8,  X86::R9
   2377       };
   2378       static const MCPhysReg GPR64ArgRegs64Bit[] = {
   2379         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   2380       };
   2381       static const MCPhysReg XMMArgRegs64Bit[] = {
   2382         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2383         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2384       };
   2385       const MCPhysReg *GPR64ArgRegs;
   2386       unsigned NumXMMRegs = 0;
   2387 
   2388       if (IsWin64) {
   2389         // The XMM registers which might contain var arg parameters are shadowed
   2390         // by their paired GPRs, so we only need to save the GPRs to their home
   2391         // slots.
   2392         TotalNumIntRegs = 4;
   2393         GPR64ArgRegs = GPR64ArgRegsWin64;
   2394       } else {
   2395         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
   2396         GPR64ArgRegs = GPR64ArgRegs64Bit;
   2397 
   2398         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
   2399                                                 TotalNumXMMRegs);
   2400       }
   2401       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
   2402                                                        TotalNumIntRegs);
   2403 
   2404       bool NoImplicitFloatOps = Fn->getAttributes().
   2405         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
   2406       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
   2407              "SSE register cannot be used when SSE is disabled!");
   2408       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
   2409                NoImplicitFloatOps) &&
   2410              "SSE register cannot be used when SSE is disabled!");
   2411       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
   2412           !Subtarget->hasSSE1())
   2413         // Kernel mode asks for SSE to be disabled, so don't push the XMM
   2414         // argument registers on the stack.
   2415         TotalNumXMMRegs = 0;
   2416 
   2417       if (IsWin64) {
   2418         const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
   2419         // Get to the caller-allocated home save location.  Add 8 to account
   2420         // for the return address.
   2421         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   2422         FuncInfo->setRegSaveFrameIndex(
   2423           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   2424         // Fix up the vararg frame index to point at the shadow area (4 x i64).
   2425         if (NumIntRegs < 4)
   2426           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   2427       } else {
   2428         // For X86-64, if there are vararg parameters that are passed via
   2429         // registers, then we must store them to their spots on the stack so
   2430         // they may be loaded by dereferencing the result of va_next.
   2431         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   2432         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
   2433         FuncInfo->setRegSaveFrameIndex(
   2434           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
   2435                                false));
   2436       }
   2437 
   2438       // Store the integer parameter registers.
   2439       SmallVector<SDValue, 8> MemOps;
   2440       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2441                                         getPointerTy());
   2442       unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2443       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
   2444         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
   2445                                   DAG.getIntPtrConstant(Offset));
   2446         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
   2447                                      &X86::GR64RegClass);
   2448         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
   2449         SDValue Store =
   2450           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2451                        MachinePointerInfo::getFixedStack(
   2452                          FuncInfo->getRegSaveFrameIndex(), Offset),
   2453                        false, false, 0);
   2454         MemOps.push_back(Store);
   2455         Offset += 8;
   2456       }
   2457 
   2458       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
   2459         // Now store the XMM (fp + vector) parameter registers.
   2460         SmallVector<SDValue, 11> SaveXMMOps;
   2461         SaveXMMOps.push_back(Chain);
   2462 
   2463         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2464         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
   2465         SaveXMMOps.push_back(ALVal);
   2466 
   2467         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2468                                FuncInfo->getRegSaveFrameIndex()));
   2469         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2470                                FuncInfo->getVarArgsFPOffset()));
   2471 
   2472         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
   2473           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
   2474                                        &X86::VR128RegClass);
   2475           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
   2476           SaveXMMOps.push_back(Val);
   2477         }
   2478         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2479                                      MVT::Other, SaveXMMOps));
   2480       }
   2481 
   2482       if (!MemOps.empty())
   2483         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   2484     }
   2485   }
   2486 
   2487   // Some CCs need callee pop.
   2488   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2489                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2490     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2491   } else {
   2492     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2493     // If this is an sret function, the return should pop the hidden pointer.
   2494     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
   2495         !Subtarget->getTargetTriple().isOSMSVCRT() &&
   2496         argsAreStructReturn(Ins) == StackStructReturn)
   2497       FuncInfo->setBytesToPopOnReturn(4);
   2498   }
   2499 
   2500   if (!Is64Bit) {
   2501     // RegSaveFrameIndex is X86-64 only.
   2502     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   2503     if (CallConv == CallingConv::X86_FastCall ||
   2504         CallConv == CallingConv::X86_ThisCall)
   2505       // fastcall and thiscall functions can't have varargs.
   2506       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2507   }
   2508 
   2509   FuncInfo->setArgumentStackSize(StackSize);
   2510 
   2511   return Chain;
   2512 }
   2513 
   2514 SDValue
   2515 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
   2516                                     SDValue StackPtr, SDValue Arg,
   2517                                     SDLoc dl, SelectionDAG &DAG,
   2518                                     const CCValAssign &VA,
   2519                                     ISD::ArgFlagsTy Flags) const {
   2520   unsigned LocMemOffset = VA.getLocMemOffset();
   2521   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   2522   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
   2523   if (Flags.isByVal())
   2524     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2525 
   2526   return DAG.getStore(Chain, dl, Arg, PtrOff,
   2527                       MachinePointerInfo::getStack(LocMemOffset),
   2528                       false, false, 0);
   2529 }
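        // For example, an i32 argument with LocMemOffset 8 becomes a store to
        // StackPtr + 8, while a byval argument becomes an inline memcpy into that
        // slot instead of a plain store.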
   2530 
   2531 /// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
   2532 /// optimization is performed and it is required.
   2533 SDValue
   2534 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   2535                                            SDValue &OutRetAddr, SDValue Chain,
   2536                                            bool IsTailCall, bool Is64Bit,
   2537                                            int FPDiff, SDLoc dl) const {
   2538   // Adjust the Return address stack slot.
   2539   EVT VT = getPointerTy();
   2540   OutRetAddr = getReturnAddressFrameIndex(DAG);
   2541 
   2542   // Load the "old" Return address.
   2543   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   2544                            false, false, false, 0);
   2545   return SDValue(OutRetAddr.getNode(), 1);
   2546 }
   2547 
   2548 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
   2549 /// optimization is performed and it is required (FPDiff!=0).
   2550 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
   2551                                         SDValue Chain, SDValue RetAddrFrIdx,
   2552                                         EVT PtrVT, unsigned SlotSize,
   2553                                         int FPDiff, SDLoc dl) {
   2554   // Store the return address to the appropriate stack slot.
   2555   if (!FPDiff) return Chain;
   2556   // Calculate the new stack slot for the return address.
   2557   int NewReturnAddrFI =
   2558     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
   2559                                          false);
   2560   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   2561   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   2562                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
   2563                        false, false, 0);
   2564   return Chain;
   2565 }
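        // For example, with SlotSize = 8 and FPDiff = -8, the return address is
        // re-stored into a new fixed slot at offset FPDiff - SlotSize = -16,
        // i.e. moved down to make room for the callee's larger argument area.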
   2566 
   2567 SDValue
   2568 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   2569                              SmallVectorImpl<SDValue> &InVals) const {
   2570   SelectionDAG &DAG                     = CLI.DAG;
   2571   SDLoc &dl                             = CLI.DL;
   2572   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   2573   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
   2574   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   2575   SDValue Chain                         = CLI.Chain;
   2576   SDValue Callee                        = CLI.Callee;
   2577   CallingConv::ID CallConv              = CLI.CallConv;
   2578   bool &isTailCall                      = CLI.IsTailCall;
   2579   bool isVarArg                         = CLI.IsVarArg;
   2580 
   2581   MachineFunction &MF = DAG.getMachineFunction();
   2582   bool Is64Bit        = Subtarget->is64Bit();
   2583   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
   2584   StructReturnType SR = callIsStructReturn(Outs);
   2585   bool IsSibcall      = false;
   2586 
   2587   if (MF.getTarget().Options.DisableTailCalls)
   2588     isTailCall = false;
   2589 
   2590   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
   2591   if (IsMustTail) {
   2592     // Force this to be a tail call.  The verifier rules are enough to ensure
   2593     // that we can lower this successfully without moving the return address
   2594     // around.
   2595     isTailCall = true;
   2596   } else if (isTailCall) {
   2597     // Check if it's really possible to do a tail call.
   2598     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   2599                     isVarArg, SR != NotStructReturn,
   2600                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
   2601                     Outs, OutVals, Ins, DAG);
   2602 
   2603     // Sibcalls are automatically detected tailcalls which do not require
   2604     // ABI changes.
   2605     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   2606       IsSibcall = true;
   2607 
   2608     if (isTailCall)
   2609       ++NumTailCalls;
   2610   }
   2611 
   2612   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2613          "Var args not supported with calling convention fastcc, ghc or hipe");
   2614 
   2615   // Analyze operands of the call, assigning locations to each operand.
   2616   SmallVector<CCValAssign, 16> ArgLocs;
   2617   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
   2618                  ArgLocs, *DAG.getContext());
   2619 
   2620   // Allocate shadow area for Win64
   2621   if (IsWin64)
   2622     CCInfo.AllocateStack(32, 8);
   2623 
   2624   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2625 
   2626   // Get a count of how many bytes are to be pushed on the stack.
   2627   unsigned NumBytes = CCInfo.getNextStackOffset();
   2628   if (IsSibcall)
   2629     // This is a sibcall. The memory operands are already available in the
   2630     // caller's own stack.
   2631     NumBytes = 0;
   2632   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
   2633            IsTailCallConvention(CallConv))
   2634     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   2635 
   2636   int FPDiff = 0;
   2637   if (isTailCall && !IsSibcall && !IsMustTail) {
   2638     // Lower arguments at fp - stackoffset + fpdiff.
   2639     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   2640     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
   2641 
   2642     FPDiff = NumBytesCallerPushed - NumBytes;
   2643 
   2644     // Record the delta by which the return-address stack slot moves, but
   2645     // only if this delta moves the slot further than the previous delta.
   2646     if (FPDiff < X86Info->getTCReturnAddrDelta())
   2647       X86Info->setTCReturnAddrDelta(FPDiff);
   2648   }
   2649 
   2650   unsigned NumBytesToPush = NumBytes;
   2651   unsigned NumBytesToPop = NumBytes;
   2652 
   2653   // If we have an inalloca argument, all stack space has already been
   2654   // allocated for us and is right at the top of the stack.  We don't support
   2655   // multiple arguments passed in memory when using inalloca.
   2656   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
   2657     NumBytesToPush = 0;
   2658     assert(ArgLocs.back().getLocMemOffset() == 0 &&
   2659            "an inalloca argument must be the only memory argument");
   2660   }
   2661 
   2662   if (!IsSibcall)
   2663     Chain = DAG.getCALLSEQ_START(
   2664         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
   2665 
   2666   SDValue RetAddrFrIdx;
   2667   // Load return address for tail calls.
   2668   if (isTailCall && FPDiff)
   2669     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   2670                                     Is64Bit, FPDiff, dl);
   2671 
   2672   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2673   SmallVector<SDValue, 8> MemOpChains;
   2674   SDValue StackPtr;
   2675 
   2676   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   2677   // of tail call optimization, arguments are handled later.
   2678   const X86RegisterInfo *RegInfo =
   2679     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   2680   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2681     // Skip inalloca arguments; they have already been written.
   2682     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2683     if (Flags.isInAlloca())
   2684       continue;
   2685 
   2686     CCValAssign &VA = ArgLocs[i];
   2687     EVT RegVT = VA.getLocVT();
   2688     SDValue Arg = OutVals[i];
   2689     bool isByVal = Flags.isByVal();
   2690 
   2691     // Promote the value if needed.
   2692     switch (VA.getLocInfo()) {
   2693     default: llvm_unreachable("Unknown loc info!");
   2694     case CCValAssign::Full: break;
   2695     case CCValAssign::SExt:
   2696       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   2697       break;
   2698     case CCValAssign::ZExt:
   2699       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   2700       break;
   2701     case CCValAssign::AExt:
   2702       if (RegVT.is128BitVector()) {
   2703         // Special case: passing MMX values in XMM registers.
   2704         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
   2705         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   2706         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   2707       } else
   2708         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   2709       break;
   2710     case CCValAssign::BCvt:
   2711       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
   2712       break;
   2713     case CCValAssign::Indirect: {
   2714       // Store the argument.
   2715       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   2716       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   2717       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
   2718                            MachinePointerInfo::getFixedStack(FI),
   2719                            false, false, 0);
   2720       Arg = SpillSlot;
   2721       break;
   2722     }
   2723     }
   2724 
   2725     if (VA.isRegLoc()) {
   2726       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   2727       if (isVarArg && IsWin64) {
   2728         // The Win64 ABI requires an argument passed in an XMM register to be
   2729         // copied to the corresponding shadow GPR if the callee is varargs.
   2730         unsigned ShadowReg = 0;
   2731         switch (VA.getLocReg()) {
   2732         case X86::XMM0: ShadowReg = X86::RCX; break;
   2733         case X86::XMM1: ShadowReg = X86::RDX; break;
   2734         case X86::XMM2: ShadowReg = X86::R8; break;
   2735         case X86::XMM3: ShadowReg = X86::R9; break;
   2736         }
   2737         if (ShadowReg)
   2738           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   2739       }
   2740     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   2741       assert(VA.isMemLoc());
   2742       if (!StackPtr.getNode())
   2743         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   2744                                       getPointerTy());
   2745       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   2746                                              dl, DAG, VA, Flags));
   2747     }
   2748   }
   2749 
   2750   if (!MemOpChains.empty())
   2751     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
   2752 
   2753   if (Subtarget->isPICStyleGOT()) {
   2754     // ELF / PIC requires the GOT pointer to be in the EBX register before
   2755     // function calls made via the PLT.
   2756     if (!isTailCall) {
   2757       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
   2758                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
   2759     } else {
   2760       // If we are tail calling and generating PIC/GOT style code, load the
   2761       // address of the callee into ECX. The value in ecx is used as target of
   2762       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   2763       // for tail calls on PIC/GOT architectures. Normally we would just put the
   2764       // address of GOT into ebx and then call target@PLT. But for tail calls
   2765       // ebx would be restored (since ebx is callee saved) before jumping to the
   2766       // target@PLT.
   2767 
   2768       // Note: The actual moving to ECX is done further down.
   2769       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   2770       if (G && !G->getGlobal()->hasHiddenVisibility() &&
   2771           !G->getGlobal()->hasProtectedVisibility())
   2772         Callee = LowerGlobalAddress(Callee, DAG);
   2773       else if (isa<ExternalSymbolSDNode>(Callee))
   2774         Callee = LowerExternalSymbol(Callee, DAG);
   2775     }
   2776   }
   2777 
   2778   if (Is64Bit && isVarArg && !IsWin64) {
   2779     // From AMD64 ABI document:
   2780     // For calls that may call functions that use varargs or stdargs
   2781     // (prototype-less calls or calls to functions containing ellipsis (...)
   2782     // in the declaration) %al is used as a hidden argument to specify the
   2783     // number of SSE registers used. The contents of %al do not need to match
   2784     // exactly the number of registers, but must be an upper bound on the
   2785     // number of SSE registers used and be in the range 0 - 8 inclusive.
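            //
            // For example, a prototype-less call that passes a single double in
            // XMM0 could set %al to 1; below we simply pass the number of XMM
            // argument registers allocated for this call.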
   2786 
   2787     // Count the number of XMM registers allocated.
   2788     static const MCPhysReg XMMArgRegs[] = {
   2789       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2790       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2791     };
   2792     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
   2793     assert((Subtarget->hasSSE1() || !NumXMMRegs)
   2794            && "SSE registers cannot be used when SSE is disabled");
   2795 
   2796     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
   2797                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   2798   }
   2799 
   2800   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   2801   // don't need this because the eligibility check rejects calls that require
   2802   // shuffling arguments passed in memory.
   2803   if (!IsSibcall && isTailCall) {
   2804     // Force all the incoming stack arguments to be loaded from the stack
   2805     // before any new outgoing arguments are stored to the stack, because the
   2806     // outgoing stack slots may alias the incoming argument stack slots, and
   2807     // the alias isn't otherwise explicit. This is slightly more conservative
   2808     // than necessary, because it means that each store effectively depends
   2809     // on every argument instead of just those arguments it would clobber.
   2810     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   2811 
   2812     SmallVector<SDValue, 8> MemOpChains2;
   2813     SDValue FIN;
   2814     int FI = 0;
   2815     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2816       CCValAssign &VA = ArgLocs[i];
   2817       if (VA.isRegLoc())
   2818         continue;
   2819       assert(VA.isMemLoc());
   2820       SDValue Arg = OutVals[i];
   2821       ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2822       // Skip inalloca arguments.  They don't require any work.
   2823       if (Flags.isInAlloca())
   2824         continue;
   2825       // Create frame index.
   2826       int32_t Offset = VA.getLocMemOffset()+FPDiff;
   2827       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   2828       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   2829       FIN = DAG.getFrameIndex(FI, getPointerTy());
   2830 
   2831       if (Flags.isByVal()) {
   2832         // Copy relative to framepointer.
   2833         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
   2834         if (!StackPtr.getNode())
   2835           StackPtr = DAG.getCopyFromReg(Chain, dl,
   2836                                         RegInfo->getStackRegister(),
   2837                                         getPointerTy());
   2838         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
   2839 
   2840         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   2841                                                          ArgChain,
   2842                                                          Flags, DAG, dl));
   2843       } else {
   2844         // Store relative to framepointer.
   2845         MemOpChains2.push_back(
   2846           DAG.getStore(ArgChain, dl, Arg, FIN,
   2847                        MachinePointerInfo::getFixedStack(FI),
   2848                        false, false, 0));
   2849       }
   2850     }
   2851 
   2852     if (!MemOpChains2.empty())
   2853       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
   2854 
   2855     // Store the return address to the appropriate stack slot.
   2856     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
   2857                                      getPointerTy(), RegInfo->getSlotSize(),
   2858                                      FPDiff, dl);
   2859   }
   2860 
   2861   // Build a sequence of copy-to-reg nodes chained together with token chain
   2862   // and flag operands which copy the outgoing args into registers.
   2863   SDValue InFlag;
   2864   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2865     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2866                              RegsToPass[i].second, InFlag);
   2867     InFlag = Chain.getValue(1);
   2868   }
   2869 
   2870   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
   2871     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   2872     // In the 64-bit large code model, we have to make all calls
   2873     // through a register, since the call instruction's 32-bit
   2874     // pc-relative offset may not be large enough to hold the whole
   2875     // address.
   2876   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   2877     // If the callee is a GlobalAddress node (quite common, every direct call
   2878     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   2879     // it.
   2880 
   2881     // We should use an extra load for direct calls to dllimported functions
   2882     // in non-JIT mode.
   2883     const GlobalValue *GV = G->getGlobal();
   2884     if (!GV->hasDLLImportStorageClass()) {
   2885       unsigned char OpFlags = 0;
   2886       bool ExtraLoad = false;
   2887       unsigned WrapperKind = ISD::DELETED_NODE;
   2888 
   2889       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
   2890       // external symbols must go through the PLT in PIC mode.  If the symbol
   2891       // has hidden or protected visibility, or if it is static or local, then
   2892       // we don't need to use the PLT - we can directly call it.
   2893       if (Subtarget->isTargetELF() &&
   2894           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
   2895           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
   2896         OpFlags = X86II::MO_PLT;
   2897       } else if (Subtarget->isPICStyleStubAny() &&
   2898                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
   2899                  (!Subtarget->getTargetTriple().isMacOSX() ||
   2900                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2901         // PC-relative references to external symbols should go through $stub,
   2902         // unless we're building with the leopard linker or later, which
   2903         // automatically synthesizes these stubs.
   2904         OpFlags = X86II::MO_DARWIN_STUB;
   2905       } else if (Subtarget->isPICStyleRIPRel() &&
   2906                  isa<Function>(GV) &&
   2907                  cast<Function>(GV)->getAttributes().
   2908                    hasAttribute(AttributeSet::FunctionIndex,
   2909                                 Attribute::NonLazyBind)) {
   2910         // If the function is marked as non-lazy, generate an indirect call
   2911         // which loads from the GOT directly. This avoids runtime overhead
   2912         // at the cost of eager binding (and one extra byte of encoding).
   2913         OpFlags = X86II::MO_GOTPCREL;
   2914         WrapperKind = X86ISD::WrapperRIP;
   2915         ExtraLoad = true;
   2916       }
   2917 
   2918       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
   2919                                           G->getOffset(), OpFlags);
   2920 
   2921       // Add a wrapper if needed.
   2922       if (WrapperKind != ISD::DELETED_NODE)
   2923         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
   2924       // Add extra indirection if needed.
   2925       if (ExtraLoad)
   2926         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
   2927                              MachinePointerInfo::getGOT(),
   2928                              false, false, false, 0);
   2929     }
   2930   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   2931     unsigned char OpFlags = 0;
   2932 
   2933     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
   2934     // external symbols should go through the PLT.
   2935     if (Subtarget->isTargetELF() &&
   2936         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
   2937       OpFlags = X86II::MO_PLT;
   2938     } else if (Subtarget->isPICStyleStubAny() &&
   2939                (!Subtarget->getTargetTriple().isMacOSX() ||
   2940                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2941       // PC-relative references to external symbols should go through $stub,
   2942       // unless we're building with the leopard linker or later, which
   2943       // automatically synthesizes these stubs.
   2944       OpFlags = X86II::MO_DARWIN_STUB;
   2945     }
   2946 
   2947     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
   2948                                          OpFlags);
   2949   }
   2950 
   2951   // Returns a chain & a flag for retval copy to use.
   2952   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   2953   SmallVector<SDValue, 8> Ops;
   2954 
   2955   if (!IsSibcall && isTailCall) {
   2956     Chain = DAG.getCALLSEQ_END(Chain,
   2957                                DAG.getIntPtrConstant(NumBytesToPop, true),
   2958                                DAG.getIntPtrConstant(0, true), InFlag, dl);
   2959     InFlag = Chain.getValue(1);
   2960   }
   2961 
   2962   Ops.push_back(Chain);
   2963   Ops.push_back(Callee);
   2964 
   2965   if (isTailCall)
   2966     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
   2967 
   2968   // Add argument registers to the end of the list so that they are known live
   2969   // into the call.
   2970   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   2971     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   2972                                   RegsToPass[i].second.getValueType()));
   2973 
   2974   // Add a register mask operand representing the call-preserved registers.
   2975   const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
   2976   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   2977   assert(Mask && "Missing call preserved mask for calling convention");
   2978   Ops.push_back(DAG.getRegisterMask(Mask));
   2979 
   2980   if (InFlag.getNode())
   2981     Ops.push_back(InFlag);
   2982 
   2983   if (isTailCall) {
   2984     // We used to do:
   2985     //// If this is the first return lowered for this function, add the regs
   2986     //// to the liveout set for the function.
   2987     // This isn't right, although it's probably harmless on x86; liveouts
   2988     // should be computed from returns not tail calls.  Consider a void
   2989     // function making a tail call to a function returning int.
   2990     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   2991   }
   2992 
   2993   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   2994   InFlag = Chain.getValue(1);
   2995 
   2996   // Create the CALLSEQ_END node.
   2997   unsigned NumBytesForCalleeToPop;
   2998   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2999                        DAG.getTarget().Options.GuaranteedTailCallOpt))
   3000     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
   3001   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
   3002            !Subtarget->getTargetTriple().isOSMSVCRT() &&
   3003            SR == StackStructReturn)
   3004     // If this is a call to a struct-return function, the callee
   3005     // pops the hidden struct pointer, so we have to push it back.
   3006     // This is common for Darwin/X86, Linux & Mingw32 targets.
   3007     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   3008     NumBytesForCalleeToPop = 4;
   3009   else
   3010     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
   3011 
   3012   // Returns a flag for retval copy to use.
   3013   if (!IsSibcall) {
   3014     Chain = DAG.getCALLSEQ_END(Chain,
   3015                                DAG.getIntPtrConstant(NumBytesToPop, true),
   3016                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
   3017                                                      true),
   3018                                InFlag, dl);
   3019     InFlag = Chain.getValue(1);
   3020   }
   3021 
   3022   // Handle result values, copying them out of physregs into vregs that we
   3023   // return.
   3024   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   3025                          Ins, dl, DAG, InVals);
   3026 }
   3027 
   3028 //===----------------------------------------------------------------------===//
   3029 //                Fast Calling Convention (tail call) implementation
   3030 //===----------------------------------------------------------------------===//
   3031 
   3032 //  Like stdcall, the callee cleans up the arguments, except that ECX is
   3033 //  reserved for storing the address of the tail-called function. Only 2
   3034 //  registers are free for argument passing (inreg). Tail call optimization
   3035 //  is performed provided:
   3036 //                * tailcallopt is enabled
   3037 //                * caller/callee are fastcc
   3038 //  On the X86_64 architecture with GOT-style position-independent code, only
   3039 //  local (within-module) calls are supported at the moment.
   3040 //  To keep the stack aligned per the platform ABI, GetAlignedArgumentStackSize
   3041 //  ensures that the argument delta is always a multiple of the stack
   3042 //  alignment. (Dynamic linkers need this - darwin's dyld, for example.)
   3043 //  If the tail-called function (callee) has more arguments than the caller,
   3044 //  the caller must make sure there is room to move the RETADDR to. This is
   3045 //  achieved by reserving an area the size of the argument delta right after
   3046 //  the original RETADDR, but before the saved frame pointer or the spilled
   3047 //  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
   3048 //  stack layout:
   3049 //    arg1
   3050 //    arg2
   3051 //    RETADDR
   3052 //    [ new RETADDR
   3053 //      move area ]
   3054 //    (possible EBP)
   3055 //    ESI
   3056 //    EDI
   3057 //    local1 ..
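        //
        //  Illustrative numbers (assuming 4-byte slots): if the caller pushed 8
        //  bytes of arguments and the callee needs 16, FPDiff = 8 - 16 = -8, so
        //  the RETADDR is moved 8 bytes down into the reserved move area.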
   3058 
   3059 /// GetAlignedArgumentStackSize - Round the stack size up so that it plus the
   3060 /// return-address slot is stack-aligned, e.g. to 16n + 12 for 16-byte alignment.
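        /// Worked example (assuming StackAlignment = 16, SlotSize = 4):
        ///   StackSize = 20: (20 & 15) = 4 <= 12, result = 20 + (12 - 4) = 28;
        ///   StackSize = 30: (30 & 15) = 14 > 12, result = 16 + 16 + 12 = 44.
        /// In both cases result % 16 == 12, so result + SlotSize is 16-byte aligned.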
   3061 unsigned
   3062 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   3063                                                SelectionDAG& DAG) const {
   3064   MachineFunction &MF = DAG.getMachineFunction();
   3065   const TargetMachine &TM = MF.getTarget();
   3066   const X86RegisterInfo *RegInfo =
   3067     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
   3068   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   3069   unsigned StackAlignment = TFI.getStackAlignment();
   3070   uint64_t AlignMask = StackAlignment - 1;
   3071   int64_t Offset = StackSize;
   3072   unsigned SlotSize = RegInfo->getSlotSize();
   3073   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
   3074     // The low bits are at most (StackAlignment - SlotSize), so just add the difference.
   3075     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   3076   } else {
   3077     // Mask out the low bits, then add StackAlignment plus (StackAlignment - SlotSize).
   3078     Offset = ((~AlignMask) & Offset) + StackAlignment +
   3079       (StackAlignment-SlotSize);
   3080   }
   3081   return Offset;
   3082 }
   3083 
   3084 /// MatchingStackOffset - Return true if the given stack call argument is
   3085 /// already available in the same relative position of the caller's
   3086 /// incoming argument stack.
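        /// For example, if the caller received an i32 at incoming stack offset 8
        /// and the tail call passes the same value at outgoing offset 8, the bytes
        /// are already in place and no store is needed.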
   3087 static
   3088 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   3089                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   3090                          const X86InstrInfo *TII) {
   3091   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   3092   int FI = INT_MAX;
   3093   if (Arg.getOpcode() == ISD::CopyFromReg) {
   3094     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   3095     if (!TargetRegisterInfo::isVirtualRegister(VR))
   3096       return false;
   3097     MachineInstr *Def = MRI->getVRegDef(VR);
   3098     if (!Def)
   3099       return false;
   3100     if (!Flags.isByVal()) {
   3101       if (!TII->isLoadFromStackSlot(Def, FI))
   3102         return false;
   3103     } else {
   3104       unsigned Opcode = Def->getOpcode();
   3105       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
   3106           Def->getOperand(1).isFI()) {
   3107         FI = Def->getOperand(1).getIndex();
   3108         Bytes = Flags.getByValSize();
   3109       } else
   3110         return false;
   3111     }
   3112   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   3113     if (Flags.isByVal())
   3114       // ByVal argument is passed in as a pointer but it's now being
   3115       // dereferenced. e.g.
   3116       // define @foo(%struct.X* %A) {
   3117       //   tail call @bar(%struct.X* byval %A)
   3118       // }
   3119       return false;
   3120     SDValue Ptr = Ld->getBasePtr();
   3121     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   3122     if (!FINode)
   3123       return false;
   3124     FI = FINode->getIndex();
   3125   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   3126     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   3127     FI = FINode->getIndex();
   3128     Bytes = Flags.getByValSize();
   3129   } else
   3130     return false;
   3131 
   3132   assert(FI != INT_MAX);
   3133   if (!MFI->isFixedObjectIndex(FI))
   3134     return false;
   3135   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
   3136 }
   3137 
   3138 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
   3139 /// for tail call optimization. Targets which want to do tail call
   3140 /// optimization should implement this function.
   3141 bool
   3142 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   3143                                                      CallingConv::ID CalleeCC,
   3144                                                      bool isVarArg,
   3145                                                      bool isCalleeStructRet,
   3146                                                      bool isCallerStructRet,
   3147                                                      Type *RetTy,
   3148                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
   3149                                     const SmallVectorImpl<SDValue> &OutVals,
   3150                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   3151                                                      SelectionDAG &DAG) const {
   3152   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
   3153     return false;
   3154 
   3155   // If -tailcallopt is specified, make fastcc functions tail-callable.
   3156   const MachineFunction &MF = DAG.getMachineFunction();
   3157   const Function *CallerF = MF.getFunction();
   3158 
   3159   // If the function return type is x86_fp80 and the callee return type is not,
   3160   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   3161   // perform a tailcall optimization here.
   3162   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
   3163     return false;
   3164 
   3165   CallingConv::ID CallerCC = CallerF->getCallingConv();
   3166   bool CCMatch = CallerCC == CalleeCC;
   3167   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
   3168   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
   3169 
   3170   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
   3171     if (IsTailCallConvention(CalleeCC) && CCMatch)
   3172       return true;
   3173     return false;
   3174   }
   3175 
   3176   // Look for obvious safe cases to perform tail call optimization that do not
   3177   // require ABI changes. This is what gcc calls sibcall.
   3178 
   3179   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   3180   // emit a special epilogue.
   3181   const X86RegisterInfo *RegInfo =
   3182     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   3183   if (RegInfo->needsStackRealignment(MF))
   3184     return false;
   3185 
   3186   // Also avoid sibcall optimization if either caller or callee uses struct
   3187   // return semantics.
   3188   if (isCalleeStructRet || isCallerStructRet)
   3189     return false;
   3190 
   3191   // An stdcall/thiscall caller is expected to clean up its arguments; the
   3192   // callee isn't going to do that.
   3193   // FIXME: this is more restrictive than needed. We could produce a tailcall
   3194   // when the stack adjustment matches. For example, with a thiscall that takes
   3195   // only one argument.
   3196   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
   3197                    CallerCC == CallingConv::X86_ThisCall))
   3198     return false;
   3199 
   3200   // Do not sibcall optimize vararg calls unless all arguments are passed via
   3201   // registers.
   3202   if (isVarArg && !Outs.empty()) {
   3203 
   3204     // Optimizing for varargs on Win64 is unlikely to be safe without
   3205     // additional testing.
   3206     if (IsCalleeWin64 || IsCallerWin64)
   3207       return false;
   3208 
   3209     SmallVector<CCValAssign, 16> ArgLocs;
   3210     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   3211                    DAG.getTarget(), ArgLocs, *DAG.getContext());
   3212 
   3213     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3214     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   3215       if (!ArgLocs[i].isRegLoc())
   3216         return false;
   3217   }
   3218 
   3219   // If the call result is in ST0 / ST1, it needs to be popped off the x87
   3220   // stack.  Therefore, if it's not used by the caller, it is not safe to
   3221   // optimize this into a sibcall.
   3222   bool Unused = false;
   3223   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   3224     if (!Ins[i].Used) {
   3225       Unused = true;
   3226       break;
   3227     }
   3228   }
   3229   if (Unused) {
   3230     SmallVector<CCValAssign, 16> RVLocs;
   3231     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
   3232                    DAG.getTarget(), RVLocs, *DAG.getContext());
   3233     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   3234     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   3235       CCValAssign &VA = RVLocs[i];
   3236       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
   3237         return false;
   3238     }
   3239   }
   3240 
   3241   // If the calling conventions do not match, then we'd better make sure the
   3242   // results are returned in the same way as what the caller expects.
   3243   if (!CCMatch) {
   3244     SmallVector<CCValAssign, 16> RVLocs1;
   3245     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
   3246                     DAG.getTarget(), RVLocs1, *DAG.getContext());
   3247     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
   3248 
   3249     SmallVector<CCValAssign, 16> RVLocs2;
   3250     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
   3251                     DAG.getTarget(), RVLocs2, *DAG.getContext());
   3252     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
   3253 
   3254     if (RVLocs1.size() != RVLocs2.size())
   3255       return false;
   3256     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   3257       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   3258         return false;
   3259       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   3260         return false;
   3261       if (RVLocs1[i].isRegLoc()) {
   3262         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   3263           return false;
   3264       } else {
   3265         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   3266           return false;
   3267       }
   3268     }
   3269   }
   3270 
   3271   // If the callee takes no arguments then go on to check the results of the
   3272   // call.
   3273   if (!Outs.empty()) {
   3274     // Check if stack adjustment is needed. For now, do not do this if any
   3275     // argument is passed on the stack.
   3276     SmallVector<CCValAssign, 16> ArgLocs;
   3277     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   3278                    DAG.getTarget(), ArgLocs, *DAG.getContext());
   3279 
   3280     // Allocate shadow area for Win64
   3281     if (IsCalleeWin64)
   3282       CCInfo.AllocateStack(32, 8);
   3283 
   3284     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3285     if (CCInfo.getNextStackOffset()) {
   3286       MachineFunction &MF = DAG.getMachineFunction();
   3287       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
   3288         return false;
   3289 
   3290       // Check if the arguments are already laid out the same way as the
   3291       // caller's fixed stack objects.
   3292       MachineFrameInfo *MFI = MF.getFrameInfo();
   3293       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   3294       const X86InstrInfo *TII =
   3295           static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
   3296       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3297         CCValAssign &VA = ArgLocs[i];
   3298         SDValue Arg = OutVals[i];
   3299         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3300         if (VA.getLocInfo() == CCValAssign::Indirect)
   3301           return false;
   3302         if (!VA.isRegLoc()) {
   3303           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   3304                                    MFI, MRI, TII))
   3305             return false;
   3306         }
   3307       }
   3308     }
   3309 
   3310     // If the tailcall address may be in a register, then make sure it's
   3311     // possible to register allocate for it. In 32-bit, the call address can
   3312     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   3313     // callee-saved registers are restored. These happen to be the same
   3314     // registers used to pass 'inreg' arguments so watch out for those.
   3315     if (!Subtarget->is64Bit() &&
   3316         ((!isa<GlobalAddressSDNode>(Callee) &&
   3317           !isa<ExternalSymbolSDNode>(Callee)) ||
   3318          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
   3319       unsigned NumInRegs = 0;
   3320       // In PIC we need an extra register to formulate the address computation
   3321       // for the callee.
   3322       unsigned MaxInRegs =
    3323         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
   3324 
   3325       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3326         CCValAssign &VA = ArgLocs[i];
   3327         if (!VA.isRegLoc())
   3328           continue;
   3329         unsigned Reg = VA.getLocReg();
   3330         switch (Reg) {
   3331         default: break;
   3332         case X86::EAX: case X86::EDX: case X86::ECX:
   3333           if (++NumInRegs == MaxInRegs)
   3334             return false;
   3335           break;
   3336         }
   3337       }
   3338     }
   3339   }
   3340 
   3341   return true;
   3342 }
   3343 
   3344 FastISel *
   3345 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   3346                                   const TargetLibraryInfo *libInfo) const {
   3347   return X86::createFastISel(funcInfo, libInfo);
   3348 }
   3349 
   3350 //===----------------------------------------------------------------------===//
   3351 //                           Other Lowering Hooks
   3352 //===----------------------------------------------------------------------===//
   3353 
   3354 static bool MayFoldLoad(SDValue Op) {
   3355   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   3356 }
   3357 
   3358 static bool MayFoldIntoStore(SDValue Op) {
   3359   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   3360 }
   3361 
   3362 static bool isTargetShuffle(unsigned Opcode) {
   3363   switch(Opcode) {
   3364   default: return false;
   3365   case X86ISD::PSHUFD:
   3366   case X86ISD::PSHUFHW:
   3367   case X86ISD::PSHUFLW:
   3368   case X86ISD::SHUFP:
   3369   case X86ISD::PALIGNR:
   3370   case X86ISD::MOVLHPS:
   3371   case X86ISD::MOVLHPD:
   3372   case X86ISD::MOVHLPS:
   3373   case X86ISD::MOVLPS:
   3374   case X86ISD::MOVLPD:
   3375   case X86ISD::MOVSHDUP:
   3376   case X86ISD::MOVSLDUP:
   3377   case X86ISD::MOVDDUP:
   3378   case X86ISD::MOVSS:
   3379   case X86ISD::MOVSD:
   3380   case X86ISD::UNPCKL:
   3381   case X86ISD::UNPCKH:
   3382   case X86ISD::VPERMILP:
   3383   case X86ISD::VPERM2X128:
   3384   case X86ISD::VPERMI:
   3385     return true;
   3386   }
   3387 }
   3388 
   3389 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3390                                     SDValue V1, SelectionDAG &DAG) {
   3391   switch(Opc) {
   3392   default: llvm_unreachable("Unknown x86 shuffle node");
   3393   case X86ISD::MOVSHDUP:
   3394   case X86ISD::MOVSLDUP:
   3395   case X86ISD::MOVDDUP:
   3396     return DAG.getNode(Opc, dl, VT, V1);
   3397   }
   3398 }
   3399 
   3400 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3401                                     SDValue V1, unsigned TargetMask,
   3402                                     SelectionDAG &DAG) {
   3403   switch(Opc) {
   3404   default: llvm_unreachable("Unknown x86 shuffle node");
   3405   case X86ISD::PSHUFD:
   3406   case X86ISD::PSHUFHW:
   3407   case X86ISD::PSHUFLW:
   3408   case X86ISD::VPERMILP:
   3409   case X86ISD::VPERMI:
   3410     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   3411   }
   3412 }
   3413 
   3414 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3415                                     SDValue V1, SDValue V2, unsigned TargetMask,
   3416                                     SelectionDAG &DAG) {
   3417   switch(Opc) {
   3418   default: llvm_unreachable("Unknown x86 shuffle node");
   3419   case X86ISD::PALIGNR:
   3420   case X86ISD::SHUFP:
   3421   case X86ISD::VPERM2X128:
   3422     return DAG.getNode(Opc, dl, VT, V1, V2,
   3423                        DAG.getConstant(TargetMask, MVT::i8));
   3424   }
   3425 }
   3426 
   3427 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3428                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   3429   switch(Opc) {
   3430   default: llvm_unreachable("Unknown x86 shuffle node");
   3431   case X86ISD::MOVLHPS:
   3432   case X86ISD::MOVLHPD:
   3433   case X86ISD::MOVHLPS:
   3434   case X86ISD::MOVLPS:
   3435   case X86ISD::MOVLPD:
   3436   case X86ISD::MOVSS:
   3437   case X86ISD::MOVSD:
   3438   case X86ISD::UNPCKL:
   3439   case X86ISD::UNPCKH:
   3440     return DAG.getNode(Opc, dl, VT, V1, V2);
   3441   }
   3442 }
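
// NOTE (annotation, not part of the original source): these overloads simply
// rebuild a target shuffle node with the operand count its opcode expects.
// For instance, getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, V1, 0x1B,
// DAG) produces a PSHUFD of V1 with immediate 0x1B, the full element reversal
// <3, 2, 1, 0>.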
   3443 
   3444 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   3445   MachineFunction &MF = DAG.getMachineFunction();
   3446   const X86RegisterInfo *RegInfo =
   3447     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   3448   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   3449   int ReturnAddrIndex = FuncInfo->getRAIndex();
   3450 
   3451   if (ReturnAddrIndex == 0) {
   3452     // Set up a frame object for the return address.
   3453     unsigned SlotSize = RegInfo->getSlotSize();
   3454     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
   3455                                                            -(int64_t)SlotSize,
   3456                                                            false);
   3457     FuncInfo->setRAIndex(ReturnAddrIndex);
   3458   }
   3459 
   3460   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
   3461 }
   3462 
   3463 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   3464                                        bool hasSymbolicDisplacement) {
    3465   // The offset should fit into a 32-bit immediate field.
   3466   if (!isInt<32>(Offset))
   3467     return false;
   3468 
   3469   // If we don't have a symbolic displacement - we don't have any extra
   3470   // restrictions.
   3471   if (!hasSymbolicDisplacement)
   3472     return true;
   3473 
   3474   // FIXME: Some tweaks might be needed for medium code model.
   3475   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3476     return false;
   3477 
    3478   // For the small code model we assume the last object lies within 16MB of the
    3479   // end of the 31-bit boundary. We may also accept pretty large negative
    3480   // constants, knowing that all objects sit in the positive half of the address space.
   3481   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3482     return true;
   3483 
    3484   // For the kernel code model we know that all objects reside in the negative
    3485   // half of the 32-bit address space. We must not accept negative offsets, since
    3486   // they may fall outside that range, but we may accept pretty large positive ones.
   3487   if (M == CodeModel::Kernel && Offset > 0)
   3488     return true;
   3489 
   3490   return false;
   3491 }
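
// NOTE (annotation, not part of the original source): with a symbolic
// displacement, an offset of -4096 is accepted under the small code model
// (it is below the 16MB bound) but rejected under the kernel code model,
// which only tolerates positive offsets.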
   3492 
   3493 /// isCalleePop - Determines whether the callee is required to pop its
   3494 /// own arguments. Callee pop is necessary to support tail calls.
   3495 bool X86::isCalleePop(CallingConv::ID CallingConv,
   3496                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
   3497   if (IsVarArg)
   3498     return false;
   3499 
   3500   switch (CallingConv) {
   3501   default:
   3502     return false;
   3503   case CallingConv::X86_StdCall:
   3504     return !is64Bit;
   3505   case CallingConv::X86_FastCall:
   3506     return !is64Bit;
   3507   case CallingConv::X86_ThisCall:
   3508     return !is64Bit;
   3509   case CallingConv::Fast:
   3510     return TailCallOpt;
   3511   case CallingConv::GHC:
   3512     return TailCallOpt;
   3513   case CallingConv::HiPE:
   3514     return TailCallOpt;
   3515   }
   3516 }
   3517 
   3518 /// \brief Return true if the condition is an unsigned comparison operation.
   3519 static bool isX86CCUnsigned(unsigned X86CC) {
   3520   switch (X86CC) {
   3521   default: llvm_unreachable("Invalid integer condition!");
   3522   case X86::COND_E:     return true;
   3523   case X86::COND_G:     return false;
   3524   case X86::COND_GE:    return false;
   3525   case X86::COND_L:     return false;
   3526   case X86::COND_LE:    return false;
   3527   case X86::COND_NE:    return true;
   3528   case X86::COND_B:     return true;
   3529   case X86::COND_A:     return true;
   3530   case X86::COND_BE:    return true;
   3531   case X86::COND_AE:    return true;
   3532   }
   3533   llvm_unreachable("covered switch fell through?!");
   3534 }
   3535 
   3536 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
   3537 /// specific condition code, returning the condition code and the LHS/RHS of the
   3538 /// comparison to make.
   3539 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
   3540                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
   3541   if (!isFP) {
   3542     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   3543       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
    3544         // X > -1  -> compare X against 0, jump if not sign.
   3545         RHS = DAG.getConstant(0, RHS.getValueType());
   3546         return X86::COND_NS;
   3547       }
   3548       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
    3549         // X < 0  -> compare X against 0, jump on sign.
   3550         return X86::COND_S;
   3551       }
   3552       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   3553         // X < 1   -> X <= 0
   3554         RHS = DAG.getConstant(0, RHS.getValueType());
   3555         return X86::COND_LE;
   3556       }
   3557     }
   3558 
   3559     switch (SetCCOpcode) {
   3560     default: llvm_unreachable("Invalid integer condition!");
   3561     case ISD::SETEQ:  return X86::COND_E;
   3562     case ISD::SETGT:  return X86::COND_G;
   3563     case ISD::SETGE:  return X86::COND_GE;
   3564     case ISD::SETLT:  return X86::COND_L;
   3565     case ISD::SETLE:  return X86::COND_LE;
   3566     case ISD::SETNE:  return X86::COND_NE;
   3567     case ISD::SETULT: return X86::COND_B;
   3568     case ISD::SETUGT: return X86::COND_A;
   3569     case ISD::SETULE: return X86::COND_BE;
   3570     case ISD::SETUGE: return X86::COND_AE;
   3571     }
   3572   }
   3573 
    3574   // First determine if it is required or profitable to flip the operands.
   3575 
   3576   // If LHS is a foldable load, but RHS is not, flip the condition.
   3577   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   3578       !ISD::isNON_EXTLoad(RHS.getNode())) {
   3579     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   3580     std::swap(LHS, RHS);
   3581   }
   3582 
   3583   switch (SetCCOpcode) {
   3584   default: break;
   3585   case ISD::SETOLT:
   3586   case ISD::SETOLE:
   3587   case ISD::SETUGT:
   3588   case ISD::SETUGE:
   3589     std::swap(LHS, RHS);
   3590     break;
   3591   }
   3592 
   3593   // On a floating point condition, the flags are set as follows:
   3594   // ZF  PF  CF   op
   3595   //  0 | 0 | 0 | X > Y
   3596   //  0 | 0 | 1 | X < Y
   3597   //  1 | 0 | 0 | X == Y
   3598   //  1 | 1 | 1 | unordered
   3599   switch (SetCCOpcode) {
   3600   default: llvm_unreachable("Condcode should be pre-legalized away");
   3601   case ISD::SETUEQ:
   3602   case ISD::SETEQ:   return X86::COND_E;
   3603   case ISD::SETOLT:              // flipped
   3604   case ISD::SETOGT:
   3605   case ISD::SETGT:   return X86::COND_A;
   3606   case ISD::SETOLE:              // flipped
   3607   case ISD::SETOGE:
   3608   case ISD::SETGE:   return X86::COND_AE;
   3609   case ISD::SETUGT:              // flipped
   3610   case ISD::SETULT:
   3611   case ISD::SETLT:   return X86::COND_B;
   3612   case ISD::SETUGE:              // flipped
   3613   case ISD::SETULE:
   3614   case ISD::SETLE:   return X86::COND_BE;
   3615   case ISD::SETONE:
   3616   case ISD::SETNE:   return X86::COND_NE;
   3617   case ISD::SETUO:   return X86::COND_P;
   3618   case ISD::SETO:    return X86::COND_NP;
   3619   case ISD::SETOEQ:
   3620   case ISD::SETUNE:  return X86::COND_INVALID;
   3621   }
   3622 }
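
// NOTE (annotation, not part of the original source): as a worked example,
// the integer comparison (X < 1) is rewritten above to RHS = 0 with
// X86::COND_LE, so it is emitted as "X <= 0" and can reuse the flags of a
// plain compare against zero.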
   3623 
   3624 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
   3625 /// code. Current x86 isa includes the following FP cmov instructions:
    3626 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   3627 static bool hasFPCMov(unsigned X86CC) {
   3628   switch (X86CC) {
   3629   default:
   3630     return false;
   3631   case X86::COND_B:
   3632   case X86::COND_BE:
   3633   case X86::COND_E:
   3634   case X86::COND_P:
   3635   case X86::COND_A:
   3636   case X86::COND_AE:
   3637   case X86::COND_NE:
   3638   case X86::COND_NP:
   3639     return true;
   3640   }
   3641 }
   3642 
   3643 /// isFPImmLegal - Returns true if the target can instruction select the
   3644 /// specified FP immediate natively. If false, the legalizer will
   3645 /// materialize the FP immediate as a load from a constant pool.
   3646 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   3647   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   3648     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   3649       return true;
   3650   }
   3651   return false;
   3652 }
   3653 
   3654 /// \brief Returns true if it is beneficial to convert a load of a constant
   3655 /// to just the constant itself.
   3656 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   3657                                                           Type *Ty) const {
   3658   assert(Ty->isIntegerTy());
   3659 
   3660   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   3661   if (BitSize == 0 || BitSize > 64)
   3662     return false;
   3663   return true;
   3664 }
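
// NOTE (annotation, not part of the original source): e.g. a load of an i32
// constant is profitably folded into an immediate (BitSize == 32), while an
// i128 constant (BitSize == 128) keeps its constant-pool load.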
   3665 
   3666 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
    3667 /// the specified half-open range [Low, Hi).
   3668 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   3669   return (Val < 0) || (Val >= Low && Val < Hi);
   3670 }
   3671 
   3672 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
   3673 /// specified value.
   3674 static bool isUndefOrEqual(int Val, int CmpVal) {
   3675   return (Val < 0 || Val == CmpVal);
   3676 }
   3677 
   3678 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
   3679 /// from position Pos and ending in Pos+Size, falls within the specified
    3680 /// sequential range [Low, Low+Size), or is undef.
   3681 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   3682                                        unsigned Pos, unsigned Size, int Low) {
   3683   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   3684     if (!isUndefOrEqual(Mask[i], Low))
   3685       return false;
   3686   return true;
   3687 }
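
// NOTE (annotation, not part of the original source): for example, with
// Mask = <4, -1, 6, 7>, Pos = 0, Size = 4 and Low = 4 this returns true:
// the defined elements match the sequence 4, 5, 6, 7 and the undef entry
// stands in for the expected value 5.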
   3688 
   3689 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
   3690 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
   3691 /// the second operand.
   3692 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
    3693   if (VT == MVT::v4f32 || VT == MVT::v4i32)
   3694     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   3695   if (VT == MVT::v2f64 || VT == MVT::v2i64)
   3696     return (Mask[0] < 2 && Mask[1] < 2);
   3697   return false;
   3698 }
   3699 
   3700 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
   3701 /// is suitable for input to PSHUFHW.
   3702 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   3703   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
   3704     return false;
   3705 
   3706   // Lower quadword copied in order or undef.
   3707   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
   3708     return false;
   3709 
   3710   // Upper quadword shuffled.
   3711   for (unsigned i = 4; i != 8; ++i)
   3712     if (!isUndefOrInRange(Mask[i], 4, 8))
   3713       return false;
   3714 
   3715   if (VT == MVT::v16i16) {
   3716     // Lower quadword copied in order or undef.
   3717     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
   3718       return false;
   3719 
   3720     // Upper quadword shuffled.
   3721     for (unsigned i = 12; i != 16; ++i)
   3722       if (!isUndefOrInRange(Mask[i], 12, 16))
   3723         return false;
   3724   }
   3725 
   3726   return true;
   3727 }
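
// NOTE (annotation, not part of the original source): for v8i16,
// Mask = <0, 1, 2, 3, 7, 7, 4, 4> is accepted: the low quadword is copied in
// order and every high-quadword index stays within [4, 8), exactly the shape
// PSHUFHW can encode.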
   3728 
   3729 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
   3730 /// is suitable for input to PSHUFLW.
   3731 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   3732   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
   3733     return false;
   3734 
   3735   // Upper quadword copied in order.
   3736   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
   3737     return false;
   3738 
   3739   // Lower quadword shuffled.
   3740   for (unsigned i = 0; i != 4; ++i)
   3741     if (!isUndefOrInRange(Mask[i], 0, 4))
   3742       return false;
   3743 
   3744   if (VT == MVT::v16i16) {
   3745     // Upper quadword copied in order.
   3746     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
   3747       return false;
   3748 
   3749     // Lower quadword shuffled.
   3750     for (unsigned i = 8; i != 12; ++i)
   3751       if (!isUndefOrInRange(Mask[i], 8, 12))
   3752         return false;
   3753   }
   3754 
   3755   return true;
   3756 }
   3757 
   3758 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
   3759 /// is suitable for input to PALIGNR.
   3760 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
   3761                           const X86Subtarget *Subtarget) {
   3762   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
   3763       (VT.is256BitVector() && !Subtarget->hasInt256()))
   3764     return false;
   3765 
   3766   unsigned NumElts = VT.getVectorNumElements();
    3767   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   3768   unsigned NumLaneElts = NumElts/NumLanes;
   3769 
   3770   // Do not handle 64-bit element shuffles with palignr.
   3771   if (NumLaneElts == 2)
   3772     return false;
   3773 
   3774   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
   3775     unsigned i;
   3776     for (i = 0; i != NumLaneElts; ++i) {
   3777       if (Mask[i+l] >= 0)
   3778         break;
   3779     }
   3780 
   3781     // Lane is all undef, go to next lane
   3782     if (i == NumLaneElts)
   3783       continue;
   3784 
   3785     int Start = Mask[i+l];
   3786 
    3787     // Make sure it's in this lane in one of the sources
   3788     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
   3789         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
   3790       return false;
   3791 
   3792     // If not lane 0, then we must match lane 0
   3793     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
   3794       return false;
   3795 
   3796     // Correct second source to be contiguous with first source
   3797     if (Start >= (int)NumElts)
   3798       Start -= NumElts - NumLaneElts;
   3799 
   3800     // Make sure we're shifting in the right direction.
   3801     if (Start <= (int)(i+l))
   3802       return false;
   3803 
   3804     Start -= i;
   3805 
   3806     // Check the rest of the elements to see if they are consecutive.
   3807     for (++i; i != NumLaneElts; ++i) {
   3808       int Idx = Mask[i+l];
   3809 
    3810       // Make sure it's in this lane
   3811       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
   3812           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
   3813         return false;
   3814 
   3815       // If not lane 0, then we must match lane 0
   3816       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
   3817         return false;
   3818 
   3819       if (Idx >= (int)NumElts)
   3820         Idx -= NumElts - NumLaneElts;
   3821 
   3822       if (!isUndefOrEqual(Idx, Start+i))
   3823         return false;
   3824 
   3825     }
   3826   }
   3827 
   3828   return true;
   3829 }
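
// NOTE (annotation, not part of the original source): for v8i16 (with SSSE3),
// Mask = <1, 2, 3, 4, 5, 6, 7, 8> is a valid PALIGNR pattern: a window that
// starts one element into V1 and pulls in the first element of V2, i.e. a
// byte shift across the concatenated sources.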
   3830 
   3831 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
   3832 /// the two vector operands have swapped position.
   3833 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
   3834                                      unsigned NumElems) {
   3835   for (unsigned i = 0; i != NumElems; ++i) {
   3836     int idx = Mask[i];
   3837     if (idx < 0)
   3838       continue;
   3839     else if (idx < (int)NumElems)
   3840       Mask[i] = idx + NumElems;
   3841     else
   3842       Mask[i] = idx - NumElems;
   3843   }
   3844 }
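
// NOTE (annotation, not part of the original source): with NumElems == 4 the
// mask <0, 5, -1, 2> becomes <4, 1, -1, 6>: references to the first source
// now name the second and vice versa, while undef entries stay undef.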
   3845 
   3846 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
   3847 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
   3848 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
   3849 /// reverse of what x86 shuffles want.
   3850 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
   3851 
   3852   unsigned NumElems = VT.getVectorNumElements();
   3853   unsigned NumLanes = VT.getSizeInBits()/128;
   3854   unsigned NumLaneElems = NumElems/NumLanes;
   3855 
   3856   if (NumLaneElems != 2 && NumLaneElems != 4)
   3857     return false;
   3858 
   3859   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
    3860   bool symmetricMaskRequired =
   3861     (VT.getSizeInBits() >= 256) && (EltSize == 32);
   3862 
   3863   // VSHUFPSY divides the resulting vector into 4 chunks.
    3864   // The sources are also split into 4 chunks, and each destination
   3865   // chunk must come from a different source chunk.
   3866   //
   3867   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
    3868   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
   3869   //
   3870   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
   3871   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
   3872   //
   3873   // VSHUFPDY divides the resulting vector into 4 chunks.
    3874   // The sources are also split into 4 chunks, and each destination
   3875   // chunk must come from a different source chunk.
   3876   //
   3877   //  SRC1 =>      X3       X2       X1       X0
   3878   //  SRC2 =>      Y3       Y2       Y1       Y0
   3879   //
   3880   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   3881   //
   3882   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
   3883   unsigned HalfLaneElems = NumLaneElems/2;
   3884   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
   3885     for (unsigned i = 0; i != NumLaneElems; ++i) {
   3886       int Idx = Mask[i+l];
   3887       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
   3888       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
   3889         return false;
   3890       // For VSHUFPSY, the mask of the second half must be the same as the
   3891       // first but with the appropriate offsets. This works in the same way as
   3892       // VPERMILPS works with masks.
    3893       if (!symmetricMaskRequired || Idx < 0)
   3894         continue;
   3895       if (MaskVal[i] < 0) {
   3896         MaskVal[i] = Idx - l;
   3897         continue;
   3898       }
   3899       if ((signed)(Idx - l) != MaskVal[i])
   3900         return false;
   3901     }
   3902   }
   3903 
   3904   return true;
   3905 }
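
// NOTE (annotation, not part of the original source): for v4f32 with
// Commuted == false, Mask = <1, 3, 5, 7> is accepted: the first two result
// elements come from V1 and the last two from V2, matching SHUFPS's
// two-from-each-source layout.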
   3906 
   3907 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3908 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
   3909 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
   3910   if (!VT.is128BitVector())
   3911     return false;
   3912 
   3913   unsigned NumElems = VT.getVectorNumElements();
   3914 
   3915   if (NumElems != 4)
   3916     return false;
   3917 
    3918   // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3
   3919   return isUndefOrEqual(Mask[0], 6) &&
   3920          isUndefOrEqual(Mask[1], 7) &&
   3921          isUndefOrEqual(Mask[2], 2) &&
   3922          isUndefOrEqual(Mask[3], 3);
   3923 }
   3924 
   3925 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
   3926 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
   3927 /// <2, 3, 2, 3>
   3928 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
   3929   if (!VT.is128BitVector())
   3930     return false;
   3931 
   3932   unsigned NumElems = VT.getVectorNumElements();
   3933 
   3934   if (NumElems != 4)
   3935     return false;
   3936 
   3937   return isUndefOrEqual(Mask[0], 2) &&
   3938          isUndefOrEqual(Mask[1], 3) &&
   3939          isUndefOrEqual(Mask[2], 2) &&
   3940          isUndefOrEqual(Mask[3], 3);
   3941 }
   3942 
   3943 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
   3944 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
   3945 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
   3946   if (!VT.is128BitVector())
   3947     return false;
   3948 
   3949   unsigned NumElems = VT.getVectorNumElements();
   3950 
   3951   if (NumElems != 2 && NumElems != 4)
   3952     return false;
   3953 
   3954   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3955     if (!isUndefOrEqual(Mask[i], i + NumElems))
   3956       return false;
   3957 
   3958   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
   3959     if (!isUndefOrEqual(Mask[i], i))
   3960       return false;
   3961 
   3962   return true;
   3963 }
   3964 
   3965 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3966 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
   3967 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
   3968   if (!VT.is128BitVector())
   3969     return false;
   3970 
   3971   unsigned NumElems = VT.getVectorNumElements();
   3972 
   3973   if (NumElems != 2 && NumElems != 4)
   3974     return false;
   3975 
   3976   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3977     if (!isUndefOrEqual(Mask[i], i))
   3978       return false;
   3979 
   3980   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3981     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
   3982       return false;
   3983 
   3984   return true;
   3985 }
   3986 
   3987 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3988 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
    3989 /// i.e. all but one element come from the same vector.
   3990 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
   3991   // TODO: Deal with AVX's VINSERTPS
   3992   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
   3993     return false;
   3994 
   3995   unsigned CorrectPosV1 = 0;
   3996   unsigned CorrectPosV2 = 0;
   3997   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
   3998     if (Mask[i] == -1) {
   3999       ++CorrectPosV1;
   4000       ++CorrectPosV2;
   4001       continue;
   4002     }
   4003 
   4004     if (Mask[i] == i)
   4005       ++CorrectPosV1;
   4006     else if (Mask[i] == i + 4)
   4007       ++CorrectPosV2;
   4008   }
   4009 
   4010   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
   4011     // We have 3 elements (undefs count as elements from any vector) from one
   4012     // vector, and one from another.
   4013     return true;
   4014 
   4015   return false;
   4016 }
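
// NOTE (annotation, not part of the original source): for v4f32,
// Mask = <0, 1, 6, 3> qualifies: elements 0, 1 and 3 sit in their original V1
// positions (CorrectPosV1 == 3) and the one remaining element is taken from
// V2, which is what INSERTPS does.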
   4017 
   4018 //
   4019 // Some special combinations that can be optimized.
   4020 //
   4021 static
   4022 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
   4023                                SelectionDAG &DAG) {
   4024   MVT VT = SVOp->getSimpleValueType(0);
   4025   SDLoc dl(SVOp);
   4026 
   4027   if (VT != MVT::v8i32 && VT != MVT::v8f32)
   4028     return SDValue();
   4029 
   4030   ArrayRef<int> Mask = SVOp->getMask();
   4031 
   4032   // These are the special masks that may be optimized.
   4033   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
   4034   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
   4035   bool MatchEvenMask = true;
   4036   bool MatchOddMask  = true;
    4037   for (int i = 0; i < 8; ++i) {
   4038     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
   4039       MatchEvenMask = false;
   4040     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
   4041       MatchOddMask = false;
   4042   }
   4043 
   4044   if (!MatchEvenMask && !MatchOddMask)
   4045     return SDValue();
   4046 
   4047   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
   4048 
   4049   SDValue Op0 = SVOp->getOperand(0);
   4050   SDValue Op1 = SVOp->getOperand(1);
   4051 
   4052   if (MatchEvenMask) {
    4053     // Shift the second operand right by 32 bits.
   4054     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
   4055     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
   4056   } else {
    4057     // Shift the first operand left by 32 bits.
   4058     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
   4059     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
   4060   }
   4061   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
   4062   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
   4063 }
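
// NOTE (annotation, not part of the original source): for the even mask
// <0, 8, 2, 10, 4, 12, 6, 14>, the code above first rearranges Op1 with
// <-1, 0, -1, 2, -1, 4, -1, 6> and then blends with <0, 9, 2, 11, 4, 13, 6, 15>,
// replacing one cross-input interleave by two cheaper shuffles.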
   4064 
   4065 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
   4066 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
   4067 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
   4068                          bool HasInt256, bool V2IsSplat = false) {
   4069 
   4070   assert(VT.getSizeInBits() >= 128 &&
   4071          "Unsupported vector type for unpckl");
   4072 
   4073   // AVX defines UNPCK* to operate independently on 128-bit lanes.
   4074   unsigned NumLanes;
   4075   unsigned NumOf256BitLanes;
   4076   unsigned NumElts = VT.getVectorNumElements();
   4077   if (VT.is256BitVector()) {
   4078     if (NumElts != 4 && NumElts != 8 &&
   4079         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
    4080       return false;
   4081     NumLanes = 2;
   4082     NumOf256BitLanes = 1;
   4083   } else if (VT.is512BitVector()) {
   4084     assert(VT.getScalarType().getSizeInBits() >= 32 &&
    4085            "Unsupported vector type for unpckl");
   4086     NumLanes = 2;
   4087     NumOf256BitLanes = 2;
   4088   } else {
   4089     NumLanes = 1;
   4090     NumOf256BitLanes = 1;
   4091   }
   4092 
   4093   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
   4094   unsigned NumLaneElts = NumEltsInStride/NumLanes;
   4095 
   4096   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
   4097     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
   4098       for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
   4099         int BitI  = Mask[l256*NumEltsInStride+l+i];
   4100         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
   4101         if (!isUndefOrEqual(BitI, j+l256*NumElts))
   4102           return false;
   4103         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
   4104           return false;
   4105         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
   4106           return false;
   4107       }
   4108     }
   4109   }
   4110   return true;
   4111 }
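
// NOTE (annotation, not part of the original source): for v4i32,
// Mask = <0, 4, 1, 5> is the canonical UNPCKL pattern: the two low elements of
// V1 are interleaved with the two low elements of V2, exactly what PUNPCKLDQ
// produces.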
   4112 
   4113 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
   4114 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
   4115 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
   4116                          bool HasInt256, bool V2IsSplat = false) {
   4117   assert(VT.getSizeInBits() >= 128 &&
   4118          "Unsupported vector type for unpckh");
   4119 
   4120   // AVX defines UNPCK* to operate independently on 128-bit lanes.
   4121   unsigned NumLanes;
   4122   unsigned NumOf256BitLanes;
   4123   unsigned NumElts = VT.getVectorNumElements();
   4124   if (VT.is256BitVector()) {
   4125     if (NumElts != 4 && NumElts != 8 &&
   4126         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
    4127       return false;
   4128     NumLanes = 2;
   4129     NumOf256BitLanes = 1;
   4130   } else if (VT.is512BitVector()) {
   4131     assert(VT.getScalarType().getSizeInBits() >= 32 &&
   4132            "Unsupported vector type for unpckh");
   4133     NumLanes = 2;
   4134     NumOf256BitLanes = 2;
   4135   } else {
   4136     NumLanes = 1;
   4137     NumOf256BitLanes = 1;
   4138   }
   4139 
   4140   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
   4141   unsigned NumLaneElts = NumEltsInStride/NumLanes;
   4142 
   4143   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
   4144     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
   4145       for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
   4146         int BitI  = Mask[l256*NumEltsInStride+l+i];
   4147         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
   4148         if (!isUndefOrEqual(BitI, j+l256*NumElts))
   4149           return false;
   4150         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
   4151           return false;
   4152         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
   4153           return false;
   4154       }
   4155     }
   4156   }
   4157   return true;
   4158 }
   4159 
   4160 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
   4161 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
   4162 /// <0, 0, 1, 1>
   4163 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   4164   unsigned NumElts = VT.getVectorNumElements();
   4165   bool Is256BitVec = VT.is256BitVector();
   4166 
   4167   if (VT.is512BitVector())
   4168     return false;
   4169   assert((VT.is128BitVector() || VT.is256BitVector()) &&
    4170          "Unsupported vector type for unpckl");
   4171 
   4172   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
   4173       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4174     return false;
   4175 
   4176   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
   4177   // FIXME: Need a better way to get rid of this, there's no latency difference
    4178   // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
   4179   // the former later. We should also remove the "_undef" special mask.
   4180   if (NumElts == 4 && Is256BitVec)
   4181     return false;
   4182 
   4183   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   4184   // independently on 128-bit lanes.
   4185   unsigned NumLanes = VT.getSizeInBits()/128;
   4186   unsigned NumLaneElts = NumElts/NumLanes;
   4187 
   4188   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   4189     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
   4190       int BitI  = Mask[l+i];
   4191       int BitI1 = Mask[l+i+1];
   4192 
   4193       if (!isUndefOrEqual(BitI, j))
   4194         return false;
   4195       if (!isUndefOrEqual(BitI1, j))
   4196         return false;
   4197     }
   4198   }
   4199 
   4200   return true;
   4201 }
   4202 
   4203 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
   4204 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
   4205 /// <2, 2, 3, 3>
   4206 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   4207   unsigned NumElts = VT.getVectorNumElements();
   4208 
   4209   if (VT.is512BitVector())
   4210     return false;
   4211 
   4212   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   4213          "Unsupported vector type for unpckh");
   4214 
   4215   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
   4216       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4217     return false;
   4218 
   4219   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   4220   // independently on 128-bit lanes.
   4221   unsigned NumLanes = VT.getSizeInBits()/128;
   4222   unsigned NumLaneElts = NumElts/NumLanes;
   4223 
   4224   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   4225     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
   4226       int BitI  = Mask[l+i];
   4227       int BitI1 = Mask[l+i+1];
   4228       if (!isUndefOrEqual(BitI, j))
   4229         return false;
   4230       if (!isUndefOrEqual(BitI1, j))
   4231         return false;
   4232     }
   4233   }
   4234   return true;
   4235 }
   4236 
    4237 // Match for INSERTI64x4/INSERTF64x4 instructions: (src0[0], src1[0]) or
    4238 // (src1[0], src0[1]), i.e. manipulations of 256-bit sub-vectors.
   4239 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
   4240   if (!VT.is512BitVector())
   4241     return false;
   4242 
   4243   unsigned NumElts = VT.getVectorNumElements();
   4244   unsigned HalfSize = NumElts/2;
   4245   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
   4246     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
   4247       *Imm = 1;
   4248       return true;
   4249     }
   4250   }
   4251   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
   4252     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
   4253       *Imm = 0;
   4254       return true;
   4255     }
   4256   }
   4257   return false;
   4258 }
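
// NOTE (annotation, not part of the original source): for v8i64,
// Mask = <0, 1, 2, 3, 8, 9, 10, 11> sets *Imm to 1 (V2's low 256 bits are
// inserted into the upper half), while <8, 9, 10, 11, 4, 5, 6, 7> sets *Imm
// to 0 (insertion into the lower half).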
   4259 
   4260 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
   4261 /// specifies a shuffle of elements that is suitable for input to MOVSS,
   4262 /// MOVSD, and MOVD, i.e. setting the lowest element.
   4263 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
   4264   if (VT.getVectorElementType().getSizeInBits() < 32)
   4265     return false;
   4266   if (!VT.is128BitVector())
   4267     return false;
   4268 
   4269   unsigned NumElts = VT.getVectorNumElements();
   4270 
   4271   if (!isUndefOrEqual(Mask[0], NumElts))
   4272     return false;
   4273 
   4274   for (unsigned i = 1; i != NumElts; ++i)
   4275     if (!isUndefOrEqual(Mask[i], i))
   4276       return false;
   4277 
   4278   return true;
   4279 }
   4280 
   4281 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
    4282 /// as permutations between 128-bit chunks or halves. As an example, in the
    4283 /// shuffle below:
    4284 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
    4285 /// the first half comes from the second half of V1 and the second half from
    4286 /// the second half of V2.
   4287 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   4288   if (!HasFp256 || !VT.is256BitVector())
   4289     return false;
   4290 
   4291   // The shuffle result is divided into half A and half B. In total the two
   4292   // sources have 4 halves, namely: C, D, E, F. The final values of A and
   4293   // B must come from C, D, E or F.
   4294   unsigned HalfSize = VT.getVectorNumElements()/2;
   4295   bool MatchA = false, MatchB = false;
   4296 
   4297   // Check if A comes from one of C, D, E, F.
   4298   for (unsigned Half = 0; Half != 4; ++Half) {
   4299     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
   4300       MatchA = true;
   4301       break;
   4302     }
   4303   }
   4304 
   4305   // Check if B comes from one of C, D, E, F.
   4306   for (unsigned Half = 0; Half != 4; ++Half) {
   4307     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
   4308       MatchB = true;
   4309       break;
   4310     }
   4311   }
   4312 
   4313   return MatchA && MatchB;
   4314 }
   4315 
   4316 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
   4317 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
   4318 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   4319   MVT VT = SVOp->getSimpleValueType(0);
   4320 
   4321   unsigned HalfSize = VT.getVectorNumElements()/2;
   4322 
   4323   unsigned FstHalf = 0, SndHalf = 0;
   4324   for (unsigned i = 0; i < HalfSize; ++i) {
   4325     if (SVOp->getMaskElt(i) > 0) {
   4326       FstHalf = SVOp->getMaskElt(i)/HalfSize;
   4327       break;
   4328     }
   4329   }
   4330   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
   4331     if (SVOp->getMaskElt(i) > 0) {
   4332       SndHalf = SVOp->getMaskElt(i)/HalfSize;
   4333       break;
   4334     }
   4335   }
   4336 
   4337   return (FstHalf | (SndHalf << 4));
   4338 }
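
// NOTE (annotation, not part of the original source): for v8f32 with mask
// <4, 5, 6, 7, 12, 13, 14, 15>, HalfSize == 4, FstHalf == 1 and SndHalf == 3,
// so the VPERM2F128 immediate is 1 | (3 << 4) == 0x31.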
   4339 
    4340 // Symmetric in-lane mask. Each lane has 4 elements (for imm8).
   4341 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
   4342   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   4343   if (EltSize < 32)
   4344     return false;
   4345 
   4346   unsigned NumElts = VT.getVectorNumElements();
   4347   Imm8 = 0;
   4348   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
   4349     for (unsigned i = 0; i != NumElts; ++i) {
   4350       if (Mask[i] < 0)
   4351         continue;
   4352       Imm8 |= Mask[i] << (i*2);
   4353     }
   4354     return true;
   4355   }
   4356 
   4357   unsigned LaneSize = 4;
   4358   SmallVector<int, 4> MaskVal(LaneSize, -1);
   4359 
   4360   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   4361     for (unsigned i = 0; i != LaneSize; ++i) {
   4362       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   4363         return false;
   4364       if (Mask[i+l] < 0)
   4365         continue;
   4366       if (MaskVal[i] < 0) {
   4367         MaskVal[i] = Mask[i+l] - l;
   4368         Imm8 |= MaskVal[i] << (i*2);
   4369         continue;
   4370       }
   4371       if (Mask[i+l] != (signed)(MaskVal[i]+l))
   4372         return false;
   4373     }
   4374   }
   4375   return true;
   4376 }
   4377 
   4378 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
   4379 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
    4380 /// Note that VPERMIL mask matching differs depending on whether the underlying
    4381 /// element type is 32 or 64 bits. For VPERMILPS the high half of the mask should
    4382 /// point to the same elements as the low half, but in the higher half of the source.
   4383 /// In VPERMILPD the two lanes could be shuffled independently of each other
   4384 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
   4385 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
   4386   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   4387   if (VT.getSizeInBits() < 256 || EltSize < 32)
   4388     return false;
    4389   bool symmetricMaskRequired = (EltSize == 32);
   4390   unsigned NumElts = VT.getVectorNumElements();
   4391 
   4392   unsigned NumLanes = VT.getSizeInBits()/128;
   4393   unsigned LaneSize = NumElts/NumLanes;
   4394   // 2 or 4 elements in one lane
   4395 
   4396   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
   4397   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   4398     for (unsigned i = 0; i != LaneSize; ++i) {
   4399       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   4400         return false;
    4401       if (symmetricMaskRequired) {
   4402         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
   4403           ExpectedMaskVal[i] = Mask[i+l] - l;
   4404           continue;
   4405         }
   4406         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
   4407           return false;
   4408       }
   4409     }
   4410   }
   4411   return true;
   4412 }
   4413 
    4414 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
    4415 /// x86 MOVSS wants: the lowest element must be the lowest element of vector 2,
    4416 /// and the other elements must come from vector 1 in order.
   4417 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
   4418                                bool V2IsSplat = false, bool V2IsUndef = false) {
   4419   if (!VT.is128BitVector())
   4420     return false;
   4421 
   4422   unsigned NumOps = VT.getVectorNumElements();
   4423   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
   4424     return false;
   4425 
   4426   if (!isUndefOrEqual(Mask[0], 0))
   4427     return false;
   4428 
   4429   for (unsigned i = 1; i != NumOps; ++i)
   4430     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
   4431           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
   4432           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
   4433       return false;
   4434 
   4435   return true;
   4436 }
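
// NOTE (annotation, not part of the original source): for v4i32,
// Mask = <0, 5, 6, 7> is a commuted MOVL pattern: after swapping the two
// sources it becomes the canonical <4, 1, 2, 3> MOVSS form that inserts a
// single scalar into the low element.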
   4437 
   4438 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4439 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
   4440 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
   4441 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
   4442                            const X86Subtarget *Subtarget) {
   4443   if (!Subtarget->hasSSE3())
   4444     return false;
   4445 
   4446   unsigned NumElems = VT.getVectorNumElements();
   4447 
   4448   if ((VT.is128BitVector() && NumElems != 4) ||
   4449       (VT.is256BitVector() && NumElems != 8) ||
   4450       (VT.is512BitVector() && NumElems != 16))
   4451     return false;
   4452 
   4453   // "i+1" is the value the indexed mask element must have
   4454   for (unsigned i = 0; i != NumElems; i += 2)
   4455     if (!isUndefOrEqual(Mask[i], i+1) ||
   4456         !isUndefOrEqual(Mask[i+1], i+1))
   4457       return false;
   4458 
   4459   return true;
   4460 }
   4461 
   4462 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4463 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
   4464 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
   4465 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
   4466                            const X86Subtarget *Subtarget) {
   4467   if (!Subtarget->hasSSE3())
   4468     return false;
   4469 
   4470   unsigned NumElems = VT.getVectorNumElements();
   4471 
   4472   if ((VT.is128BitVector() && NumElems != 4) ||
   4473       (VT.is256BitVector() && NumElems != 8) ||
   4474       (VT.is512BitVector() && NumElems != 16))
   4475     return false;
   4476 
   4477   // "i" is the value the indexed mask element must have
   4478   for (unsigned i = 0; i != NumElems; i += 2)
   4479     if (!isUndefOrEqual(Mask[i], i) ||
   4480         !isUndefOrEqual(Mask[i+1], i))
   4481       return false;
   4482 
   4483   return true;
   4484 }
   4485 
   4486 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
   4487 /// specifies a shuffle of elements that is suitable for input to 256-bit
   4488 /// version of MOVDDUP.
   4489 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   4490   if (!HasFp256 || !VT.is256BitVector())
   4491     return false;
   4492 
   4493   unsigned NumElts = VT.getVectorNumElements();
   4494   if (NumElts != 4)
   4495     return false;
   4496 
   4497   for (unsigned i = 0; i != NumElts/2; ++i)
   4498     if (!isUndefOrEqual(Mask[i], 0))
   4499       return false;
   4500   for (unsigned i = NumElts/2; i != NumElts; ++i)
   4501     if (!isUndefOrEqual(Mask[i], NumElts/2))
   4502       return false;
   4503   return true;
   4504 }
   4505 
   4506 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4507 /// specifies a shuffle of elements that is suitable for input to 128-bit
   4508 /// version of MOVDDUP.
   4509 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
   4510   if (!VT.is128BitVector())
   4511     return false;
   4512 
   4513   unsigned e = VT.getVectorNumElements() / 2;
   4514   for (unsigned i = 0; i != e; ++i)
   4515     if (!isUndefOrEqual(Mask[i], i))
   4516       return false;
   4517   for (unsigned i = 0; i != e; ++i)
   4518     if (!isUndefOrEqual(Mask[e+i], i))
   4519       return false;
   4520   return true;
   4521 }
   4522 
   4523 /// isVEXTRACTIndex - Return true if the specified
   4524 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
   4525 /// suitable for instruction that extract 128 or 256 bit vectors
   4526 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
   4527   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4528   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4529     return false;
   4530 
   4531   // The index should be aligned on a vecWidth-bit boundary.
   4532   uint64_t Index =
   4533     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4534 
   4535   MVT VT = N->getSimpleValueType(0);
   4536   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4537   bool Result = (Index * ElSize) % vecWidth == 0;
   4538 
   4539   return Result;
   4540 }
   4541 
   4542 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
   4543 /// operand specifies a subvector insert that is suitable for input to
   4544 /// insertion of 128 or 256-bit subvectors
   4545 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
   4546   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4547   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4548     return false;
   4549   // The index should be aligned on a vecWidth-bit boundary.
   4550   uint64_t Index =
   4551     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4552 
   4553   MVT VT = N->getSimpleValueType(0);
   4554   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4555   bool Result = (Index * ElSize) % vecWidth == 0;
   4556 
   4557   return Result;
   4558 }
   4559 
   4560 bool X86::isVINSERT128Index(SDNode *N) {
   4561   return isVINSERTIndex(N, 128);
   4562 }
   4563 
   4564 bool X86::isVINSERT256Index(SDNode *N) {
   4565   return isVINSERTIndex(N, 256);
   4566 }
   4567 
   4568 bool X86::isVEXTRACT128Index(SDNode *N) {
   4569   return isVEXTRACTIndex(N, 128);
   4570 }
   4571 
   4572 bool X86::isVEXTRACT256Index(SDNode *N) {
   4573   return isVEXTRACTIndex(N, 256);
   4574 }
   4575 
   4576 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
   4577 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
   4578 /// Handles 128-bit and 256-bit.
   4579 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   4580   MVT VT = N->getSimpleValueType(0);
   4581 
   4582   assert((VT.getSizeInBits() >= 128) &&
   4583          "Unsupported vector type for PSHUF/SHUFP");
   4584 
   4585   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
   4586   // independently on 128-bit lanes.
   4587   unsigned NumElts = VT.getVectorNumElements();
   4588   unsigned NumLanes = VT.getSizeInBits()/128;
   4589   unsigned NumLaneElts = NumElts/NumLanes;
   4590 
   4591   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
   4592          "Only supports 2, 4 or 8 elements per lane");
   4593 
   4594   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
   4595   unsigned Mask = 0;
   4596   for (unsigned i = 0; i != NumElts; ++i) {
   4597     int Elt = N->getMaskElt(i);
   4598     if (Elt < 0) continue;
   4599     Elt &= NumLaneElts - 1;
   4600     unsigned ShAmt = (i << Shift) % 8;
   4601     Mask |= Elt << ShAmt;
   4602   }
   4603 
   4604   return Mask;
   4605 }
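
// NOTE (annotation, not part of the original source): for v4f32 with mask
// <3, 1, 2, 0>, each element contributes two bits at position 2*i, giving
// 0b00100111 == 0x27, the immediate SHUFPS/PSHUFD would use for this
// permutation.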
   4606 
   4607 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
   4608 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
   4609 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
   4610   MVT VT = N->getSimpleValueType(0);
   4611 
   4612   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
   4613          "Unsupported vector type for PSHUFHW");
   4614 
   4615   unsigned NumElts = VT.getVectorNumElements();
   4616 
   4617   unsigned Mask = 0;
   4618   for (unsigned l = 0; l != NumElts; l += 8) {
    4619     // 8 elements per lane, but we only care about the last 4.
   4620     for (unsigned i = 0; i < 4; ++i) {
   4621       int Elt = N->getMaskElt(l+i+4);
   4622       if (Elt < 0) continue;
   4623       Elt &= 0x3; // only 2-bits.
   4624       Mask |= Elt << (i * 2);
   4625     }
   4626   }
   4627 
   4628   return Mask;
   4629 }
   4630 
   4631 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
   4632 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
   4633 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
   4634   MVT VT = N->getSimpleValueType(0);
   4635 
   4636   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
    4637          "Unsupported vector type for PSHUFLW");
   4638 
   4639   unsigned NumElts = VT.getVectorNumElements();
   4640 
   4641   unsigned Mask = 0;
   4642   for (unsigned l = 0; l != NumElts; l += 8) {
    4643     // 8 elements per lane, but we only care about the first 4.
   4644     for (unsigned i = 0; i < 4; ++i) {
   4645       int Elt = N->getMaskElt(l+i);
   4646       if (Elt < 0) continue;
   4647       Elt &= 0x3; // only 2-bits
   4648       Mask |= Elt << (i * 2);
   4649     }
   4650   }
   4651 
   4652   return Mask;
   4653 }
   4654 
   4655 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
   4656 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
   4657 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   4658   MVT VT = SVOp->getSimpleValueType(0);
   4659   unsigned EltSize = VT.is512BitVector() ? 1 :
   4660     VT.getVectorElementType().getSizeInBits() >> 3;
   4661 
   4662   unsigned NumElts = VT.getVectorNumElements();
   4663   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   4664   unsigned NumLaneElts = NumElts/NumLanes;
   4665 
   4666   int Val = 0;
   4667   unsigned i;
   4668   for (i = 0; i != NumElts; ++i) {
   4669     Val = SVOp->getMaskElt(i);
   4670     if (Val >= 0)
   4671       break;
   4672   }
   4673   if (Val >= (int)NumElts)
   4674     Val -= NumElts - NumLaneElts;
   4675 
   4676   assert(Val - i > 0 && "PALIGNR imm should be positive");
   4677   return (Val - i) * EltSize;
   4678 }
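
// NOTE (annotation, not part of the original source): for v8i16 with mask
// <1, 2, 3, 4, 5, 6, 7, 8>, the first defined element is Val == 1 at i == 0
// and EltSize == 2 bytes, so the PALIGNR immediate is (1 - 0) * 2 == 2, a
// two-byte shift of the concatenated sources.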
   4679 
   4680 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   4681   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4682   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4683     llvm_unreachable("Illegal extract subvector for VEXTRACT");
   4684 
   4685   uint64_t Index =
   4686     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4687 
   4688   MVT VecVT = N->getOperand(0).getSimpleValueType();
   4689   MVT ElVT = VecVT.getVectorElementType();
   4690 
   4691   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4692   return Index / NumElemsPerChunk;
   4693 }
   4694 
   4695 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
   4696   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4697   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4698     llvm_unreachable("Illegal insert subvector for VINSERT");
   4699 
   4700   uint64_t Index =
   4701     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4702 
   4703   MVT VecVT = N->getSimpleValueType(0);
   4704   MVT ElVT = VecVT.getVectorElementType();
   4705 
   4706   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4707   return Index / NumElemsPerChunk;
   4708 }
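        // Worked example (illustration only): extracting the subvector at element
        // index 4 of a v8f32 with vecWidth = 128 gives NumElemsPerChunk = 128/32
        // = 4 and immediate 4/4 = 1, i.e. the upper 128-bit lane (vextractf128
        // with immediate 1). The same chunk arithmetic applies to the VINSERT
        // case above.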
   4709 
   4710 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
   4711 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
   4712 /// and VEXTRACTI128 instructions.
   4713 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
   4714   return getExtractVEXTRACTImmediate(N, 128);
   4715 }
   4716 
   4717 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
   4718 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
   4719 /// and VEXTRACTI64x4 instructions.
   4720 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
   4721   return getExtractVEXTRACTImmediate(N, 256);
   4722 }
   4723 
   4724 /// getInsertVINSERT128Immediate - Return the appropriate immediate
   4725 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
   4726 /// and VINSERTI128 instructions.
   4727 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
   4728   return getInsertVINSERTImmediate(N, 128);
   4729 }
   4730 
   4731 /// getInsertVINSERT256Immediate - Return the appropriate immediate
   4732 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
   4733 /// and VINSERTI64x4 instructions.
   4734 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   4735   return getInsertVINSERTImmediate(N, 256);
   4736 }
   4737 
   4738 /// isZero - Returns true if V is a constant integer zero.
   4739 static bool isZero(SDValue V) {
   4740   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   4741   return C && C->isNullValue();
   4742 }
   4743 
   4744 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
   4745 /// constant +0.0.
   4746 bool X86::isZeroNode(SDValue Elt) {
   4747   if (isZero(Elt))
   4748     return true;
   4749   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
   4750     return CFP->getValueAPF().isPosZero();
   4751   return false;
   4752 }
   4753 
   4754 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
   4755 /// their permute mask.
   4756 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
   4757                                     SelectionDAG &DAG) {
   4758   MVT VT = SVOp->getSimpleValueType(0);
   4759   unsigned NumElems = VT.getVectorNumElements();
   4760   SmallVector<int, 8> MaskVec;
   4761 
   4762   for (unsigned i = 0; i != NumElems; ++i) {
   4763     int Idx = SVOp->getMaskElt(i);
   4764     if (Idx >= 0) {
   4765       if (Idx < (int)NumElems)
   4766         Idx += NumElems;
   4767       else
   4768         Idx -= NumElems;
   4769     }
   4770     MaskVec.push_back(Idx);
   4771   }
   4772   return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
   4773                               SVOp->getOperand(0), &MaskVec[0]);
   4774 }
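        // Worked example (illustration only): commuting
        //   vector_shuffle V1, V2 <0,5,2,7>
        // produces
        //   vector_shuffle V2, V1 <4,1,6,3>
        // which selects exactly the same elements from the swapped operands.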
   4775 
   4776 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
   4777 /// match movhlps. The lower half elements should come from upper half of
   4778 /// V1 (and in order), and the upper half elements should come from the upper
   4779 /// half of V2 (and in order).
   4780 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
   4781   if (!VT.is128BitVector())
   4782     return false;
   4783   if (VT.getVectorNumElements() != 4)
   4784     return false;
   4785   for (unsigned i = 0, e = 2; i != e; ++i)
   4786     if (!isUndefOrEqual(Mask[i], i+2))
   4787       return false;
   4788   for (unsigned i = 2; i != 4; ++i)
   4789     if (!isUndefOrEqual(Mask[i], i+4))
   4790       return false;
   4791   return true;
   4792 }
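        // Example of a mask accepted above (illustration only): for v4f32,
        //   vector_shuffle V1, V2 <2,3,6,7>
        // places the upper half of V1 in the low half of the result and the
        // upper half of V2 in the high half, which is what movhlps computes;
        // undef entries in any of the four positions are also accepted.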
   4793 
   4794 /// isScalarLoadToVector - Returns true if the node is a scalar load that
   4795 /// is promoted to a vector. It also returns the LoadSDNode by reference if
   4796 /// required.
   4797 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
   4798   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
   4799     return false;
   4800   N = N->getOperand(0).getNode();
   4801   if (!ISD::isNON_EXTLoad(N))
   4802     return false;
   4803   if (LD)
   4804     *LD = cast<LoadSDNode>(N);
   4805   return true;
   4806 }
   4807 
   4808 // Test whether the given value is a vector value which will be legalized
   4809 // into a load.
   4810 static bool WillBeConstantPoolLoad(SDNode *N) {
   4811   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4812     return false;
   4813 
   4814   // Check for any non-constant elements.
   4815   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
   4816     switch (N->getOperand(i).getNode()->getOpcode()) {
   4817     case ISD::UNDEF:
   4818     case ISD::ConstantFP:
   4819     case ISD::Constant:
   4820       break;
   4821     default:
   4822       return false;
   4823     }
   4824 
   4825   // Vectors of all-zeros and all-ones are materialized with special
   4826   // instructions rather than being loaded.
   4827   return !ISD::isBuildVectorAllZeros(N) &&
   4828          !ISD::isBuildVectorAllOnes(N);
   4829 }
   4830 
   4831 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
   4832 /// match movlp{s|d}. The lower half elements should come from lower half of
   4833 /// V1 (and in order), and the upper half elements should come from the upper
   4834 /// half of V2 (and in order). And since V1 will become the source of the
   4835 /// MOVLP, it must be either a vector load or a scalar load to vector.
   4836 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
   4837                                ArrayRef<int> Mask, MVT VT) {
   4838   if (!VT.is128BitVector())
   4839     return false;
   4840 
   4841   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
   4842     return false;
   4843   // If V2 is a vector load, don't do this transformation. We will try to use
   4844   // a load-folding shufps op instead.
   4845   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
   4846     return false;
   4847 
   4848   unsigned NumElems = VT.getVectorNumElements();
   4849 
   4850   if (NumElems != 2 && NumElems != 4)
   4851     return false;
   4852   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   4853     if (!isUndefOrEqual(Mask[i], i))
   4854       return false;
   4855   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
   4856     if (!isUndefOrEqual(Mask[i], i+NumElems))
   4857       return false;
   4858   return true;
   4859 }
   4860 
   4861 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
   4862 /// to a zero vector.
   4863 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
   4864 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
   4865   SDValue V1 = N->getOperand(0);
   4866   SDValue V2 = N->getOperand(1);
   4867   unsigned NumElems = N->getValueType(0).getVectorNumElements();
   4868   for (unsigned i = 0; i != NumElems; ++i) {
   4869     int Idx = N->getMaskElt(i);
   4870     if (Idx >= (int)NumElems) {
   4871       unsigned Opc = V2.getOpcode();
   4872       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
   4873         continue;
   4874       if (Opc != ISD::BUILD_VECTOR ||
   4875           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
   4876         return false;
   4877     } else if (Idx >= 0) {
   4878       unsigned Opc = V1.getOpcode();
   4879       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
   4880         continue;
   4881       if (Opc != ISD::BUILD_VECTOR ||
   4882           !X86::isZeroNode(V1.getOperand(Idx)))
   4883         return false;
   4884     }
   4885   }
   4886   return true;
   4887 }
   4888 
   4889 /// getZeroVector - Returns a vector of specified type with all zero elements.
   4890 ///
   4891 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
   4892                              SelectionDAG &DAG, SDLoc dl) {
   4893   assert(VT.isVector() && "Expected a vector type");
   4894 
   4895   // Always build SSE zero vectors as <4 x i32> bitcasted
   4896   // to their dest type. This ensures they get CSE'd.
   4897   SDValue Vec;
   4898   if (VT.is128BitVector()) {  // SSE
   4899     if (Subtarget->hasSSE2()) {  // SSE2
   4900       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4901       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4902     } else { // SSE1
   4903       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
   4904       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
   4905     }
   4906   } else if (VT.is256BitVector()) { // AVX
   4907     if (Subtarget->hasInt256()) { // AVX2
   4908       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4909       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4910       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
   4911     } else {
   4912       // 256-bit logic and arithmetic instructions in AVX are all
   4913       // floating-point, no support for integer ops. Emit fp zeroed vectors.
   4914       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
   4915       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4916       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
   4917     }
   4918   } else if (VT.is512BitVector()) { // AVX-512
   4919       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4920       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
   4921                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4922       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
   4923   } else if (VT.getScalarType() == MVT::i1) {
   4924     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
   4925     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
   4926     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
   4927     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   4928   } else
   4929     llvm_unreachable("Unexpected vector type");
   4930 
   4931   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4932 }
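        // Sketch of the CSE effect described above (illustration only): zero
        // vectors of every 128-bit type funnel through one v4i32 node, e.g.
        //   v2f64 zero = (v2f64 (bitcast (v4i32 build_vector 0,0,0,0)))
        //   v16i8 zero = (v16i8 (bitcast (v4i32 build_vector 0,0,0,0)))
        // so the DAG CSEs a single zero node plus cheap bitcasts.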
   4933 
   4934 /// getOnesVector - Returns a vector of specified type with all bits set.
   4935 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
   4936 /// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
   4937 /// Then bitcast to their original type, ensuring they get CSE'd.
   4938 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
   4939                              SDLoc dl) {
   4940   assert(VT.isVector() && "Expected a vector type");
   4941 
   4942   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
   4943   SDValue Vec;
   4944   if (VT.is256BitVector()) {
   4945     if (HasInt256) { // AVX2
   4946       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4947       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
   4948     } else { // AVX
   4949       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4950       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
   4951     }
   4952   } else if (VT.is128BitVector()) {
   4953     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4954   } else
   4955     llvm_unreachable("Unexpected vector type");
   4956 
   4957   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4958 }
   4959 
   4960 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
   4961 /// that point to V2 point to its first element.
   4962 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
   4963   for (unsigned i = 0; i != NumElems; ++i) {
   4964     if (Mask[i] > (int)NumElems) {
   4965       Mask[i] = NumElems;
   4966     }
   4967   }
   4968 }
   4969 
   4970 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
   4971 /// operation of specified width.
   4972 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
   4973                        SDValue V2) {
   4974   unsigned NumElems = VT.getVectorNumElements();
   4975   SmallVector<int, 8> Mask;
   4976   Mask.push_back(NumElems);
   4977   for (unsigned i = 1; i != NumElems; ++i)
   4978     Mask.push_back(i);
   4979   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4980 }
   4981 
   4982 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
   4983 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
   4984                           SDValue V2) {
   4985   unsigned NumElems = VT.getVectorNumElements();
   4986   SmallVector<int, 8> Mask;
   4987   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
   4988     Mask.push_back(i);
   4989     Mask.push_back(i + NumElems);
   4990   }
   4991   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4992 }
   4993 
   4994 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
   4995 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
   4996                           SDValue V2) {
   4997   unsigned NumElems = VT.getVectorNumElements();
   4998   SmallVector<int, 8> Mask;
   4999   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
   5000     Mask.push_back(i + Half);
   5001     Mask.push_back(i + NumElems + Half);
   5002   }
   5003   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   5004 }
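        // Example masks produced above (illustration only), for NumElems = 4:
        //   getUnpackl: <0, 4, 1, 5>  (interleaves the low halves of V1 and V2)
        //   getUnpackh: <2, 6, 3, 7>  (interleaves the high halves of V1 and V2)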
   5005 
   5006 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
   5007 // a generic shuffle instruction because the target has no such instructions.
   5008 // Generate shuffles which repeat i16 and i8 several times until they can be
   5009 // represented by v4f32 and then be manipulated by target supported shuffles.
   5010 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
   5011   MVT VT = V.getSimpleValueType();
   5012   int NumElems = VT.getVectorNumElements();
   5013   SDLoc dl(V);
   5014 
   5015   while (NumElems > 4) {
   5016     if (EltNo < NumElems/2) {
   5017       V = getUnpackl(DAG, dl, VT, V, V);
   5018     } else {
   5019       V = getUnpackh(DAG, dl, VT, V, V);
   5020       EltNo -= NumElems/2;
   5021     }
   5022     NumElems >>= 1;
   5023   }
   5024   return V;
   5025 }
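        // Worked example (illustration only): splatting element 5 of a v8i16.
        // Since NumElems = 8 > 4 and EltNo = 5 >= 4, one unpackh pass duplicates
        // the high elements, giving <4,4,5,5,6,6,7,7>, and EltNo becomes 1;
        // NumElems is now 4, so the loop stops and the splat can proceed as a
        // v4f32 shuffle (each f32 element holds the desired i16 twice).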
   5026 
   5027 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
   5028 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
   5029   MVT VT = V.getSimpleValueType();
   5030   SDLoc dl(V);
   5031 
   5032   if (VT.is128BitVector()) {
   5033     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
   5034     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
   5035     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
   5036                              &SplatMask[0]);
   5037   } else if (VT.is256BitVector()) {
   5038     // To use VPERMILPS to splat scalars, the second half of indices must
   5039     // refer to the higher part, which is a duplication of the lower one,
   5040     // because VPERMILPS can only handle in-lane permutations.
   5041     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
   5042                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
   5043 
   5044     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
   5045     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
   5046                              &SplatMask[0]);
   5047   } else
   5048     llvm_unreachable("Vector size not supported");
   5049 
   5050   return DAG.getNode(ISD::BITCAST, dl, VT, V);
   5051 }
   5052 
   5053 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
   5054 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   5055   MVT SrcVT = SV->getSimpleValueType(0);
   5056   SDValue V1 = SV->getOperand(0);
   5057   SDLoc dl(SV);
   5058 
   5059   int EltNo = SV->getSplatIndex();
   5060   int NumElems = SrcVT.getVectorNumElements();
   5061   bool Is256BitVec = SrcVT.is256BitVector();
   5062 
   5063   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
   5064          "Unknown how to promote splat for type");
   5065 
   5066   // Extract the 128-bit part containing the splat element and update
   5067   // the splat element index when it refers to the higher register.
   5068   if (Is256BitVec) {
   5069     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
   5070     if (EltNo >= NumElems/2)
   5071       EltNo -= NumElems/2;
   5072   }
   5073 
   5074   // All i16 and i8 vector types can't be used directly by a generic shuffle
   5075   // instruction because the target has no such instruction. Generate shuffles
   5076   // which repeat i16 and i8 several times until they fit in i32, and then can
   5077   // be manipulated by target supported shuffles.
   5078   MVT EltVT = SrcVT.getVectorElementType();
   5079   if (EltVT == MVT::i8 || EltVT == MVT::i16)
   5080     V1 = PromoteSplati8i16(V1, DAG, EltNo);
   5081 
   5082   // Recreate the 256-bit vector and place the same 128-bit vector
   5083   // into the low and high part. This is necessary because we want
   5084   // to use VPERM* to shuffle the vectors
   5085   if (Is256BitVec) {
   5086     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
   5087   }
   5088 
   5089   return getLegalSplat(DAG, V1, EltNo);
   5090 }
   5091 
   5092 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
   5093 /// vector and a zero or undef vector.  This produces a shuffle where the low
   5094 /// element of V2 is swizzled into the zero/undef vector, landing at element
   5095 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
   5096 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
   5097                                            bool IsZero,
   5098                                            const X86Subtarget *Subtarget,
   5099                                            SelectionDAG &DAG) {
   5100   MVT VT = V2.getSimpleValueType();
   5101   SDValue V1 = IsZero
   5102     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   5103   unsigned NumElems = VT.getVectorNumElements();
   5104   SmallVector<int, 16> MaskVec;
   5105   for (unsigned i = 0; i != NumElems; ++i)
   5106     // If this is the insertion idx, put the low elt of V2 here.
   5107     MaskVec.push_back(i == Idx ? NumElems : i);
   5108   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
   5109 }
   5110 
   5111 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
   5112 /// target specific opcode. Returns true if the Mask could be calculated.
   5113 /// Sets IsUnary to true if the shuffle uses only one source.
   5114 static bool getTargetShuffleMask(SDNode *N, MVT VT,
   5115                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   5116   unsigned NumElems = VT.getVectorNumElements();
   5117   SDValue ImmN;
   5118 
   5119   IsUnary = false;
   5120   switch(N->getOpcode()) {
   5121   case X86ISD::SHUFP:
   5122     ImmN = N->getOperand(N->getNumOperands()-1);
   5123     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5124     break;
   5125   case X86ISD::UNPCKH:
   5126     DecodeUNPCKHMask(VT, Mask);
   5127     break;
   5128   case X86ISD::UNPCKL:
   5129     DecodeUNPCKLMask(VT, Mask);
   5130     break;
   5131   case X86ISD::MOVHLPS:
   5132     DecodeMOVHLPSMask(NumElems, Mask);
   5133     break;
   5134   case X86ISD::MOVLHPS:
   5135     DecodeMOVLHPSMask(NumElems, Mask);
   5136     break;
   5137   case X86ISD::PALIGNR:
   5138     ImmN = N->getOperand(N->getNumOperands()-1);
   5139     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5140     break;
   5141   case X86ISD::PSHUFD:
   5142   case X86ISD::VPERMILP:
   5143     ImmN = N->getOperand(N->getNumOperands()-1);
   5144     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5145     IsUnary = true;
   5146     break;
   5147   case X86ISD::PSHUFHW:
   5148     ImmN = N->getOperand(N->getNumOperands()-1);
   5149     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5150     IsUnary = true;
   5151     break;
   5152   case X86ISD::PSHUFLW:
   5153     ImmN = N->getOperand(N->getNumOperands()-1);
   5154     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5155     IsUnary = true;
   5156     break;
   5157   case X86ISD::VPERMI:
   5158     ImmN = N->getOperand(N->getNumOperands()-1);
   5159     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5160     IsUnary = true;
   5161     break;
   5162   case X86ISD::MOVSS:
   5163   case X86ISD::MOVSD: {
   5164     // The index 0 always comes from the first element of the second source,
   5165     // this is why MOVSS and MOVSD are used in the first place. The other
   5166     // elements come from the other positions of the first source vector
   5167     Mask.push_back(NumElems);
   5168     for (unsigned i = 1; i != NumElems; ++i) {
   5169       Mask.push_back(i);
   5170     }
   5171     break;
   5172   }
   5173   case X86ISD::VPERM2X128:
   5174     ImmN = N->getOperand(N->getNumOperands()-1);
   5175     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5176     if (Mask.empty()) return false;
   5177     break;
   5178   case X86ISD::MOVDDUP:
   5179   case X86ISD::MOVLHPD:
   5180   case X86ISD::MOVLPD:
   5181   case X86ISD::MOVLPS:
   5182   case X86ISD::MOVSHDUP:
   5183   case X86ISD::MOVSLDUP:
   5184     // Not yet implemented
   5185     return false;
   5186   default: llvm_unreachable("unknown target shuffle node");
   5187   }
   5188 
   5189   return true;
   5190 }
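        // Minimal usage sketch (hypothetical caller, for illustration only):
        //   SmallVector<int, 16> ShuffleMask;
        //   bool IsUnary;
        //   if (getTargetShuffleMask(N, N->getSimpleValueType(0), ShuffleMask,
        //                            IsUnary))
        //     ...  // ShuffleMask holds one index (or -1 for undef) per element.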
   5191 
   5192 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
   5193 /// element of the result of the vector shuffle.
   5194 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   5195                                    unsigned Depth) {
   5196   if (Depth == 6)
   5197     return SDValue();  // Limit search depth.
   5198 
   5199   SDValue V = SDValue(N, 0);
   5200   EVT VT = V.getValueType();
   5201   unsigned Opcode = V.getOpcode();
   5202 
   5203   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
   5204   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
   5205     int Elt = SV->getMaskElt(Index);
   5206 
   5207     if (Elt < 0)
   5208       return DAG.getUNDEF(VT.getVectorElementType());
   5209 
   5210     unsigned NumElems = VT.getVectorNumElements();
   5211     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
   5212                                          : SV->getOperand(1);
   5213     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   5214   }
   5215 
   5216   // Recurse into target specific vector shuffles to find scalars.
   5217   if (isTargetShuffle(Opcode)) {
   5218     MVT ShufVT = V.getSimpleValueType();
   5219     unsigned NumElems = ShufVT.getVectorNumElements();
   5220     SmallVector<int, 16> ShuffleMask;
   5221     bool IsUnary;
   5222 
   5223     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
   5224       return SDValue();
   5225 
   5226     int Elt = ShuffleMask[Index];
   5227     if (Elt < 0)
   5228       return DAG.getUNDEF(ShufVT.getVectorElementType());
   5229 
   5230     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
   5231                                          : N->getOperand(1);
   5232     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
   5233                                Depth+1);
   5234   }
   5235 
   5236   // Actual nodes that may contain scalar elements
   5237   if (Opcode == ISD::BITCAST) {
   5238     V = V.getOperand(0);
   5239     EVT SrcVT = V.getValueType();
   5240     unsigned NumElems = VT.getVectorNumElements();
   5241 
   5242     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
   5243       return SDValue();
   5244   }
   5245 
   5246   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   5247     return (Index == 0) ? V.getOperand(0)
   5248                         : DAG.getUNDEF(VT.getVectorElementType());
   5249 
   5250   if (V.getOpcode() == ISD::BUILD_VECTOR)
   5251     return V.getOperand(Index);
   5252 
   5253   return SDValue();
   5254 }
   5255 
   5256 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
   5257 /// shuffle operation which consecutively come from zeros. The
   5258 /// search can start in two different directions, from left or right.
   5259 /// We count undefs as zeros until PreferredNum is reached.
   5260 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
   5261                                          unsigned NumElems, bool ZerosFromLeft,
   5262                                          SelectionDAG &DAG,
   5263                                          unsigned PreferredNum = -1U) {
   5264   unsigned NumZeros = 0;
   5265   for (unsigned i = 0; i != NumElems; ++i) {
   5266     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
   5267     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
   5268     if (!Elt.getNode())
   5269       break;
   5270 
   5271     if (X86::isZeroNode(Elt))
   5272       ++NumZeros;
   5273     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
   5274       NumZeros = std::min(NumZeros + 1, PreferredNum);
   5275     else
   5276       break;
   5277   }
   5278 
   5279   return NumZeros;
   5280 }
   5281 
   5282 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
   5283 /// correspond consecutively to elements from one of the vector operands,
   5284 /// starting from its index OpIdx. Also sets OpNum to indicate which source
   5285 /// vector operand the elements come from.
   5285 static
   5286 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
   5287                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
   5288                               unsigned NumElems, unsigned &OpNum) {
   5289   bool SeenV1 = false;
   5290   bool SeenV2 = false;
   5291 
   5292   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
   5293     int Idx = SVOp->getMaskElt(i);
   5294     // Ignore undef indices
   5295     if (Idx < 0)
   5296       continue;
   5297 
   5298     if (Idx < (int)NumElems)
   5299       SeenV1 = true;
   5300     else
   5301       SeenV2 = true;
   5302 
   5303     // Only accept consecutive elements from the same vector
   5304     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
   5305       return false;
   5306   }
   5307 
   5308   OpNum = SeenV1 ? 0 : 1;
   5309   return true;
   5310 }
   5311 
   5312 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
   5313 /// logical right shift of a vector.
   5314 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   5315                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   5316   unsigned NumElems =
   5317     SVOp->getSimpleValueType(0).getVectorNumElements();
   5318   unsigned NumZeros = getNumOfConsecutiveZeros(
   5319       SVOp, NumElems, false /* check zeros from right */, DAG,
   5320       SVOp->getMaskElt(0));
   5321   unsigned OpSrc;
   5322 
   5323   if (!NumZeros)
   5324     return false;
   5325 
   5326   // Considering the elements in the mask that are not consecutive zeros,
   5327   // check if they consecutively come from only one of the source vectors.
   5328   //
   5329   //               V1 = {X, A, B, C}     0
   5330   //                         \  \  \    /
   5331   //   vector_shuffle V1, V2 <1, 2, 3, X>
   5332   //
   5333   if (!isShuffleMaskConsecutive(SVOp,
   5334             0,                   // Mask Start Index
   5335             NumElems-NumZeros,   // Mask End Index(exclusive)
   5336             NumZeros,            // Where to start looking in the src vector
   5337             NumElems,            // Number of elements in vector
   5338             OpSrc))              // Which source operand ?
   5339     return false;
   5340 
   5341   isLeft = false;
   5342   ShAmt = NumZeros;
   5343   ShVal = SVOp->getOperand(OpSrc);
   5344   return true;
   5345 }
   5346 
   5347 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
   5348 /// logical left shift of a vector.
   5349 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   5350                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   5351   unsigned NumElems =
   5352     SVOp->getSimpleValueType(0).getVectorNumElements();
   5353   unsigned NumZeros = getNumOfConsecutiveZeros(
   5354       SVOp, NumElems, true /* check zeros from left */, DAG,
   5355       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
   5356   unsigned OpSrc;
   5357 
   5358   if (!NumZeros)
   5359     return false;
   5360 
   5361   // Considering the elements in the mask that are not consecutive zeros,
   5362   // check if they consecutively come from only one of the source vectors.
   5363   //
   5364   //                           0    { A, B, X, X } = V2
   5365   //                          / \    /  /
   5366   //   vector_shuffle V1, V2 <X, X, 4, 5>
   5367   //
   5368   if (!isShuffleMaskConsecutive(SVOp,
   5369             NumZeros,     // Mask Start Index
   5370             NumElems,     // Mask End Index(exclusive)
   5371             0,            // Where to start looking in the src vector
   5372             NumElems,     // Number of elements in vector
   5373             OpSrc))       // Which source operand ?
   5374     return false;
   5375 
   5376   isLeft = true;
   5377   ShAmt = NumZeros;
   5378   ShVal = SVOp->getOperand(OpSrc);
   5379   return true;
   5380 }
   5381 
   5382 /// isVectorShift - Returns true if the shuffle can be implemented as a
   5383 /// logical left or right shift of a vector.
   5384 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   5385                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   5386   // Although the logic below supports any bit width, there are no
   5387   // shift instructions which handle more than 128-bit vectors.
   5388   if (!SVOp->getSimpleValueType(0).is128BitVector())
   5389     return false;
   5390 
   5391   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
   5392       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
   5393     return true;
   5394 
   5395   return false;
   5396 }
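        // Worked example (illustration only): with V2 an all-zeros build_vector,
        // the v8i16 shuffle
        //   vector_shuffle V1, V2 <2,3,4,5,6,7,8,8>
        // has two zeros on the right (both 8s read the zero V2[0]) and mask
        // elements 0..5 read V1[2..7] consecutively, so it is matched as a
        // logical right shift by 2 elements (a 4-byte PSRLDQ).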
   5397 
   5398 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
   5399 ///
   5400 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   5401                                        unsigned NumNonZero, unsigned NumZero,
   5402                                        SelectionDAG &DAG,
   5403                                        const X86Subtarget* Subtarget,
   5404                                        const TargetLowering &TLI) {
   5405   if (NumNonZero > 8)
   5406     return SDValue();
   5407 
   5408   SDLoc dl(Op);
   5409   SDValue V;
   5410   bool First = true;
   5411   for (unsigned i = 0; i < 16; ++i) {
   5412     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
   5413     if (ThisIsNonZero && First) {
   5414       if (NumZero)
   5415         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   5416       else
   5417         V = DAG.getUNDEF(MVT::v8i16);
   5418       First = false;
   5419     }
   5420 
   5421     if ((i & 1) != 0) {
   5422       SDValue ThisElt, LastElt;
   5423       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
   5424       if (LastIsNonZero) {
   5425         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
   5426                               MVT::i16, Op.getOperand(i-1));
   5427       }
   5428       if (ThisIsNonZero) {
   5429         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
   5430         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
   5431                               ThisElt, DAG.getConstant(8, MVT::i8));
   5432         if (LastIsNonZero)
   5433           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
   5434       } else
   5435         ThisElt = LastElt;
   5436 
   5437       if (ThisElt.getNode())
   5438         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
   5439                         DAG.getIntPtrConstant(i/2));
   5440     }
   5441   }
   5442 
   5443   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
   5444 }
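        // Worked example (illustration only): adjacent bytes b0 and b1 of the
        // build_vector are combined into 16-bit word 0 as
        //   w0 = ((zext b1) << 8) | (zext b0)
        // and inserted into the v8i16; the final bitcast back to v16i8 restores
        // the little-endian byte order.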
   5445 
   5446 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
   5447 ///
   5448 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   5449                                      unsigned NumNonZero, unsigned NumZero,
   5450                                      SelectionDAG &DAG,
   5451                                      const X86Subtarget* Subtarget,
   5452                                      const TargetLowering &TLI) {
   5453   if (NumNonZero > 4)
   5454     return SDValue();
   5455 
   5456   SDLoc dl(Op);
   5457   SDValue V;
   5458   bool First = true;
   5459   for (unsigned i = 0; i < 8; ++i) {
   5460     bool isNonZero = (NonZeros & (1 << i)) != 0;
   5461     if (isNonZero) {
   5462       if (First) {
   5463         if (NumZero)
   5464           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   5465         else
   5466           V = DAG.getUNDEF(MVT::v8i16);
   5467         First = false;
   5468       }
   5469       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   5470                       MVT::v8i16, V, Op.getOperand(i),
   5471                       DAG.getIntPtrConstant(i));
   5472     }
   5473   }
   5474 
   5475   return V;
   5476 }
   5477 
   5478 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
   5479 static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
   5480                                      unsigned NonZeros, unsigned NumNonZero,
   5481                                      unsigned NumZero, SelectionDAG &DAG,
   5482                                      const X86Subtarget *Subtarget,
   5483                                      const TargetLowering &TLI) {
   5484   // We know there's at least one non-zero element
   5485   unsigned FirstNonZeroIdx = 0;
   5486   SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
   5487   while (FirstNonZero.getOpcode() == ISD::UNDEF ||
   5488          X86::isZeroNode(FirstNonZero)) {
   5489     ++FirstNonZeroIdx;
   5490     FirstNonZero = Op->getOperand(FirstNonZeroIdx);
   5491   }
   5492 
   5493   if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   5494       !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
   5495     return SDValue();
   5496 
   5497   SDValue V = FirstNonZero.getOperand(0);
   5498   MVT VVT = V.getSimpleValueType();
   5499   if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
   5500     return SDValue();
   5501 
   5502   unsigned FirstNonZeroDst =
   5503       cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
   5504   unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
   5505   unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
   5506   unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
   5507 
   5508   for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
   5509     SDValue Elem = Op.getOperand(Idx);
   5510     if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
   5511       continue;
   5512 
   5513     // TODO: What else can be here? Deal with it.
   5514     if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   5515       return SDValue();
   5516 
   5517     // TODO: Some optimizations are still possible here
   5518     // ex: Getting one element from a vector, and the rest from another.
   5519     if (Elem.getOperand(0) != V)
   5520       return SDValue();
   5521 
   5522     unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
   5523     if (Dst == Idx)
   5524       ++CorrectIdx;
   5525     else if (IncorrectIdx == -1U) {
   5526       IncorrectIdx = Idx;
   5527       IncorrectDst = Dst;
   5528     } else
   5529       // There was already one element with an incorrect index.
   5530       // We can't optimize this case to an insertps.
   5531       return SDValue();
   5532   }
   5533 
   5534   if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
   5535     SDLoc dl(Op);
   5536     EVT VT = Op.getSimpleValueType();
   5537     unsigned ElementMoveMask = 0;
   5538     if (IncorrectIdx == -1U)
   5539       ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
   5540     else
   5541       ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
   5542 
   5543     SDValue InsertpsMask =
   5544         DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
   5545     return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
   5546   }
   5547 
   5548   return SDValue();
   5549 }
   5550 
   5551 /// getVShift - Return a vector logical shift node.
   5552 ///
   5553 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   5554                          unsigned NumBits, SelectionDAG &DAG,
   5555                          const TargetLowering &TLI, SDLoc dl) {
   5556   assert(VT.is128BitVector() && "Unknown type for VShift");
   5557   EVT ShVT = MVT::v2i64;
   5558   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   5559   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
   5560   return DAG.getNode(ISD::BITCAST, dl, VT,
   5561                      DAG.getNode(Opc, dl, ShVT, SrcOp,
   5562                              DAG.getConstant(NumBits,
   5563                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
   5564 }
   5565 
   5566 static SDValue
   5567 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
   5568 
   5569   // Check if the scalar load can be widened into a vector load. And if
   5570   // the address is "base + cst", see if the cst can be "absorbed" into
   5571   // the shuffle mask.
   5572   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
   5573     SDValue Ptr = LD->getBasePtr();
   5574     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
   5575       return SDValue();
   5576     EVT PVT = LD->getValueType(0);
   5577     if (PVT != MVT::i32 && PVT != MVT::f32)
   5578       return SDValue();
   5579 
   5580     int FI = -1;
   5581     int64_t Offset = 0;
   5582     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
   5583       FI = FINode->getIndex();
   5584       Offset = 0;
   5585     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
   5586                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
   5587       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
   5588       Offset = Ptr.getConstantOperandVal(1);
   5589       Ptr = Ptr.getOperand(0);
   5590     } else {
   5591       return SDValue();
   5592     }
   5593 
   5594     // FIXME: 256-bit vector instructions don't require a strict alignment,
   5595     // improve this code to support it better.
   5596     unsigned RequiredAlign = VT.getSizeInBits()/8;
   5597     SDValue Chain = LD->getChain();
   5598     // Make sure the stack object alignment is at least 16 or 32.
   5599     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   5600     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
   5601       if (MFI->isFixedObjectIndex(FI)) {
   5602         // Can't change the alignment. FIXME: It's possible to compute
   5603         // the exact stack offset and reference FI + adjust offset instead.
   5604         // If someone *really* cares about this. That's the way to implement it.
   5605         return SDValue();
   5606       } else {
   5607         MFI->setObjectAlignment(FI, RequiredAlign);
   5608       }
   5609     }
   5610 
   5611     // (Offset % 16 or 32) must be a multiple of 4. The address is then
   5612     // Ptr + (Offset & ~15).
   5613     if (Offset < 0)
   5614       return SDValue();
   5615     if ((Offset % RequiredAlign) & 3)
   5616       return SDValue();
   5617     int64_t StartOffset = Offset & ~(RequiredAlign-1);
   5618     if (StartOffset)
   5619       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
   5620                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
   5621 
   5622     int EltNo = (Offset - StartOffset) >> 2;
   5623     unsigned NumElems = VT.getVectorNumElements();
   5624 
   5625     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
   5626     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
   5627                              LD->getPointerInfo().getWithOffset(StartOffset),
   5628                              false, false, false, 0);
   5629 
   5630     SmallVector<int, 8> Mask;
   5631     for (unsigned i = 0; i != NumElems; ++i)
   5632       Mask.push_back(EltNo);
   5633 
   5634     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   5635   }
   5636 
   5637   return SDValue();
   5638 }
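        // Worked example (illustration only): an f32 load from a stack slot at
        // offset base+4, splatted to v4f32 (RequiredAlign = 16). Offset = 4
        // passes the (Offset % 16) & 3 check, StartOffset = 4 & ~15 = 0 and
        // EltNo = 1, so a single aligned v4f32 load at base+0 is emitted and
        // shuffled with mask <1,1,1,1>.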
   5639 
   5640 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
   5641 /// vector of type 'VT', see if the elements can be replaced by a single large
   5642 /// load which has the same value as a build_vector whose operands are 'elts'.
   5643 ///
   5644 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
   5645 ///
   5646 /// FIXME: we'd also like to handle the case where the last elements are zero
   5647 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
   5648 /// There's even a handy isZeroNode for that purpose.
   5649 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   5650                                         SDLoc &DL, SelectionDAG &DAG,
   5651                                         bool isAfterLegalize) {
   5652   EVT EltVT = VT.getVectorElementType();
   5653   unsigned NumElems = Elts.size();
   5654 
   5655   LoadSDNode *LDBase = nullptr;
   5656   unsigned LastLoadedElt = -1U;
   5657 
   5658   // For each element in the initializer, see if we've found a load or an undef.
   5659   // If we don't find an initial load element, or later load elements are
   5660   // non-consecutive, bail out.
   5661   for (unsigned i = 0; i < NumElems; ++i) {
   5662     SDValue Elt = Elts[i];
   5663 
   5664     if (!Elt.getNode() ||
   5665         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
   5666       return SDValue();
   5667     if (!LDBase) {
   5668       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
   5669         return SDValue();
   5670       LDBase = cast<LoadSDNode>(Elt.getNode());
   5671       LastLoadedElt = i;
   5672       continue;
   5673     }
   5674     if (Elt.getOpcode() == ISD::UNDEF)
   5675       continue;
   5676 
   5677     LoadSDNode *LD = cast<LoadSDNode>(Elt);
   5678     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
   5679       return SDValue();
   5680     LastLoadedElt = i;
   5681   }
   5682 
   5683   // If we have found an entire vector of loads and undefs, then return a large
   5684   // load of the entire vector width starting at the base pointer.  If we found
   5685   // consecutive loads for the low half, generate a vzext_load node.
   5686   if (LastLoadedElt == NumElems - 1) {
   5687 
   5688     if (isAfterLegalize &&
   5689         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
   5690       return SDValue();
   5691 
   5692     SDValue NewLd = SDValue();
   5693 
   5694     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
   5695       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   5696                           LDBase->getPointerInfo(),
   5697                           LDBase->isVolatile(), LDBase->isNonTemporal(),
   5698                           LDBase->isInvariant(), 0);
   5699                           LDBase->isInvariant(), 0);
        else
   5700       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   5701                           LDBase->getPointerInfo(),
   5702                           LDBase->isVolatile(), LDBase->isNonTemporal(),
   5703                           LDBase->isInvariant(), LDBase->getAlignment());
   5704     if (LDBase->hasAnyUseOfValue(1)) {
   5705       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   5706                                      SDValue(LDBase, 1),
   5707                                      SDValue(NewLd.getNode(), 1));
   5708       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   5709       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   5710                              SDValue(NewLd.getNode(), 1));
   5711     }
   5712 
   5713     return NewLd;
   5714   }
   5715   if (NumElems == 4 && LastLoadedElt == 1 &&
   5716       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
   5717     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
   5718     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
   5719     SDValue ResNode =
   5720         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
   5721                                 LDBase->getPointerInfo(),
   5722                                 LDBase->getAlignment(),
   5723                                 false/*isVolatile*/, true/*ReadMem*/,
   5724                                 false/*WriteMem*/);
   5725 
   5726     // Make sure the newly-created LOAD is in the same position as LDBase in
   5727     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
   5728     // update uses of LDBase's output chain to use the TokenFactor.
   5729     if (LDBase->hasAnyUseOfValue(1)) {
   5730       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   5731                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
   5732       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   5733       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   5734                              SDValue(ResNode.getNode(), 1));
   5735     }
   5736 
   5737     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
   5738   }
   5739   return SDValue();
   5740 }
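        // Worked example of the partial case (illustration only): a v4i32
        //   build_vector <(load [a]), (load [a+4]), undef, undef>
        // has LastLoadedElt == 1 and becomes
        //   (v4i32 (bitcast (X86ISD::VZEXT_LOAD [a])))
        // i.e. one i64 load zero-extended to the full 128 bits.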
   5741 
   5742 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
   5743 /// to generate a splat value for the following cases:
   5744 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
   5745 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
   5746 /// a scalar load, or a constant.
   5747 /// The VBROADCAST node is returned when a pattern is found,
   5748 /// or SDValue() otherwise.
   5749 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   5750                                     SelectionDAG &DAG) {
   5751   if (!Subtarget->hasFp256())
   5752     return SDValue();
   5753 
   5754   MVT VT = Op.getSimpleValueType();
   5755   SDLoc dl(Op);
   5756 
   5757   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
   5758          "Unsupported vector type for broadcast.");
   5759 
   5760   SDValue Ld;
   5761   bool ConstSplatVal;
   5762 
   5763   switch (Op.getOpcode()) {
   5764     default:
   5765       // Unknown pattern found.
   5766       return SDValue();
   5767 
   5768     case ISD::BUILD_VECTOR: {
   5769       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
   5770       BitVector UndefElements;
   5771       SDValue Splat = BVOp->getSplatValue(&UndefElements);
   5772 
   5773       // We need a splat of a single value to use broadcast, and it doesn't
   5774       // make any sense if the value is only in one element of the vector.
   5775       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
   5776         return SDValue();
   5777 
   5778       Ld = Splat;
   5779       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   5780                        Ld.getOpcode() == ISD::ConstantFP);
   5781 
   5782       // Make sure that all of the users of a non-constant load are from the
   5783       // BUILD_VECTOR node.
   5784       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
   5785         return SDValue();
   5786       break;
   5787     }
   5788 
   5789     case ISD::VECTOR_SHUFFLE: {
   5790       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5791 
   5792       // Shuffles must have a splat mask where the first element is
   5793       // broadcasted.
   5794       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
   5795         return SDValue();
   5796 
   5797       SDValue Sc = Op.getOperand(0);
   5798       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
   5799           Sc.getOpcode() != ISD::BUILD_VECTOR) {
   5800 
   5801         if (!Subtarget->hasInt256())
   5802           return SDValue();
   5803 
   5804         // Use the register form of the broadcast instruction available on AVX2.
   5805         if (VT.getSizeInBits() >= 256)
   5806           Sc = Extract128BitVector(Sc, 0, DAG, dl);
   5807         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
   5808       }
   5809 
   5810       Ld = Sc.getOperand(0);
   5811       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   5812                        Ld.getOpcode() == ISD::ConstantFP);
   5813 
   5814       // The scalar_to_vector node and the suspected
   5815       // load node must have exactly one user.
   5816       // Constants may have multiple users.
   5817 
   5818       // AVX-512 has a register version of the broadcast
   5819       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
   5820         Ld.getValueType().getSizeInBits() >= 32;
   5821       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
   5822           !hasRegVer))
   5823         return SDValue();
   5824       break;
   5825     }
   5826   }
   5827 
   5828   bool IsGE256 = (VT.getSizeInBits() >= 256);
   5829 
   5830   // Handle broadcasting a single constant scalar from the constant pool
   5831   // into a vector. On Sandybridge it is still better to load a constant vector
   5832   // from the constant pool and not to broadcast it from a scalar.
   5833   if (ConstSplatVal && Subtarget->hasInt256()) {
   5834     EVT CVT = Ld.getValueType();
   5835     assert(!CVT.isVector() && "Must not broadcast a vector type");
   5836     unsigned ScalarSize = CVT.getSizeInBits();
   5837 
   5838     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
   5839       const Constant *C = nullptr;
   5840       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
   5841         C = CI->getConstantIntValue();
   5842       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
   5843         C = CF->getConstantFPValue();
   5844 
   5845       assert(C && "Invalid constant type");
   5846 
   5847       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   5848       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
   5849       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   5850       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
   5851                        MachinePointerInfo::getConstantPool(),
   5852                        false, false, false, Alignment);
   5853 
   5854       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5855     }
   5856   }
   5857 
   5858   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
   5859   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   5860 
   5861   // Handle AVX2 in-register broadcasts.
   5862   if (!IsLoad && Subtarget->hasInt256() &&
   5863       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
   5864     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5865 
   5866   // The scalar source must be a normal load.
   5867   if (!IsLoad)
   5868     return SDValue();
   5869 
   5870   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
   5871     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5872 
   5873   // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
   5874   // match double, since there is no vbroadcastsd with an xmm destination.
   5875   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
   5876     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
   5877       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5878   }
   5879 
   5880   // Unsupported broadcast.
   5881   return SDValue();
   5882 }
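        // Example of a matched pattern (illustration only): on AVX, a
        //   (v8f32 build_vector (load [p]), ..., (load [p]))
        // splat whose load is used only by the build_vector becomes
        //   (v8f32 (X86ISD::VBROADCAST [p]))
        // which selects to vbroadcastss with a 32-bit memory operand.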
   5883 
   5884 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
   5885 /// underlying vector and index.
   5886 ///
   5887 /// Modifies \p ExtractedFromVec to the real vector and returns the real
   5888 /// index.
   5889 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
   5890                                          SDValue ExtIdx) {
   5891   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
   5892   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
   5893     return Idx;
   5894 
   5895   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
   5896   // lowered this:
   5897   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
   5898   // to:
   5899   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
   5900   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
   5901   //                           undef)
   5902   //                       Constant<0>)
   5903   // In this case the vector is the extract_subvector expression and the index
   5904   // is 2, as specified by the shuffle.
   5905   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
   5906   SDValue ShuffleVec = SVOp->getOperand(0);
   5907   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
   5908   assert(ShuffleVecVT.getVectorElementType() ==
   5909          ExtractedFromVec.getSimpleValueType().getVectorElementType());
   5910 
   5911   int ShuffleIdx = SVOp->getMaskElt(Idx);
   5912   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
   5913     ExtractedFromVec = ShuffleVec;
   5914     return ShuffleIdx;
   5915   }
   5916   return Idx;
   5917 }
   5918 
   5919 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
   5920   MVT VT = Op.getSimpleValueType();
   5921 
   5922   // Skip if insert_vec_elt is not supported.
   5923   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   5924   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
   5925     return SDValue();
   5926 
   5927   SDLoc DL(Op);
   5928   unsigned NumElems = Op.getNumOperands();
   5929 
   5930   SDValue VecIn1;
   5931   SDValue VecIn2;
   5932   SmallVector<unsigned, 4> InsertIndices;
   5933   SmallVector<int, 8> Mask(NumElems, -1);
   5934 
   5935   for (unsigned i = 0; i != NumElems; ++i) {
   5936     unsigned Opc = Op.getOperand(i).getOpcode();
   5937 
   5938     if (Opc == ISD::UNDEF)
   5939       continue;
   5940 
   5941     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
   5942       // Quit if more than 1 element needs inserting.
   5943       if (InsertIndices.size() > 1)
   5944         return SDValue();
   5945 
   5946       InsertIndices.push_back(i);
   5947       continue;
   5948     }
   5949 
   5950     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
   5951     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
   5952     // Quit if non-constant index.
   5953     if (!isa<ConstantSDNode>(ExtIdx))
   5954       return SDValue();
   5955     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
   5956 
   5957     // Quit if extracted from vector of different type.
   5958     if (ExtractedFromVec.getValueType() != VT)
   5959       return SDValue();
   5960 
   5961     if (!VecIn1.getNode())
   5962       VecIn1 = ExtractedFromVec;
   5963     else if (VecIn1 != ExtractedFromVec) {
   5964       if (!VecIn2.getNode())
   5965         VecIn2 = ExtractedFromVec;
   5966       else if (VecIn2 != ExtractedFromVec)
   5967         // Quit if more than 2 vectors to shuffle
   5968         return SDValue();
   5969     }
   5970 
   5971     if (ExtractedFromVec == VecIn1)
   5972       Mask[i] = Idx;
   5973     else if (ExtractedFromVec == VecIn2)
   5974       Mask[i] = Idx + NumElems;
   5975   }
   5976 
   5977   if (!VecIn1.getNode())
   5978     return SDValue();
   5979 
   5980   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
   5981   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
   5982   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
   5983     unsigned Idx = InsertIndices[i];
   5984     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
   5985                      DAG.getIntPtrConstant(Idx));
   5986   }
   5987 
   5988   return NV;
   5989 }
   5990 
   5991 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
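        // For example, an all-constant v8i1 <1,0,1,1,0,0,0,1> is packed into the
        // immediate 0x8D (bit i holds element i), materialized as an i16 constant,
        // bitcast to v16i1, and returned as the low v8i1 subvector.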
   5992 SDValue
   5993 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   5994 
   5995   MVT VT = Op.getSimpleValueType();
   5996   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
   5997          "Unexpected type in LowerBUILD_VECTORvXi1!");
   5998 
   5999   SDLoc dl(Op);
   6000   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   6001     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
   6002     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
   6003     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   6004   }
   6005 
   6006   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
   6007     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
   6008     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
   6009     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   6010   }
   6011 
   6012   bool AllConstants = true;
   6013   uint64_t Immediate = 0;
   6014   int NonConstIdx = -1;
   6015   bool IsSplat = true;
   6016   unsigned NumNonConsts = 0;
   6017   unsigned NumConsts = 0;
   6018   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
   6019     SDValue In = Op.getOperand(idx);
   6020     if (In.getOpcode() == ISD::UNDEF)
   6021       continue;
   6022     if (!isa<ConstantSDNode>(In)) {
   6023       AllConstants = false;
   6024       NonConstIdx = idx;
   6025       NumNonConsts++;
   6026     }
   6027     else {
   6028       NumConsts++;
   6029       if (cast<ConstantSDNode>(In)->getZExtValue())
   6030         Immediate |= (1ULL << idx);
   6031     }
   6032     if (In != Op.getOperand(0))
   6033       IsSplat = false;
   6034   }
   6035 
   6036   if (AllConstants) {
   6037     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
   6038       DAG.getConstant(Immediate, MVT::i16));
   6039     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
   6040                        DAG.getIntPtrConstant(0));
   6041   }
   6042 
   6043   if (NumNonConsts == 1 && NonConstIdx != 0) {
   6044     SDValue DstVec;
   6045     if (NumConsts) {
   6046       SDValue VecAsImm = DAG.getConstant(Immediate,
   6047                                          MVT::getIntegerVT(VT.getSizeInBits()));
   6048       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
   6049     }
   6050     else
   6051       DstVec = DAG.getUNDEF(VT);
   6052     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
   6053                        Op.getOperand(NonConstIdx),
   6054                        DAG.getIntPtrConstant(NonConstIdx));
   6055   }
   6056   if (!IsSplat && (NonConstIdx != 0))
   6057     llvm_unreachable("Unsupported BUILD_VECTOR operation");
   6058   MVT SelectVT = (VT == MVT::v16i1) ? MVT::i16 : MVT::i8;
   6059   SDValue Select;
   6060   if (IsSplat)
   6061     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
   6062                           DAG.getConstant(-1, SelectVT),
   6063                           DAG.getConstant(0, SelectVT));
   6064   else
   6065     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
   6066                          DAG.getConstant((Immediate | 1), SelectVT),
   6067                          DAG.getConstant(Immediate, SelectVT));
   6068   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
   6069 }
   6070 
   6071 /// \brief Return true if \p N implements a horizontal binop, placing the
   6072 /// operands of the horizontal binop into V0 and V1.
   6073 ///
   6074 /// This is a helper function of PerformBUILD_VECTORCombine.
   6075 /// It checks whether the input build_vector \p N implements a horizontal
   6076 /// operation. Parameter \p Opcode defines the kind of horizontal operation
   6077 /// to match.
   6078 /// For example, if \p Opcode is equal to ISD::ADD, then this function
   6079 /// checks whether \p N implements a horizontal arithmetic add; if instead
   6080 /// \p Opcode is equal to ISD::SUB, it checks for a horizontal
   6081 /// arithmetic sub.
   6082 ///
   6083 /// This function only analyzes elements of \p N whose indices are in the
   6084 /// range [BaseIdx, LastIdx).
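        ///
        /// For example, with \p Opcode == ISD::ADD and [BaseIdx, LastIdx) == [0, 4),
        /// the build_vector
        ///   ((add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
        ///    (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
        ///    (add (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
        ///    (add (extract_vector_elt B, 2), (extract_vector_elt B, 3)))
        /// is recognized as a horizontal add, with V0 = A and V1 = B.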
   6085 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
   6086                               SelectionDAG &DAG,
   6087                               unsigned BaseIdx, unsigned LastIdx,
   6088                               SDValue &V0, SDValue &V1) {
   6089   EVT VT = N->getValueType(0);
   6090 
   6091   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
   6092   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
   6093          "Invalid Vector in input!");
   6094 
   6095   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
   6096   bool CanFold = true;
   6097   unsigned ExpectedVExtractIdx = BaseIdx;
   6098   unsigned NumElts = LastIdx - BaseIdx;
   6099   V0 = DAG.getUNDEF(VT);
   6100   V1 = DAG.getUNDEF(VT);
   6101 
   6102   // Check if N implements a horizontal binop.
   6103   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
   6104     SDValue Op = N->getOperand(i + BaseIdx);
   6105 
   6106     // Skip UNDEFs.
   6107     if (Op->getOpcode() == ISD::UNDEF) {
   6108       // Update the expected vector extract index.
   6109       if (i * 2 == NumElts)
   6110         ExpectedVExtractIdx = BaseIdx;
   6111       ExpectedVExtractIdx += 2;
   6112       continue;
   6113     }
   6114 
   6115     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
   6116 
   6117     if (!CanFold)
   6118       break;
   6119 
   6120     SDValue Op0 = Op.getOperand(0);
   6121     SDValue Op1 = Op.getOperand(1);
   6122 
   6123     // Try to match the following pattern:
   6124     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
   6125     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   6126         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   6127         Op0.getOperand(0) == Op1.getOperand(0) &&
   6128         isa<ConstantSDNode>(Op0.getOperand(1)) &&
   6129         isa<ConstantSDNode>(Op1.getOperand(1)));
   6130     if (!CanFold)
   6131       break;
   6132 
   6133     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   6134     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
   6135 
   6136     if (i * 2 < NumElts) {
   6137       if (V0.getOpcode() == ISD::UNDEF)
   6138         V0 = Op0.getOperand(0);
   6139     } else {
   6140       if (V1.getOpcode() == ISD::UNDEF)
   6141         V1 = Op0.getOperand(0);
   6142       if (i * 2 == NumElts)
   6143         ExpectedVExtractIdx = BaseIdx;
   6144     }
   6145 
   6146     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
   6147     if (I0 == ExpectedVExtractIdx)
   6148       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
   6149     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
   6150       // Try to match the following dag sequence:
   6151       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
   6152       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
   6153     } else
   6154       CanFold = false;
   6155 
   6156     ExpectedVExtractIdx += 2;
   6157   }
   6158 
   6159   return CanFold;
   6160 }
   6161 
   6162 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
   6163 /// a concat_vector.
   6164 ///
   6165 /// This is a helper function of PerformBUILD_VECTORCombine.
   6166 /// This function expects two 256-bit vectors called V0 and V1.
   6167 /// At first, each vector is split into two separate 128-bit vectors.
   6168 /// Then, the resulting 128-bit vectors are used to implement two
   6169 /// horizontal binary operations.
   6170 ///
   6171 /// The kind of horizontal binary operation is defined by \p X86Opcode.
   6172 ///
   6173 /// \p Mode specifies how the 128-bit halves of V0 and V1 are passed as inputs
   6174 /// to the two new horizontal binops.
   6175 /// When Mode is set, the first horizontal binop dag node takes as input the
   6176 /// lower 128 bits of V0 and the upper 128 bits of V0. The second
   6177 /// horizontal binop dag node takes as input the lower 128 bits of V1
   6178 /// and the upper 128 bits of V1.
   6179 ///   Example:
   6180 ///     HADD V0_LO, V0_HI
   6181 ///     HADD V1_LO, V1_HI
   6182 ///
   6183 /// Otherwise, the first horizontal binop dag node takes as input the lower
   6184 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
   6185 /// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
   6186 ///   Example:
   6187 ///     HADD V0_LO, V1_LO
   6188 ///     HADD V0_HI, V1_HI
   6189 ///
   6190 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
   6191 /// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
   6192 /// the upper 128 bits of the result.
   6193 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
   6194                                      SDLoc DL, SelectionDAG &DAG,
   6195                                      unsigned X86Opcode, bool Mode,
   6196                                      bool isUndefLO, bool isUndefHI) {
   6197   EVT VT = V0.getValueType();
   6198   assert(VT.is256BitVector() && VT == V1.getValueType() &&
   6199          "Invalid nodes in input!");
   6200 
   6201   unsigned NumElts = VT.getVectorNumElements();
   6202   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
   6203   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
   6204   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
   6205   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
   6206   EVT NewVT = V0_LO.getValueType();
   6207 
   6208   SDValue LO = DAG.getUNDEF(NewVT);
   6209   SDValue HI = DAG.getUNDEF(NewVT);
   6210 
   6211   if (Mode) {
   6212     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   6213     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
   6214       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
   6215     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
   6216       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
   6217   } else {
   6218     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   6219     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
   6220                        V1_LO->getOpcode() != ISD::UNDEF))
   6221       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
   6222 
   6223     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
   6224                        V1_HI->getOpcode() != ISD::UNDEF))
   6225       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
   6226   }
   6227 
   6228   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
   6229 }
   6230 
   6231 /// \brief Try to fold a build_vector that performs an 'addsub' into the
   6232 /// sequence of 'vadd + vsub + blendi'.
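        ///
        /// For example, the v4f32 build_vector
        ///   ((fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
        ///    (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
        ///    (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
        ///    (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3)))
        /// becomes (vselect <-1,0,-1,0>, (fsub A, B), (fadd A, B)), which ISel
        /// later matches into a single ADDSUB instruction.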
   6233 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
   6234                            const X86Subtarget *Subtarget) {
   6235   SDLoc DL(BV);
   6236   EVT VT = BV->getValueType(0);
   6237   unsigned NumElts = VT.getVectorNumElements();
   6238   SDValue InVec0 = DAG.getUNDEF(VT);
   6239   SDValue InVec1 = DAG.getUNDEF(VT);
   6240 
   6241   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
   6242           VT == MVT::v2f64) && "build_vector with an invalid type found!");
   6243 
   6244   // Don't try to emit a VSELECT that cannot be lowered into a blend.
   6245   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   6246   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
   6247     return SDValue();
   6248 
   6249   // Odd-numbered elements in the input build vector are obtained from
   6250   // adding two float elements.
   6251   // Even-numbered elements in the input build vector are obtained from
   6252   // subtracting two float elements.
   6253   unsigned ExpectedOpcode = ISD::FSUB;
   6254   unsigned NextExpectedOpcode = ISD::FADD;
   6255   bool AddFound = false;
   6256   bool SubFound = false;
   6257 
   6258   for (unsigned i = 0, e = NumElts; i != e; i++) {
   6259     SDValue Op = BV->getOperand(i);
   6260 
   6261     // Skip 'undef' values.
   6262     unsigned Opcode = Op.getOpcode();
   6263     if (Opcode == ISD::UNDEF) {
   6264       std::swap(ExpectedOpcode, NextExpectedOpcode);
   6265       continue;
   6266     }
   6267 
   6268     // Early exit if we found an unexpected opcode.
   6269     if (Opcode != ExpectedOpcode)
   6270       return SDValue();
   6271 
   6272     SDValue Op0 = Op.getOperand(0);
   6273     SDValue Op1 = Op.getOperand(1);
   6274 
   6275     // Try to match the following pattern:
   6276     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
   6277     // Early exit if we cannot match that sequence.
   6278     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   6279         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   6280         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
   6281         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
   6282         Op0.getOperand(1) != Op1.getOperand(1))
   6283       return SDValue();
   6284 
   6285     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   6286     if (I0 != i)
   6287       return SDValue();
   6288 
   6289     // We found a valid add/sub node. Update the information accordingly.
   6290     if (i & 1)
   6291       AddFound = true;
   6292     else
   6293       SubFound = true;
   6294 
   6295     // Update InVec0 and InVec1.
   6296     if (InVec0.getOpcode() == ISD::UNDEF)
   6297       InVec0 = Op0.getOperand(0);
   6298     if (InVec1.getOpcode() == ISD::UNDEF)
   6299       InVec1 = Op1.getOperand(0);
   6300 
   6301     // Make sure that the input operands of each add/sub node always
   6302     // come from the same pair of vectors.
   6303     if (InVec0 != Op0.getOperand(0)) {
   6304       if (ExpectedOpcode == ISD::FSUB)
   6305         return SDValue();
   6306 
   6307       // FADD is commutable. Try to commute the operands
   6308       // and then test again.
   6309       std::swap(Op0, Op1);
   6310       if (InVec0 != Op0.getOperand(0))
   6311         return SDValue();
   6312     }
   6313 
   6314     if (InVec1 != Op1.getOperand(0))
   6315       return SDValue();
   6316 
   6317     // Update the pair of expected opcodes.
   6318     std::swap(ExpectedOpcode, NextExpectedOpcode);
   6319   }
   6320 
   6321   // Don't try to fold this build_vector into a VSELECT if it has
   6322   // too many UNDEF operands.
   6323   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
   6324       InVec1.getOpcode() != ISD::UNDEF) {
   6325     // Emit a sequence of vector add and sub followed by a VSELECT.
   6326     // The new VSELECT will be lowered into a BLENDI.
   6327     // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
   6328     // and emit a single ADDSUB instruction.
   6329     SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
   6330     SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
   6331 
   6332     // Construct the VSELECT mask.
   6333     EVT MaskVT = VT.changeVectorElementTypeToInteger();
   6334     EVT SVT = MaskVT.getVectorElementType();
   6335     unsigned SVTBits = SVT.getSizeInBits();
   6336     SmallVector<SDValue, 8> Ops;
   6337 
   6338     for (unsigned i = 0, e = NumElts; i != e; ++i) {
   6339       APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
   6340                             APInt::getAllOnesValue(SVTBits);
   6341       SDValue Constant = DAG.getConstant(Value, SVT);
   6342       Ops.push_back(Constant);
   6343     }
   6344 
   6345     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
   6346     return DAG.getSelect(DL, VT, Mask, Sub, Add);
   6347   }
   6348 
   6349   return SDValue();
   6350 }
   6351 
   6352 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
   6353                                           const X86Subtarget *Subtarget) {
   6354   SDLoc DL(N);
   6355   EVT VT = N->getValueType(0);
   6356   unsigned NumElts = VT.getVectorNumElements();
   6357   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
   6358   SDValue InVec0, InVec1;
   6359 
   6360   // Try to match an ADDSUB.
   6361   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   6362       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
   6363     SDValue Value = matchAddSub(BV, DAG, Subtarget);
   6364     if (Value.getNode())
   6365       return Value;
   6366   }
   6367 
   6368   // Try to match horizontal ADD/SUB.
   6369   unsigned NumUndefsLO = 0;
   6370   unsigned NumUndefsHI = 0;
   6371   unsigned Half = NumElts/2;
   6372 
   6373   // Count the number of UNDEF operands in the input build_vector.
   6374   for (unsigned i = 0, e = Half; i != e; ++i)
   6375     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
   6376       NumUndefsLO++;
   6377 
   6378   for (unsigned i = Half, e = NumElts; i != e; ++i)
   6379     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
   6380       NumUndefsHI++;
   6381 
   6382   // Early exit if this is either a build_vector of all UNDEFs or if all
   6383   // the operands but one are UNDEF.
   6384   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
   6385     return SDValue();
   6386 
   6387   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
   6388     // Try to match an SSE3 float HADD/HSUB.
   6389     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   6390       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   6391 
   6392     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   6393       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   6394   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
   6395     // Try to match an SSSE3 integer HADD/HSUB.
   6396     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   6397       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
   6398 
   6399     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   6400       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
   6401   }
   6402 
   6403   if (!Subtarget->hasAVX())
   6404     return SDValue();
   6405 
   6406   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
   6407     // Try to match an AVX horizontal add/sub of packed single/double
   6408     // precision floating point values from 256-bit vectors.
   6409     SDValue InVec2, InVec3;
   6410     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
   6411         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
   6412         ((InVec0.getOpcode() == ISD::UNDEF ||
   6413           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   6414         ((InVec1.getOpcode() == ISD::UNDEF ||
   6415           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   6416       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   6417 
   6418     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
   6419         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
   6420         ((InVec0.getOpcode() == ISD::UNDEF ||
   6421           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   6422         ((InVec1.getOpcode() == ISD::UNDEF ||
   6423           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   6424       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   6425   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
   6426     // Try to match an AVX2 horizontal add/sub of signed integers.
   6427     SDValue InVec2, InVec3;
   6428     unsigned X86Opcode;
   6429     bool CanFold = true;
   6430 
   6431     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
   6432         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
   6433         ((InVec0.getOpcode() == ISD::UNDEF ||
   6434           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   6435         ((InVec1.getOpcode() == ISD::UNDEF ||
   6436           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   6437       X86Opcode = X86ISD::HADD;
   6438     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
   6439         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
   6440         ((InVec0.getOpcode() == ISD::UNDEF ||
   6441           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   6442         ((InVec1.getOpcode() == ISD::UNDEF ||
   6443           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   6444       X86Opcode = X86ISD::HSUB;
   6445     else
   6446       CanFold = false;
   6447 
   6448     if (CanFold) {
   6449       // Fold this build_vector into a single horizontal add/sub.
   6450       // Do this only if the target has AVX2.
   6451       if (Subtarget->hasAVX2())
   6452         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
   6453 
   6454       // Do not try to expand this build_vector into a pair of horizontal
   6455       // add/sub if we can emit a pair of scalar add/sub.
   6456       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   6457         return SDValue();
   6458 
   6459       // Convert this build_vector into a pair of horizontal binops followed
   6460       // by a concat vector.
   6461       bool isUndefLO = NumUndefsLO == Half;
   6462       bool isUndefHI = NumUndefsHI == Half;
   6463       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
   6464                                    isUndefLO, isUndefHI);
   6465     }
   6466   }
   6467 
   6468   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
   6469        VT == MVT::v16i16) && Subtarget->hasAVX()) {
   6470     unsigned X86Opcode;
   6471     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   6472       X86Opcode = X86ISD::HADD;
   6473     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   6474       X86Opcode = X86ISD::HSUB;
   6475     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   6476       X86Opcode = X86ISD::FHADD;
   6477     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   6478       X86Opcode = X86ISD::FHSUB;
   6479     else
   6480       return SDValue();
   6481 
   6482     // Don't try to expand this build_vector into a pair of horizontal add/sub
   6483     // if we can simply emit a pair of scalar add/sub.
   6484     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   6485       return SDValue();
   6486 
   6487     // Convert this build_vector into two horizontal add/sub nodes followed
   6488     // by a concat vector.
   6489     bool isUndefLO = NumUndefsLO == Half;
   6490     bool isUndefHI = NumUndefsHI == Half;
   6491     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
   6492                                  isUndefLO, isUndefHI);
   6493   }
   6494 
   6495   return SDValue();
   6496 }
   6497 
   6498 SDValue
   6499 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   6500   SDLoc dl(Op);
   6501 
   6502   MVT VT = Op.getSimpleValueType();
   6503   MVT ExtVT = VT.getVectorElementType();
   6504   unsigned NumElems = Op.getNumOperands();
   6505 
   6506   // Lower predicate vectors (vXi1) with the dedicated code path.
   6507   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
   6508     return LowerBUILD_VECTORvXi1(Op, DAG);
   6509 
   6510   // Vectors containing all zeros can be matched by pxor and xorps later
   6511   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   6512     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
   6513     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
   6514     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
   6515       return Op;
   6516 
   6517     return getZeroVector(VT, Subtarget, DAG, dl);
   6518   }
   6519 
   6520   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   6521   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   6522   // vpcmpeqd on 256-bit vectors.
   6523   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
   6524     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
   6525       return Op;
   6526 
   6527     if (!VT.is512BitVector())
   6528       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
   6529   }
   6530 
   6531   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
   6532   if (Broadcast.getNode())
   6533     return Broadcast;
   6534 
   6535   unsigned EVTBits = ExtVT.getSizeInBits();
   6536 
   6537   unsigned NumZero  = 0;
   6538   unsigned NumNonZero = 0;
   6539   unsigned NonZeros = 0;
   6540   bool IsAllConstants = true;
   6541   SmallSet<SDValue, 8> Values;
   6542   for (unsigned i = 0; i < NumElems; ++i) {
   6543     SDValue Elt = Op.getOperand(i);
   6544     if (Elt.getOpcode() == ISD::UNDEF)
   6545       continue;
   6546     Values.insert(Elt);
   6547     if (Elt.getOpcode() != ISD::Constant &&
   6548         Elt.getOpcode() != ISD::ConstantFP)
   6549       IsAllConstants = false;
   6550     if (X86::isZeroNode(Elt))
   6551       NumZero++;
   6552     else {
   6553       NonZeros |= (1 << i);
   6554       NumNonZero++;
   6555     }
   6556   }
   6557 
   6558   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
   6559   if (NumNonZero == 0)
   6560     return DAG.getUNDEF(VT);
   6561 
   6562   // Special case for single non-zero, non-undef, element.
   6563   if (NumNonZero == 1) {
   6564     unsigned Idx = countTrailingZeros(NonZeros);
   6565     SDValue Item = Op.getOperand(Idx);
   6566 
   6567     // If this is an insertion of an i64 value on x86-32, and if the top bits of
   6568     // the value are obviously zero, truncate the value to i32 and do the
   6569     // insertion that way.  Only do this if the value is non-constant or if the
   6570     // value is a constant being inserted into element 0.  It is cheaper to do
   6571     // a constant pool load than it is to do a movd + shuffle.
   6572     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
   6573         (!IsAllConstants || Idx == 0)) {
   6574       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
   6575         // Handle SSE only.
   6576         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
   6577         EVT VecVT = MVT::v4i32;
   6578         unsigned VecElts = 4;
   6579 
   6580         // Truncate the value (which may itself be a constant) to i32, and
   6581         // convert it to a vector with movd (S2V+shuffle to zero extend).
   6582         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
   6583         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
   6584         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6585 
   6586         // Now we have our 32-bit value zero extended in the low element of
   6587         // a vector.  If Idx != 0, swizzle it into place.
   6588         if (Idx != 0) {
   6589           SmallVector<int, 4> Mask;
   6590           Mask.push_back(Idx);
   6591           for (unsigned i = 1; i != VecElts; ++i)
   6592             Mask.push_back(i);
   6593           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
   6594                                       &Mask[0]);
   6595         }
   6596         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   6597       }
   6598     }
   6599 
   6600     // If we have a constant or non-constant insertion into the low element of
   6601     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
   6602     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
   6603     // depending on what the source datatype is.
   6604     if (Idx == 0) {
   6605       if (NumZero == 0)
   6606         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6607 
   6608       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
   6609           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
   6610         if (VT.is256BitVector() || VT.is512BitVector()) {
   6611           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
   6612           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
   6613                              Item, DAG.getIntPtrConstant(0));
   6614         }
   6615         assert(VT.is128BitVector() && "Expected an SSE value type!");
   6616         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6617         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
   6618         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6619       }
   6620 
   6621       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
   6622         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
   6623         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   6624         if (VT.is256BitVector()) {
   6625           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
   6626           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
   6627         } else {
   6628           assert(VT.is128BitVector() && "Expected an SSE value type!");
   6629           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6630         }
   6631         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   6632       }
   6633     }
   6634 
   6635     // Is it a vector logical left shift?
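            // For example, a v2i64 (build_vector 0, x) becomes a whole-vector
            // shift left by 64 bits of (scalar_to_vector x).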
   6636     if (NumElems == 2 && Idx == 1 &&
   6637         X86::isZeroNode(Op.getOperand(0)) &&
   6638         !X86::isZeroNode(Op.getOperand(1))) {
   6639       unsigned NumBits = VT.getSizeInBits();
   6640       return getVShift(true, VT,
   6641                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   6642                                    VT, Op.getOperand(1)),
   6643                        NumBits/2, DAG, *this, dl);
   6644     }
   6645 
   6646     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
   6647       return SDValue();
   6648 
   6649     // Otherwise, if this is a vector with i32 or f32 elements, and the element
   6650     // is a non-constant being inserted into an element other than the low one,
   6651     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
   6652     // movd/movss) to move this into the low element, then shuffle it into
   6653     // place.
   6654     if (EVTBits == 32) {
   6655       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6656 
   6657       // Turn it into a shuffle of zero and zero-extended scalar to vector.
   6658       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
   6659       SmallVector<int, 8> MaskVec;
   6660       for (unsigned i = 0; i != NumElems; ++i)
   6661         MaskVec.push_back(i == Idx ? 0 : 1);
   6662       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
   6663     }
   6664   }
   6665 
   6666   // Splat is obviously ok. Let legalizer expand it to a shuffle.
   6667   if (Values.size() == 1) {
   6668     if (EVTBits == 32) {
   6669       // Instead of a shuffle like this:
   6670       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
   6671       // Check if it's possible to issue this instead.
   6672       // shuffle (vload ptr), undef, <1, 1, 1, 1>
   6673       unsigned Idx = countTrailingZeros(NonZeros);
   6674       SDValue Item = Op.getOperand(Idx);
   6675       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
   6676         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
   6677     }
   6678     return SDValue();
   6679   }
   6680 
   6681   // A vector full of immediates; various special cases are already
   6682   // handled, so this is best done with a single constant-pool load.
   6683   if (IsAllConstants)
   6684     return SDValue();
   6685 
   6686   // For AVX-length vectors, build the individual 128-bit pieces and use
   6687   // shuffles to put them in place.
   6688   if (VT.is256BitVector() || VT.is512BitVector()) {
   6689     SmallVector<SDValue, 64> V;
   6690     for (unsigned i = 0; i != NumElems; ++i)
   6691       V.push_back(Op.getOperand(i));
   6692 
   6693     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
   6694 
   6695     // Build both the lower and upper subvectors.
   6696     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
   6697                                 makeArrayRef(&V[0], NumElems/2));
   6698     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
   6699                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
   6700 
   6701     // Recreate the wider vector with the lower and upper parts.
   6702     if (VT.is256BitVector())
   6703       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   6704     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   6705   }
   6706 
   6707   // Let legalizer expand 2-wide build_vectors.
   6708   if (EVTBits == 64) {
   6709     if (NumNonZero == 1) {
   6710       // One half is zero or undef.
   6711       unsigned Idx = countTrailingZeros(NonZeros);
   6712       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
   6713                                  Op.getOperand(Idx));
   6714       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
   6715     }
   6716     return SDValue();
   6717   }
   6718 
   6719   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   6720   if (EVTBits == 8 && NumElems == 16) {
   6721     SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
   6722                                         Subtarget, *this);
   6723     if (V.getNode()) return V;
   6724   }
   6725 
   6726   if (EVTBits == 16 && NumElems == 8) {
   6727     SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
   6728                                       Subtarget, *this);
   6729     if (V.getNode()) return V;
   6730   }
   6731 
   6732   // If the element VT is 32 bits and there are 4 elements, try an INSERTPS.
   6733   if (EVTBits == 32 && NumElems == 4) {
   6734     SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
   6735                                       NumZero, DAG, Subtarget, *this);
   6736     if (V.getNode())
   6737       return V;
   6738   }
   6739 
   6740   // If the element VT is 32 bits, turn it into a number of shuffles.
   6741   SmallVector<SDValue, 8> V(NumElems);
   6742   if (NumElems == 4 && NumZero > 0) {
   6743     for (unsigned i = 0; i < 4; ++i) {
   6744       bool isZero = !(NonZeros & (1 << i));
   6745       if (isZero)
   6746         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
   6747       else
   6748         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   6749     }
   6750 
   6751     for (unsigned i = 0; i < 2; ++i) {
   6752       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
   6753         default: break;
   6754         case 0:
   6755           V[i] = V[i*2];  // Must be a zero vector.
   6756           break;
   6757         case 1:
   6758           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
   6759           break;
   6760         case 2:
   6761           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
   6762           break;
   6763         case 3:
   6764           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
   6765           break;
   6766       }
   6767     }
   6768 
   6769     bool Reverse1 = (NonZeros & 0x3) == 2;
   6770     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
   6771     int MaskVec[] = {
   6772       Reverse1 ? 1 : 0,
   6773       Reverse1 ? 0 : 1,
   6774       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
   6775       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
   6776     };
   6777     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
   6778   }
   6779 
   6780   if (Values.size() > 1 && VT.is128BitVector()) {
   6781     // Check for a build vector of consecutive loads.
   6782     for (unsigned i = 0; i < NumElems; ++i)
   6783       V[i] = Op.getOperand(i);
   6784 
   6785     // Check for elements which are consecutive loads.
   6786     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
   6787     if (LD.getNode())
   6788       return LD;
   6789 
   6790     // Check for a build vector from mostly shuffle plus few inserting.
   6791     SDValue Sh = buildFromShuffleMostly(Op, DAG);
   6792     if (Sh.getNode())
   6793       return Sh;
   6794 
   6795     // For SSE 4.1, use insertps to insert each upper element into place.
   6796     if (getSubtarget()->hasSSE41()) {
   6797       SDValue Result;
   6798       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
   6799         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
   6800       else
   6801         Result = DAG.getUNDEF(VT);
   6802 
   6803       for (unsigned i = 1; i < NumElems; ++i) {
   6804         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
   6805         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
   6806                              Op.getOperand(i), DAG.getIntPtrConstant(i));
   6807       }
   6808       return Result;
   6809     }
   6810 
   6811     // Otherwise, expand into a number of unpckl*, start by extending each of
   6812     // our (non-undef) elements to the full vector width with the element in the
   6813     // bottom slot of the vector (which generates no code for SSE).
   6814     for (unsigned i = 0; i < NumElems; ++i) {
   6815       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
   6816         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   6817       else
   6818         V[i] = DAG.getUNDEF(VT);
   6819     }
   6820 
   6821     // Next, we iteratively mix elements, e.g. for v4f32:
   6822     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
   6823     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
   6824     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
   6825     unsigned EltStride = NumElems >> 1;
   6826     while (EltStride != 0) {
   6827       for (unsigned i = 0; i < EltStride; ++i) {
   6828         // If V[i+EltStride] is undef and this is the first round of mixing,
   6829         // then it is safe to just drop this shuffle: V[i] is already in the
   6830         // right place, the one element (since it's the first round) being
   6831         // inserted as undef can be dropped.  This isn't safe for successive
   6832         // rounds because they will permute elements within both vectors.
   6833         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
   6834             EltStride == NumElems/2)
   6835           continue;
   6836 
   6837         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
   6838       }
   6839       EltStride >>= 1;
   6840     }
   6841     return V[0];
   6842   }
   6843   return SDValue();
   6844 }
   6845 
   6846 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
   6847 // to create 256-bit vectors from two other 128-bit ones.
   6848 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   6849   SDLoc dl(Op);
   6850   MVT ResVT = Op.getSimpleValueType();
   6851 
   6852   assert((ResVT.is256BitVector() ||
   6853           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
   6854 
   6855   SDValue V1 = Op.getOperand(0);
   6856   SDValue V2 = Op.getOperand(1);
   6857   unsigned NumElems = ResVT.getVectorNumElements();
   6858   if (ResVT.is256BitVector())
   6859     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   6860 
   6861   if (Op.getNumOperands() == 4) {
   6862     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
   6863                                 ResVT.getVectorNumElements()/2);
   6864     SDValue V3 = Op.getOperand(2);
   6865     SDValue V4 = Op.getOperand(3);
   6866     SDValue Lo = Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl);
   6867     SDValue Hi = Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl);
            return Concat256BitVectors(Lo, Hi, ResVT, NumElems, DAG, dl);
   6868   }
   6869   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   6870 }
   6871 
   6872 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   6873   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
   6874   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
   6875          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
   6876           Op.getNumOperands() == 4)));
   6877 
   6878   // AVX can use the vinsertf128 instruction to create 256-bit vectors
   6879   // from two other 128-bit ones.
   6880 
   6881   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
   6882   // A 512-bit vector may be built from 2 256-bit vectors or 4 128-bit ones.
   6883 }
   6884 
   6886 //===----------------------------------------------------------------------===//
   6887 // Vector shuffle lowering
   6888 //
   6889 // This is an experimental code path for lowering vector shuffles on x86. It is
   6890 // designed to handle arbitrary vector shuffles and blends, gracefully
   6891 // degrading performance as necessary. It works hard to recognize idiomatic
   6892 // shuffles and lower them to optimal instruction patterns without leaving
   6893 // a framework that allows reasonably efficient handling of all vector shuffle
   6894 // patterns.
   6895 //===----------------------------------------------------------------------===//
   6896 
   6897 /// \brief Tiny helper function to identify a no-op mask.
   6898 ///
   6899 /// This is a somewhat boring predicate function. It checks whether the mask
   6900 /// array input, which is assumed to be a single-input shuffle mask of the kind
   6901 /// used by the X86 shuffle instructions (not a fully general
   6902 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and
   6903 /// an in-place shuffle are no-ops.
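        ///
        /// For example, <0, u, 2, 3> is a no-op mask for a 4-element shuffle,
        /// while <1, 0, 2, 3> is not.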
   6904 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
   6905   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   6906     if (Mask[i] != -1 && Mask[i] != i)
   6907       return false;
   6908   return true;
   6909 }
   6910 
   6911 /// \brief Helper function to classify a mask as a single-input mask.
   6912 ///
   6913 /// This isn't a generic single-input test because in the vector shuffle
   6914 /// lowering we canonicalize single inputs to be the first input operand. This
   6915 /// means we can more quickly test for a single input by only checking whether
   6916 /// an input from the second operand exists. We also assume that the size of
   6917 /// the mask corresponds to the size of the input vectors, which isn't true
   6918 /// in the fully general case.
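        ///
        /// For example, <3, u, 1, 0> is a single-input mask for a 4-element
        /// shuffle, while <0, 5, 2, 7> is not (5 and 7 select from the second
        /// input).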
   6919 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
   6920   for (int M : Mask)
   6921     if (M >= (int)Mask.size())
   6922       return false;
   6923   return true;
   6924 }
   6925 
   6926 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
   6927 ///
   6928 /// This helper function produces an 8-bit shuffle immediate corresponding to
   6929 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
   6930 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
   6931 /// example.
   6932 ///
   6933 /// NB: We rely heavily on "undef" masks preserving the input lane.
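        ///
        /// For example, the mask [3, 1, 2, 0] produces the immediate 0b00100111
        /// (0x27): two bits per lane, with lane 0 encoded in the low bits.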
   6934 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
   6935                                           SelectionDAG &DAG) {
   6936   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
   6937   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
   6938   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
   6939   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
   6940   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
   6941 
   6942   unsigned Imm = 0;
   6943   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
   6944   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
   6945   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
   6946   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
   6947   return DAG.getConstant(Imm, MVT::i8);
   6948 }
   6949 
   6950 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
   6951 ///
   6952 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
   6953 /// support for floating point shuffles but not integer shuffles. These
   6954 /// instructions will incur a domain crossing penalty on some chips, though, so
   6955 /// it is better to avoid lowering through this for integer vectors where
   6956 /// possible.
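        ///
        /// For example, the blend mask <0, 3> selects element 0 of V1 and element
        /// 1 of V2, and is emitted as SHUFPD V1, V2 with immediate 0b10.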
   6957 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   6958                                        const X86Subtarget *Subtarget,
   6959                                        SelectionDAG &DAG) {
   6960   SDLoc DL(Op);
   6961   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
   6962   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   6963   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   6964   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6965   ArrayRef<int> Mask = SVOp->getMask();
   6966   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   6967 
   6968   if (isSingleInputShuffleMask(Mask)) {
   6969     // Straight shuffle of a single input vector. Simulate this by using the
   6970     // single input as both of the "inputs" to this instruction.
   6971     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
   6972     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
   6973                        DAG.getConstant(SHUFPDMask, MVT::i8));
   6974   }
   6975   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   6976   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
   6977 
   6978   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   6979   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
   6980                      DAG.getConstant(SHUFPDMask, MVT::i8));
   6981 }
   6982 
   6983 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
   6984 ///
   6985 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
   6986 /// the integer unit to minimize domain crossing penalties. However, for blends
   6987 /// it falls back to the floating point shuffle operation with appropriate bit
   6988 /// casting.
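        ///
        /// For example, the single-input mask <1, 1> is widened to the v4i32 mask
        /// <2, 3, 2, 3> and emitted as a PSHUFD with immediate 0xEE.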
   6989 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   6990                                        const X86Subtarget *Subtarget,
   6991                                        SelectionDAG &DAG) {
   6992   SDLoc DL(Op);
   6993   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
   6994   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   6995   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   6996   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6997   ArrayRef<int> Mask = SVOp->getMask();
   6998   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   6999 
   7000   if (isSingleInputShuffleMask(Mask)) {
   7001     // Straight shuffle of a single input vector. For everything from SSE2
   7002     // onward this has a single fast instruction with no scary immediates.
   7003     // We have to map the mask as it is actually a v4i32 shuffle instruction.
   7004     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
   7005     int WidenedMask[4] = {
   7006         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
   7007         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
   7008     return DAG.getNode(
   7009         ISD::BITCAST, DL, MVT::v2i64,
   7010         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
   7011                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
   7012   }
   7013 
   7014   // We implement this with SHUFPD which is pretty lame because it will likely
   7015   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   7016   // However, all the alternatives are still more cycles and newer chips don't
   7017   // have this problem. It would be really nice if x86 had better shuffles here.
   7018   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
   7019   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
   7020   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
   7021                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
   7022 }
   7023 
   7024 /// \brief Lower 4-lane 32-bit floating point shuffles.
   7025 ///
   7026 /// Uses instructions exclusively from the floating point unit to minimize
   7027 /// domain crossing penalties, as these are sufficient to implement all v4f32
   7028 /// shuffles.
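        ///
        /// For example, the mask <0, 1, 4, 5> keeps V1 in the low half and V2 in
        /// the high half, and maps directly to SHUFPS V1, V2 with the lane mask
        /// [0, 1, 0, 1] (immediate 0x44).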
   7029 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7030                                        const X86Subtarget *Subtarget,
   7031                                        SelectionDAG &DAG) {
   7032   SDLoc DL(Op);
   7033   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
   7034   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   7035   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   7036   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7037   ArrayRef<int> Mask = SVOp->getMask();
   7038   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   7039 
   7040   SDValue LowV = V1, HighV = V2;
   7041   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
   7042 
   7043   int NumV2Elements =
   7044       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
   7045 
   7046   if (NumV2Elements == 0)
   7047     // Straight shuffle of a single input vector. We pass the input vector to
   7048     // both operands to simulate this with a SHUFPS.
   7049     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
   7050                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   7051 
   7052   if (NumV2Elements == 1) {
   7053     int V2Index =
   7054         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
   7055         Mask.begin();
   7056     // Compute the index adjacent to V2Index and in the same half by toggling
   7057     // the low bit.
   7058     int V2AdjIndex = V2Index ^ 1;
   7059 
   7060     if (Mask[V2AdjIndex] == -1) {
   7061       // Handles all the cases where we have a single V2 element and an undef.
   7062       // This will only ever happen in the high lanes because we commute the
   7063       // vector otherwise.
   7064       if (V2Index < 2)
   7065         std::swap(LowV, HighV);
   7066       NewMask[V2Index] -= 4;
   7067     } else {
   7068       // Handle the case where the V2 element ends up adjacent to a V1 element.
   7069       // To make this work, blend them together as the first step.
   7070       int V1Index = V2AdjIndex;
   7071       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
   7072       V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
   7073                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
   7074 
   7075       // Now proceed to reconstruct the final blend as we have the necessary
   7076       // high or low half formed.
   7077       if (V2Index < 2) {
   7078         LowV = V2;
   7079         HighV = V1;
   7080       } else {
   7081         HighV = V2;
   7082       }
   7083       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
   7084       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
   7085     }
   7086   } else if (NumV2Elements == 2) {
   7087     if (Mask[0] < 4 && Mask[1] < 4) {
   7088       // Handle the easy case where we have V1 in the low lanes and V2 in the
   7089       // high lanes. We never see this reversed because we sort the shuffle.
   7090       NewMask[2] -= 4;
   7091       NewMask[3] -= 4;
   7092     } else {
   7093       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
   7094       // trying to place elements directly, just blend them and set up the final
   7095       // shuffle to place them.
   7096 
   7097       // The first two blend mask elements are for V1, the second two are for
   7098       // V2.
   7099       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
   7100                           Mask[2] < 4 ? Mask[2] : Mask[3],
   7101                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
   7102                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
   7103       V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
   7104                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
   7105 
   7106       // Now we do a normal shuffle of V1 by giving V1 as both operands to
   7107       // a blend.
   7108       LowV = HighV = V1;
   7109       NewMask[0] = Mask[0] < 4 ? 0 : 2;
   7110       NewMask[1] = Mask[0] < 4 ? 2 : 0;
   7111       NewMask[2] = Mask[2] < 4 ? 1 : 3;
   7112       NewMask[3] = Mask[2] < 4 ? 3 : 1;
   7113     }
   7114   }
   7115   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
   7116                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
   7117 }
   7118 
   7119 /// \brief Lower 4-lane i32 vector shuffles.
   7120 ///
   7121 /// We try to handle these with integer-domain shuffles where we can, but for
   7122 /// blends we use the floating point domain blend instructions.
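        ///
        /// For example, the single-input mask <2, 3, 0, 1> lowers to a single
        /// PSHUFD with immediate 0x4E.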
   7123 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7124                                        const X86Subtarget *Subtarget,
   7125                                        SelectionDAG &DAG) {
   7126   SDLoc DL(Op);
   7127   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
   7128   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   7129   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   7130   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7131   ArrayRef<int> Mask = SVOp->getMask();
   7132   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   7133 
   7134   if (isSingleInputShuffleMask(Mask))
   7135     // Straight shuffle of a single input vector. For everything from SSE2
   7136     // onward this has a single fast instruction with no scary immediates.
   7137     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
   7138                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   7139 
   7140   // We implement this with SHUFPS because it can blend from two vectors.
   7141   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
   7142   // up the inputs, bypassing domain shift penalties that we would encur if we
   7143   // up the inputs, bypassing domain shift penalties that we would incur if we
   7144   // relevant.
   7145   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
   7146                      DAG.getVectorShuffle(
   7147                          MVT::v4f32, DL,
   7148                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
   7149                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
   7150 }
   7151 
   7152 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
   7153 /// shuffle lowering, and the most complex part.
   7154 ///
   7155 /// The lowering strategy is to try to form pairs of input lanes which are
   7156 /// targeted at the same half of the final vector, and then use a dword shuffle
   7157 /// to place them onto the right half, and finally unpack the paired lanes into
   7158 /// their final position.
   7159 ///
   7160 /// The exact breakdown of how to form these dword pairs and align them on the
   7161 /// correct sides is really tricky. See the comments within the function for
   7162 /// more of the details.
   7163 static SDValue lowerV8I16SingleInputVectorShuffle(
   7164     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
   7165     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   7166   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
   7167   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
   7168   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
   7169 
   7170   SmallVector<int, 4> LoInputs;
   7171   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
   7172                [](int M) { return M >= 0; });
   7173   std::sort(LoInputs.begin(), LoInputs.end());
   7174   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
   7175   SmallVector<int, 4> HiInputs;
   7176   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
   7177                [](int M) { return M >= 0; });
   7178   std::sort(HiInputs.begin(), HiInputs.end());
   7179   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
   7180   int NumLToL =
   7181       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
   7182   int NumHToL = LoInputs.size() - NumLToL;
   7183   int NumLToH =
   7184       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
   7185   int NumHToH = HiInputs.size() - NumLToH;
   7186   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
   7187   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
   7188   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   7189   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
   7190 
    7191   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
    7192   // such inputs we can swap two of the dwords across the half mark and end up
    7193   // with <= 2 inputs to each half from each half. Once there, we can fall
    7194   // through to the generic code below. For example:
   7195   //
   7196   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   7197   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
   7198   //
   7199   // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2
   7200   // and 2-2.
   7201   auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput,
   7202                           int ThreeInputHalfSum, int OneInputHalfOffset) {
    7203     // Compute the index of the dword holding the one leftover word in the
    7204     // half with three inputs: take the sum of all word indices in that half
    7205     // and subtract the sum of the three actual inputs. The difference is the
    7206     // index of the remaining slot, and dividing by two gives its dword.
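             //
             // For example, if ThreeInputs = {0, 1, 3}, then ThreeInputHalfSum is 6,
             // the leftover word is 6 - (0 + 1 + 3) = 2, and DWordA = 1. If
             // OneInput = 5 with OneInputHalfOffset = 4, then
             // DWordB = 4 / 2 + (5 / 2 + 1) % 2 = 3, the high-half dword that does
             // not contain the lone input.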
   7207     int DWordA = (ThreeInputHalfSum -
   7208                   std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) /
   7209                  2;
   7210     int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2;
   7211 
   7212     int PSHUFDMask[] = {0, 1, 2, 3};
   7213     PSHUFDMask[DWordA] = DWordB;
   7214     PSHUFDMask[DWordB] = DWordA;
   7215     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
   7216                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   7217                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
   7218                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
   7219 
   7220     // Adjust the mask to match the new locations of A and B.
   7221     for (int &M : Mask)
   7222       if (M != -1 && M/2 == DWordA)
   7223         M = 2 * DWordB + M % 2;
   7224       else if (M != -1 && M/2 == DWordB)
   7225         M = 2 * DWordA + M % 2;
   7226 
   7227     // Recurse back into this routine to re-compute state now that this isn't
   7228     // a 3 and 1 problem.
   7229     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
   7230                                 Mask);
   7231   };
   7232   if (NumLToL == 3 && NumHToL == 1)
   7233     return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4);
   7234   else if (NumLToL == 1 && NumHToL == 3)
   7235     return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0);
   7236   else if (NumLToH == 1 && NumHToH == 3)
   7237     return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0);
   7238   else if (NumLToH == 3 && NumHToH == 1)
   7239     return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4);
   7240 
   7241   // At this point there are at most two inputs to the low and high halves from
   7242   // each half. That means the inputs can always be grouped into dwords and
   7243   // those dwords can then be moved to the correct half with a dword shuffle.
   7244   // We use at most one low and one high word shuffle to collect these paired
   7245   // inputs into dwords, and finally a dword shuffle to place them.
   7246   int PSHUFLMask[4] = {-1, -1, -1, -1};
   7247   int PSHUFHMask[4] = {-1, -1, -1, -1};
   7248   int PSHUFDMask[4] = {-1, -1, -1, -1};
   7249 
   7250   // First fix the masks for all the inputs that are staying in their
   7251   // original halves. This will then dictate the targets of the cross-half
   7252   // shuffles.
   7253   auto fixInPlaceInputs = [&PSHUFDMask](
   7254       ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask,
   7255       MutableArrayRef<int> HalfMask, int HalfOffset) {
   7256     if (InPlaceInputs.empty())
   7257       return;
   7258     if (InPlaceInputs.size() == 1) {
   7259       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   7260           InPlaceInputs[0] - HalfOffset;
   7261       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
   7262       return;
   7263     }
   7264 
   7265     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
   7266     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   7267         InPlaceInputs[0] - HalfOffset;
   7268     // Put the second input next to the first so that they are packed into
   7269     // a dword. We find the adjacent index by toggling the low bit.
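             // (For example, word 2 pairs with word 3, and word 5 with word 4.)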
   7270     int AdjIndex = InPlaceInputs[0] ^ 1;
   7271     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
   7272     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
   7273     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
   7274   };
   7275   if (!HToLInputs.empty())
   7276     fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0);
   7277   if (!LToHInputs.empty())
   7278     fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4);
   7279 
   7280   // Now gather the cross-half inputs and place them into a free dword of
   7281   // their target half.
   7282   // FIXME: This operation could almost certainly be simplified dramatically to
   7283   // look more like the 3-1 fixing operation.
   7284   auto moveInputsToRightHalf = [&PSHUFDMask](
   7285       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
   7286       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
   7287       int SourceOffset, int DestOffset) {
   7288     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
   7289       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
   7290     };
   7291     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
   7292                                                int Word) {
   7293       int LowWord = Word & ~1;
   7294       int HighWord = Word | 1;
   7295       return isWordClobbered(SourceHalfMask, LowWord) ||
   7296              isWordClobbered(SourceHalfMask, HighWord);
   7297     };
   7298 
   7299     if (IncomingInputs.empty())
   7300       return;
   7301 
   7302     if (ExistingInputs.empty()) {
    7303       // Map each dword that has any incoming inputs into the correct half.
   7304       for (int Input : IncomingInputs) {
   7305         // If the source half mask maps over the inputs, turn those into
   7306         // swaps and use the swapped lane.
   7307         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
   7308           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
   7309             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
   7310                 Input - SourceOffset;
   7311             // We have to swap the uses in our half mask in one sweep.
   7312             for (int &M : HalfMask)
   7313               if (M == SourceHalfMask[Input - SourceOffset])
   7314                 M = Input;
   7315               else if (M == Input)
   7316                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   7317           } else {
   7318             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
   7319                        Input - SourceOffset &&
   7320                    "Previous placement doesn't match!");
   7321           }
   7322           // Note that this correctly re-maps both when we do a swap and when
   7323           // we observe the other side of the swap above. We rely on that to
   7324           // avoid swapping the members of the input list directly.
   7325           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   7326         }
   7327 
   7328         // Map the input's dword into the correct half.
   7329         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
   7330           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
   7331         else
   7332           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
   7333                      Input / 2 &&
   7334                  "Previous placement doesn't match!");
   7335       }
   7336 
    7337       // Now directly shift any mask elements that still refer to the other
    7338       // half over to this half, as we will have mirrored the dword containing
    7339       // each such element into the same position within this half.
   7340       for (int &M : HalfMask)
   7341         if (M >= SourceOffset && M < SourceOffset + 4) {
   7342           M = M - SourceOffset + DestOffset;
   7343           assert(M >= 0 && "This should never wrap below zero!");
   7344         }
   7345       return;
   7346     }
   7347 
   7348     // Ensure we have the input in a viable dword of its current half. This
   7349     // is particularly tricky because the original position may be clobbered
   7350     // by inputs being moved and *staying* in that half.
   7351     if (IncomingInputs.size() == 1) {
   7352       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   7353         int InputFixed = std::find(std::begin(SourceHalfMask),
   7354                                    std::end(SourceHalfMask), -1) -
   7355                          std::begin(SourceHalfMask) + SourceOffset;
   7356         SourceHalfMask[InputFixed - SourceOffset] =
   7357             IncomingInputs[0] - SourceOffset;
   7358         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
   7359                      InputFixed);
   7360         IncomingInputs[0] = InputFixed;
   7361       }
   7362     } else if (IncomingInputs.size() == 2) {
   7363       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
   7364           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   7365         int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2;
   7366         assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) &&
   7367                "Not all dwords can be clobbered!");
   7368         SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset;
   7369         SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset;
   7370         for (int &M : HalfMask)
   7371           if (M == IncomingInputs[0])
   7372             M = SourceDWordBase + SourceOffset;
   7373           else if (M == IncomingInputs[1])
   7374             M = SourceDWordBase + 1 + SourceOffset;
   7375         IncomingInputs[0] = SourceDWordBase + SourceOffset;
   7376         IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset;
   7377       }
   7378     } else {
   7379       llvm_unreachable("Unhandled input size!");
   7380     }
   7381 
   7382     // Now hoist the DWord down to the right half.
   7383     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
   7384     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
   7385     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
   7386     for (int Input : IncomingInputs)
   7387       std::replace(HalfMask.begin(), HalfMask.end(), Input,
   7388                    FreeDWord * 2 + Input % 2);
   7389   };
   7390   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask,
   7391                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
   7392   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask,
   7393                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
   7394 
   7395   // Now enact all the shuffles we've computed to move the inputs into their
   7396   // target half.
   7397   if (!isNoopShuffleMask(PSHUFLMask))
   7398     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
   7399                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
   7400   if (!isNoopShuffleMask(PSHUFHMask))
   7401     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
   7402                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
   7403   if (!isNoopShuffleMask(PSHUFDMask))
   7404     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
   7405                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   7406                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
   7407                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
   7408 
   7409   // At this point, each half should contain all its inputs, and we can then
   7410   // just shuffle them into their final position.
   7411   assert(std::count_if(LoMask.begin(), LoMask.end(),
   7412                        [](int M) { return M >= 4; }) == 0 &&
   7413          "Failed to lift all the high half inputs to the low mask!");
   7414   assert(std::count_if(HiMask.begin(), HiMask.end(),
   7415                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
   7416          "Failed to lift all the low half inputs to the high mask!");
   7417 
   7418   // Do a half shuffle for the low mask.
   7419   if (!isNoopShuffleMask(LoMask))
   7420     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
   7421                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
   7422 
   7423   // Do a half shuffle with the high mask after shifting its values down.
   7424   for (int &M : HiMask)
   7425     if (M >= 0)
   7426       M -= 4;
   7427   if (!isNoopShuffleMask(HiMask))
   7428     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
   7429                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
   7430 
   7431   return V;
   7432 }
   7433 
   7434 /// \brief Detect whether the mask pattern should be lowered through
   7435 /// interleaving.
   7436 ///
   7437 /// This essentially tests whether viewing the mask as an interleaving of two
   7438 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
   7439 /// lowering it through interleaving is a significantly better strategy.
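         ///
         /// For example, for the two-input v8i16 mask [0, 8, 1, 9, 2, 10, 3, 11]
         /// (a perfect interleave of the two low halves), the interleaved view
         /// requires no cross-input traffic while the split view requires four
         /// crossings, so this returns true.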
   7440 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
   7441   int NumEvenInputs[2] = {0, 0};
   7442   int NumOddInputs[2] = {0, 0};
   7443   int NumLoInputs[2] = {0, 0};
   7444   int NumHiInputs[2] = {0, 0};
   7445   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7446     if (Mask[i] < 0)
   7447       continue;
   7448 
   7449     int InputIdx = Mask[i] >= Size;
   7450 
   7451     if (i < Size / 2)
   7452       ++NumLoInputs[InputIdx];
   7453     else
   7454       ++NumHiInputs[InputIdx];
   7455 
   7456     if ((i % 2) == 0)
   7457       ++NumEvenInputs[InputIdx];
   7458     else
   7459       ++NumOddInputs[InputIdx];
   7460   }
   7461 
   7462   // The minimum number of cross-input results for both the interleaved and
   7463   // split cases. If interleaving results in fewer cross-input results, return
   7464   // true.
   7465   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
   7466                                     NumEvenInputs[0] + NumOddInputs[1]);
   7467   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
   7468                               NumLoInputs[0] + NumHiInputs[1]);
   7469   return InterleavedCrosses < SplitCrosses;
   7470 }
   7471 
   7472 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
   7473 ///
    7474 /// This strategy only works when the inputs from each vector fit into a
    7475 /// single half of that vector and there are few enough inputs that the
    7476 /// required in-place shuffles are not highly constrained or expensive. It
   7477 /// shifts all the inputs into a single side of both input vectors and then
   7478 /// uses an unpack to interleave these inputs in a single vector. At that
   7479 /// point, we will fall back on the generic single input shuffle lowering.
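         ///
         /// For example, for the mask [0, 1, 8, 9, -1, -1, -1, -1] all four inputs
         /// already sit in the low halves, so the UNPCKL produces
         /// [V1[0], V2[0], V1[1], V2[1], ...] and the munged single-input mask
         /// [0, 2, 1, 3, -1, -1, -1, -1] restores the requested order.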
   7480 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
   7481                                                  SDValue V2,
   7482                                                  MutableArrayRef<int> Mask,
   7483                                                  const X86Subtarget *Subtarget,
   7484                                                  SelectionDAG &DAG) {
   7485   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
   7486   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
   7487   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
   7488   for (int i = 0; i < 8; ++i)
   7489     if (Mask[i] >= 0 && Mask[i] < 4)
   7490       LoV1Inputs.push_back(i);
   7491     else if (Mask[i] >= 4 && Mask[i] < 8)
   7492       HiV1Inputs.push_back(i);
   7493     else if (Mask[i] >= 8 && Mask[i] < 12)
   7494       LoV2Inputs.push_back(i);
   7495     else if (Mask[i] >= 12)
   7496       HiV2Inputs.push_back(i);
   7497 
   7498   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
   7499   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
   7500   (void)NumV1Inputs;
   7501   (void)NumV2Inputs;
   7502   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
   7503   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
   7504   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
   7505 
   7506   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
   7507                      HiV1Inputs.size() + HiV2Inputs.size();
   7508 
   7509   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
   7510                               ArrayRef<int> HiInputs, bool MoveToLo,
   7511                               int MaskOffset) {
   7512     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
   7513     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
   7514     if (BadInputs.empty())
   7515       return V;
   7516 
   7517     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7518     int MoveOffset = MoveToLo ? 0 : 4;
   7519 
   7520     if (GoodInputs.empty()) {
   7521       for (int BadInput : BadInputs) {
   7522         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
   7523         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
   7524       }
   7525     } else {
   7526       if (GoodInputs.size() == 2) {
    7527         // If the two good inputs are spread across two dwords, pack them
    7528         // into a single dword.
   7529         MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
   7530             Mask[GoodInputs[0]] - MaskOffset;
   7531         MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
   7532             Mask[GoodInputs[1]] - MaskOffset;
   7533         Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
    7534         Mask[GoodInputs[1]] = Mask[GoodInputs[1]] % 2 + MoveOffset + MaskOffset;
   7535       } else {
    7536         // Otherwise pin the good inputs in place.
   7537         for (int GoodInput : GoodInputs)
   7538           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
   7539       }
   7540 
   7541       int MoveMaskIdx =
   7542           std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
   7543           std::begin(MoveMask);
   7544       assert(MoveMaskIdx >= MoveOffset && "Established above");
   7545 
   7546       if (BadInputs.size() == 2) {
   7547         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
   7548         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
   7549         MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] =
   7550             Mask[BadInputs[0]] - MaskOffset;
   7551         MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] =
   7552             Mask[BadInputs[1]] - MaskOffset;
   7553         Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset;
   7554         Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset;
   7555       } else {
   7556         assert(BadInputs.size() == 1 && "All sizes handled");
   7557         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
   7558         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
   7559       }
   7560     }
   7561 
   7562     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
   7563                                 MoveMask);
   7564   };
   7565   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
   7566                         /*MaskOffset*/ 0);
   7567   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
   7568                         /*MaskOffset*/ 8);
   7569 
   7570   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
   7571   // cross-half traffic in the final shuffle.
   7572 
   7573   // Munge the mask to be a single-input mask after the unpack merges the
   7574   // results.
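           // An element whose (possibly moved) mask value is M sits at position
           // M % 4 within the merged half and comes from V1 exactly when
           // M / 8 == 0, so it lands at interleaved position 2 * (M % 4) + M / 8
           // after the unpack.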
   7575   for (int &M : Mask)
   7576     if (M != -1)
   7577       M = 2 * (M % 4) + (M / 8);
   7578 
   7579   return DAG.getVectorShuffle(
   7580       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
   7581                                   DL, MVT::v8i16, V1, V2),
   7582       DAG.getUNDEF(MVT::v8i16), Mask);
   7583 }
   7584 
   7585 /// \brief Generic lowering of 8-lane i16 shuffles.
   7586 ///
   7587 /// This handles both single-input shuffles and combined shuffle/blends with
   7588 /// two inputs. The single input shuffles are immediately delegated to
   7589 /// a dedicated lowering routine.
   7590 ///
   7591 /// The blends are lowered in one of three fundamental ways. If there are few
   7592 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
   7593 /// of the input is significantly cheaper when lowered as an interleaving of
   7594 /// the two inputs, try to interleave them. Otherwise, blend the low and high
   7595 /// halves of the inputs separately (making them have relatively few inputs)
   7596 /// and then concatenate them.
   7597 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7598                                        const X86Subtarget *Subtarget,
   7599                                        SelectionDAG &DAG) {
   7600   SDLoc DL(Op);
   7601   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
   7602   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   7603   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   7604   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7605   ArrayRef<int> OrigMask = SVOp->getMask();
   7606   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
   7607                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
   7608   MutableArrayRef<int> Mask(MaskStorage);
   7609 
   7610   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   7611 
   7612   auto isV1 = [](int M) { return M >= 0 && M < 8; };
   7613   auto isV2 = [](int M) { return M >= 8; };
   7614 
   7615   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
   7616   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
   7617 
   7618   if (NumV2Inputs == 0)
   7619     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
   7620 
   7621   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
   7622                             "to be V1-input shuffles.");
   7623 
   7624   if (NumV1Inputs + NumV2Inputs <= 4)
   7625     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
   7626 
   7627   // Check whether an interleaving lowering is likely to be more efficient.
   7628   // This isn't perfect but it is a strong heuristic that tends to work well on
   7629   // the kinds of shuffles that show up in practice.
   7630   //
   7631   // FIXME: Handle 1x, 2x, and 4x interleaving.
   7632   if (shouldLowerAsInterleaving(Mask)) {
   7633     // FIXME: Figure out whether we should pack these into the low or high
   7634     // halves.
   7635 
   7636     int EMask[8], OMask[8];
   7637     for (int i = 0; i < 4; ++i) {
   7638       EMask[i] = Mask[2*i];
   7639       OMask[i] = Mask[2*i + 1];
   7640       EMask[i + 4] = -1;
   7641       OMask[i + 4] = -1;
   7642     }
   7643 
   7644     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
   7645     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
   7646 
   7647     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
   7648   }
   7649 
   7650   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7651   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7652 
   7653   for (int i = 0; i < 4; ++i) {
   7654     LoBlendMask[i] = Mask[i];
   7655     HiBlendMask[i] = Mask[i + 4];
   7656   }
   7657 
   7658   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
   7659   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
   7660   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
   7661   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
   7662 
   7663   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
   7664                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
   7665 }
   7666 
   7667 /// \brief Generic lowering of v16i8 shuffles.
   7668 ///
   7669 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
   7670 /// detect any complexity reducing interleaving. If that doesn't help, it uses
   7671 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
   7672 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
   7673 /// back together.
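         ///
         /// For example, a single-input mask in which every adjacent pair of bytes
         /// duplicates one source byte (such as a partial splat) can typically be
         /// expressed as an i16 shuffle followed by an UNPCK that materializes the
         /// duplication.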
   7674 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7675                                        const X86Subtarget *Subtarget,
   7676                                        SelectionDAG &DAG) {
   7677   SDLoc DL(Op);
   7678   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
   7679   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   7680   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   7681   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7682   ArrayRef<int> OrigMask = SVOp->getMask();
   7683   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   7684   int MaskStorage[16] = {
   7685       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
   7686       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
   7687       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
   7688       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
   7689   MutableArrayRef<int> Mask(MaskStorage);
   7690   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
   7691   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
   7692 
   7693   // For single-input shuffles, there are some nicer lowering tricks we can use.
   7694   if (isSingleInputShuffleMask(Mask)) {
   7695     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
   7696     // Notably, this handles splat and partial-splat shuffles more efficiently.
   7697     // However, it only makes sense if the pre-duplication shuffle simplifies
   7698     // things significantly. Currently, this means we need to be able to
   7699     // express the pre-duplication shuffle as an i16 shuffle.
   7700     //
   7701     // FIXME: We should check for other patterns which can be widened into an
   7702     // i16 shuffle as well.
   7703     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
   7704       for (int i = 0; i < 16; i += 2) {
   7705         if (Mask[i] != Mask[i + 1])
   7706           return false;
   7707       }
   7708       return true;
   7709     };
   7710     auto tryToWidenViaDuplication = [&]() -> SDValue {
   7711       if (!canWidenViaDuplication(Mask))
   7712         return SDValue();
   7713       SmallVector<int, 4> LoInputs;
   7714       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
   7715                    [](int M) { return M >= 0 && M < 8; });
   7716       std::sort(LoInputs.begin(), LoInputs.end());
   7717       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
   7718                      LoInputs.end());
   7719       SmallVector<int, 4> HiInputs;
   7720       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
   7721                    [](int M) { return M >= 8; });
   7722       std::sort(HiInputs.begin(), HiInputs.end());
   7723       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
   7724                      HiInputs.end());
   7725 
   7726       bool TargetLo = LoInputs.size() >= HiInputs.size();
   7727       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
   7728       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
   7729 
   7730       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7731       SmallDenseMap<int, int, 8> LaneMap;
   7732       for (int I : InPlaceInputs) {
   7733         PreDupI16Shuffle[I/2] = I/2;
   7734         LaneMap[I] = I;
   7735       }
   7736       int j = TargetLo ? 0 : 4, je = j + 4;
   7737       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
    7738         // Check whether slot j already maps the word containing this input.
    7739         // This happens when two adjacent bytes come from the same source word.
   7740         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
   7741           // If we haven't yet mapped the input, search for a slot into which
   7742           // we can map it.
   7743           while (j < je && PreDupI16Shuffle[j] != -1)
   7744             ++j;
   7745 
   7746           if (j == je)
    7747             // We can't place the inputs into a single half with a simple
                     // i16 shuffle, so bail.
   7748             return SDValue();
   7749 
   7750           // Map this input with the i16 shuffle.
   7751           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
   7752         }
   7753 
   7754         // Update the lane map based on the mapping we ended up with.
   7755         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
   7756       }
   7757       V1 = DAG.getNode(
   7758           ISD::BITCAST, DL, MVT::v16i8,
   7759           DAG.getVectorShuffle(MVT::v8i16, DL,
   7760                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
   7761                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
   7762 
   7763       // Unpack the bytes to form the i16s that will be shuffled into place.
   7764       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
   7765                        MVT::v16i8, V1, V1);
   7766 
   7767       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7768       for (int i = 0; i < 16; i += 2) {
   7769         if (Mask[i] != -1)
   7770           PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
   7771         assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
   7772       }
   7773       return DAG.getNode(
   7774           ISD::BITCAST, DL, MVT::v16i8,
   7775           DAG.getVectorShuffle(MVT::v8i16, DL,
   7776                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
   7777                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
   7778     };
   7779     if (SDValue V = tryToWidenViaDuplication())
   7780       return V;
   7781   }
   7782 
   7783   // Check whether an interleaving lowering is likely to be more efficient.
   7784   // This isn't perfect but it is a strong heuristic that tends to work well on
   7785   // the kinds of shuffles that show up in practice.
   7786   //
   7787   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
   7788   if (shouldLowerAsInterleaving(Mask)) {
   7789     // FIXME: Figure out whether we should pack these into the low or high
   7790     // halves.
   7791 
   7792     int EMask[16], OMask[16];
   7793     for (int i = 0; i < 8; ++i) {
   7794       EMask[i] = Mask[2*i];
   7795       OMask[i] = Mask[2*i + 1];
   7796       EMask[i + 8] = -1;
   7797       OMask[i + 8] = -1;
   7798     }
   7799 
   7800     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
   7801     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
   7802 
   7803     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
   7804   }
   7805 
   7806   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7807   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7808   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7809   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   7810 
   7811   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
   7812                             MutableArrayRef<int> V1HalfBlendMask,
   7813                             MutableArrayRef<int> V2HalfBlendMask) {
   7814     for (int i = 0; i < 8; ++i)
   7815       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
   7816         V1HalfBlendMask[i] = HalfMask[i];
   7817         HalfMask[i] = i;
   7818       } else if (HalfMask[i] >= 16) {
   7819         V2HalfBlendMask[i] = HalfMask[i] - 16;
   7820         HalfMask[i] = i + 8;
   7821       }
   7822   };
   7823   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
   7824   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
   7825 
   7826   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
   7827 
   7828   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
   7829                              MutableArrayRef<int> HiBlendMask) {
   7830     SDValue V1, V2;
   7831     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
   7832     // them out and avoid using UNPCK{L,H} to extract the elements of V as
   7833     // i16s.
   7834     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
   7835                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
   7836         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
   7837                      [](int M) { return M >= 0 && M % 2 == 1; })) {
   7838       // Use a mask to drop the high bytes.
   7839       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
   7840       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
   7841                        DAG.getConstant(0x00FF, MVT::v8i16));
   7842 
   7843       // This will be a single vector shuffle instead of a blend so nuke V2.
   7844       V2 = DAG.getUNDEF(MVT::v8i16);
   7845 
   7846       // Squash the masks to point directly into V1.
   7847       for (int &M : LoBlendMask)
   7848         if (M >= 0)
   7849           M /= 2;
   7850       for (int &M : HiBlendMask)
   7851         if (M >= 0)
   7852           M /= 2;
   7853     } else {
   7854       // Otherwise just unpack the low half of V into V1 and the high half into
   7855       // V2 so that we can blend them as i16s.
   7856       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
   7857                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
   7858       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
   7859                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
   7860     }
   7861 
   7862     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
   7863     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
   7864     return std::make_pair(BlendedLo, BlendedHi);
   7865   };
   7866   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
   7867   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
   7868   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
   7869 
   7870   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
   7871   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
   7872 
   7873   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
   7874 }
   7875 
   7876 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
   7877 ///
   7878 /// This routine breaks down the specific type of 128-bit shuffle and
   7879 /// dispatches to the lowering routines accordingly.
   7880 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7881                                         MVT VT, const X86Subtarget *Subtarget,
   7882                                         SelectionDAG &DAG) {
   7883   switch (VT.SimpleTy) {
   7884   case MVT::v2i64:
   7885     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   7886   case MVT::v2f64:
   7887     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   7888   case MVT::v4i32:
   7889     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   7890   case MVT::v4f32:
   7891     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   7892   case MVT::v8i16:
   7893     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
   7894   case MVT::v16i8:
   7895     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
   7896 
   7897   default:
   7898     llvm_unreachable("Unimplemented!");
   7899   }
   7900 }
   7901 
   7902 /// \brief Tiny helper function to test whether adjacent masks are sequential.
   7903 static bool areAdjacentMasksSequential(ArrayRef<int> Mask) {
   7904   for (int i = 0, Size = Mask.size(); i < Size; i += 2)
   7905     if (Mask[i] + 1 != Mask[i+1])
   7906       return false;
   7907 
   7908   return true;
   7909 }
   7910 
   7911 /// \brief Top-level lowering for x86 vector shuffles.
   7912 ///
   7913 /// This handles decomposition, canonicalization, and lowering of all x86
   7914 /// vector shuffles. Most of the specific lowering strategies are encapsulated
   7915 /// above in helper routines. The canonicalization attempts to widen shuffles
    7916 /// to involve fewer lanes of wider elements, and consolidate symmetric
    7917 /// patterns so that only one of the two inputs needs to be tested, etc.
   7918 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   7919                                   SelectionDAG &DAG) {
   7920   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7921   ArrayRef<int> Mask = SVOp->getMask();
   7922   SDValue V1 = Op.getOperand(0);
   7923   SDValue V2 = Op.getOperand(1);
   7924   MVT VT = Op.getSimpleValueType();
   7925   int NumElements = VT.getVectorNumElements();
   7926   SDLoc dl(Op);
   7927 
   7928   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
   7929 
   7930   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   7931   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   7932   if (V1IsUndef && V2IsUndef)
   7933     return DAG.getUNDEF(VT);
   7934 
    7935   // When we create a shuffle node we put the UNDEF node in the second
    7936   // operand, but in some cases the first operand may be transformed to
    7937   // UNDEF. In that case we should just commute the node.
   7938   if (V1IsUndef)
   7939     return CommuteVectorShuffle(SVOp, DAG);
   7940 
   7941   // Check for non-undef masks pointing at an undef vector and make the masks
   7942   // undef as well. This makes it easier to match the shuffle based solely on
   7943   // the mask.
   7944   if (V2IsUndef)
   7945     for (int M : Mask)
   7946       if (M >= NumElements) {
   7947         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
   7948         for (int &M : NewMask)
   7949           if (M >= NumElements)
   7950             M = -1;
   7951         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
   7952       }
   7953 
   7954   // For integer vector shuffles, try to collapse them into a shuffle of fewer
    7955   // lanes of wider integers. We cap this so as not to form integers larger
    7956   // than i64, but it might be interesting to form i128 integers to handle
    7957   // flipping the low and high halves of AVX 256-bit vectors.
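           // For example, the v4i32 mask [0, 1, 6, 7] pairs up into the v2i64 mask
           // [0, 3] over bitcast inputs.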
   7958   if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
   7959       areAdjacentMasksSequential(Mask)) {
   7960     SmallVector<int, 8> NewMask;
   7961     for (int i = 0, Size = Mask.size(); i < Size; i += 2)
   7962       NewMask.push_back(Mask[i] / 2);
   7963     MVT NewVT =
   7964         MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
   7965                          VT.getVectorNumElements() / 2);
   7966     V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
   7967     V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
   7968     return DAG.getNode(ISD::BITCAST, dl, VT,
   7969                        DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
   7970   }
   7971 
   7972   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
   7973   for (int M : SVOp->getMask())
   7974     if (M < 0)
   7975       ++NumUndefElements;
   7976     else if (M < NumElements)
   7977       ++NumV1Elements;
   7978     else
   7979       ++NumV2Elements;
   7980 
   7981   // Commute the shuffle as needed such that more elements come from V1 than
   7982   // V2. This allows us to match the shuffle pattern strictly on how many
   7983   // elements come from V1 without handling the symmetric cases.
   7984   if (NumV2Elements > NumV1Elements)
   7985     return CommuteVectorShuffle(SVOp, DAG);
   7986 
   7987   // When the number of V1 and V2 elements are the same, try to minimize the
   7988   // number of uses of V2 in the low half of the vector.
   7989   if (NumV1Elements == NumV2Elements) {
   7990     int LowV1Elements = 0, LowV2Elements = 0;
   7991     for (int M : SVOp->getMask().slice(0, NumElements / 2))
   7992       if (M >= NumElements)
   7993         ++LowV2Elements;
   7994       else if (M >= 0)
   7995         ++LowV1Elements;
   7996     if (LowV2Elements > LowV1Elements)
   7997       return CommuteVectorShuffle(SVOp, DAG);
   7998   }
   7999 
   8000   // For each vector width, delegate to a specialized lowering routine.
   8001   if (VT.getSizeInBits() == 128)
   8002     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
   8003 
   8004   llvm_unreachable("Unimplemented!");
   8005 }
   8006 
   8007 
   8008 //===----------------------------------------------------------------------===//
   8009 // Legacy vector shuffle lowering
   8010 //
   8011 // This code is the legacy code handling vector shuffles until the above
   8012 // replaces its functionality and performance.
   8013 //===----------------------------------------------------------------------===//
   8014 
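         // Check whether the shuffle can be implemented as a single
         // immediate-controlled blend. On success, bit i of the mask returned
         // through MaskOut is set exactly when result element i is taken from V2
         // rather than V1 (for the two-lane v16i16 type the pattern must repeat in
         // both lanes). For example, the v8i16 mask [0, 9, 2, 11, 4, 13, 6, 15]
         // yields the immediate 0xAA.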
   8015 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
   8016                         bool hasInt256, unsigned *MaskOut = nullptr) {
   8017   MVT EltVT = VT.getVectorElementType();
   8018 
   8019   // There is no blend with immediate in AVX-512.
   8020   if (VT.is512BitVector())
   8021     return false;
   8022 
   8023   if (!hasSSE41 || EltVT == MVT::i8)
   8024     return false;
   8025   if (!hasInt256 && VT == MVT::v16i16)
   8026     return false;
   8027 
   8028   unsigned MaskValue = 0;
   8029   unsigned NumElems = VT.getVectorNumElements();
   8030   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
   8031   unsigned NumLanes = (NumElems - 1) / 8 + 1;
   8032   unsigned NumElemsInLane = NumElems / NumLanes;
   8033 
    8034   // A blend for v16i16 must be symmetric across both 128-bit lanes.
   8035   for (unsigned i = 0; i < NumElemsInLane; ++i) {
   8036 
   8037     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
   8038     int EltIdx = MaskVals[i];
   8039 
   8040     if ((EltIdx < 0 || EltIdx == (int)i) &&
   8041         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
   8042       continue;
   8043 
   8044     if (((unsigned)EltIdx == (i + NumElems)) &&
   8045         (SndLaneEltIdx < 0 ||
   8046          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
   8047       MaskValue |= (1 << i);
   8048     else
   8049       return false;
   8050   }
   8051 
   8052   if (MaskOut)
   8053     *MaskOut = MaskValue;
   8054   return true;
   8055 }
   8056 
   8057 // Try to lower a shuffle node into a simple blend instruction.
   8058 // This function assumes isBlendMask returns true for this
    8059 // ShuffleVectorSDNode.
   8060 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
   8061                                           unsigned MaskValue,
   8062                                           const X86Subtarget *Subtarget,
   8063                                           SelectionDAG &DAG) {
   8064   MVT VT = SVOp->getSimpleValueType(0);
   8065   MVT EltVT = VT.getVectorElementType();
    8066   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
    8067                      Subtarget->hasInt256()) &&
    8068          "Trying to lower a VECTOR_SHUFFLE to a Blend but with the "
    8069          "wrong mask");
   8070   SDValue V1 = SVOp->getOperand(0);
   8071   SDValue V2 = SVOp->getOperand(1);
   8072   SDLoc dl(SVOp);
   8073   unsigned NumElems = VT.getVectorNumElements();
   8074 
   8075   // Convert i32 vectors to floating point if it is not AVX2.
   8076   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
   8077   MVT BlendVT = VT;
   8078   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
   8079     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
   8080                                NumElems);
    8081     V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
    8082     V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
   8083   }
   8084 
   8085   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
   8086                             DAG.getConstant(MaskValue, MVT::i32));
   8087   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
   8088 }
   8089 
   8090 /// In vector type \p VT, return true if the element at index \p InputIdx
   8091 /// falls on a different 128-bit lane than \p OutputIdx.
   8092 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
   8093                                      unsigned OutputIdx) {
   8094   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   8095   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
   8096 }
   8097 
   8098 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
   8099 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
   8100 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
   8101 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
   8102 /// zero.
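         ///
         /// For example, for a v4i32 \p V1 with MaskVals = [1, -1, 2, 3], the
         /// generated v16i8 PSHUFB control vector would be
         /// [4, 5, 6, 7, 0x80, 0x80, 0x80, 0x80, 8, 9, 10, 11, 12, 13, 14, 15],
         /// where 0x80 zeroes the corresponding output byte.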
   8103 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
   8104                          SelectionDAG &DAG) {
   8105   MVT VT = V1.getSimpleValueType();
   8106   assert(VT.is128BitVector() || VT.is256BitVector());
   8107 
   8108   MVT EltVT = VT.getVectorElementType();
   8109   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
   8110   unsigned NumElts = VT.getVectorNumElements();
   8111 
   8112   SmallVector<SDValue, 32> PshufbMask;
   8113   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
   8114     int InputIdx = MaskVals[OutputIdx];
   8115     unsigned InputByteIdx;
   8116 
   8117     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
   8118       InputByteIdx = 0x80;
   8119     else {
    8120       // Crossing lanes is not allowed.
   8121       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
   8122         return SDValue();
   8123       InputByteIdx = InputIdx * EltSizeInBytes;
    8124       // The index is a byte offset within the 128-bit lane.
   8125       InputByteIdx &= 0xf;
   8126     }
   8127 
   8128     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
   8129       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
   8130       if (InputByteIdx != 0x80)
   8131         ++InputByteIdx;
   8132     }
   8133   }
   8134 
   8135   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
   8136   if (ShufVT != VT)
   8137     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
   8138   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
   8139                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
   8140 }
   8141 
   8142 // v8i16 shuffles - Prefer shuffles in the following order:
   8143 // 1. [all]   pshuflw, pshufhw, optional move
   8144 // 2. [ssse3] 1 x pshufb
   8145 // 3. [ssse3] 2 x pshufb + 1 x por
   8146 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
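         //
         // For example, [3, 2, 1, 0, 4, 5, 6, 7] is handled by a single pshuflw,
         // while masks drawing on more than two input quadwords fall back to the
         // pshufb or pextrw/pinsrw paths.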
   8147 static SDValue
   8148 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
   8149                          SelectionDAG &DAG) {
   8150   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8151   SDValue V1 = SVOp->getOperand(0);
   8152   SDValue V2 = SVOp->getOperand(1);
   8153   SDLoc dl(SVOp);
   8154   SmallVector<int, 8> MaskVals;
   8155 
   8156   // Determine if more than 1 of the words in each of the low and high quadwords
   8157   // of the result come from the same quadword of one of the two inputs.  Undef
   8158   // mask values count as coming from any quadword, for better codegen.
   8159   //
    8160   // LoQuad[i] and HiQuad[i] count how many words of the low and high halves
    8161   // of the result come from input quad i; quads 0-1 refer to V1, 2-3 to V2.
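           // For example, with MaskVals = [0, 1, 2, 3, 8, 9, 10, 11] every low
           // word comes from quad 0 and every high word from quad 2, so LoQuad
           // ends up {4, 0, 0, 0}, HiQuad {0, 0, 4, 0}, and the best quads are
           // 0 and 2.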
   8162   unsigned LoQuad[] = { 0, 0, 0, 0 };
   8163   unsigned HiQuad[] = { 0, 0, 0, 0 };
   8164   // Indices of quads used.
   8165   std::bitset<4> InputQuads;
   8166   for (unsigned i = 0; i < 8; ++i) {
   8167     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
   8168     int EltIdx = SVOp->getMaskElt(i);
   8169     MaskVals.push_back(EltIdx);
   8170     if (EltIdx < 0) {
   8171       ++Quad[0];
   8172       ++Quad[1];
   8173       ++Quad[2];
   8174       ++Quad[3];
   8175       continue;
   8176     }
   8177     ++Quad[EltIdx / 4];
   8178     InputQuads.set(EltIdx / 4);
   8179   }
   8180 
   8181   int BestLoQuad = -1;
   8182   unsigned MaxQuad = 1;
   8183   for (unsigned i = 0; i < 4; ++i) {
   8184     if (LoQuad[i] > MaxQuad) {
   8185       BestLoQuad = i;
   8186       MaxQuad = LoQuad[i];
   8187     }
   8188   }
   8189 
   8190   int BestHiQuad = -1;
   8191   MaxQuad = 1;
   8192   for (unsigned i = 0; i < 4; ++i) {
   8193     if (HiQuad[i] > MaxQuad) {
   8194       BestHiQuad = i;
   8195       MaxQuad = HiQuad[i];
   8196     }
   8197   }
   8198 
    8199   // For SSSE3, if all 8 words of the result come from only 1 quadword of each
   8200   // of the two input vectors, shuffle them into one input vector so only a
   8201   // single pshufb instruction is necessary. If there are more than 2 input
   8202   // quads, disable the next transformation since it does not help SSSE3.
   8203   bool V1Used = InputQuads[0] || InputQuads[1];
   8204   bool V2Used = InputQuads[2] || InputQuads[3];
   8205   if (Subtarget->hasSSSE3()) {
   8206     if (InputQuads.count() == 2 && V1Used && V2Used) {
   8207       BestLoQuad = InputQuads[0] ? 0 : 1;
   8208       BestHiQuad = InputQuads[2] ? 2 : 3;
   8209     }
   8210     if (InputQuads.count() > 2) {
   8211       BestLoQuad = -1;
   8212       BestHiQuad = -1;
   8213     }
   8214   }
   8215 
    8216   // If BestLoQuad or BestHiQuad is set, shuffle the quads together and update
    8217   // the shuffle mask.  A quad scored as -1 means that no single input quadword
    8218   // supplied more than one of its words.
   8219   SDValue NewV;
   8220   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
   8221     int MaskV[] = {
   8222       BestLoQuad < 0 ? 0 : BestLoQuad,
   8223       BestHiQuad < 0 ? 1 : BestHiQuad
   8224     };
   8225     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
   8226                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
   8227                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
   8228     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
   8229 
   8230     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
   8231     // source words for the shuffle, to aid later transformations.
   8232     bool AllWordsInNewV = true;
   8233     bool InOrder[2] = { true, true };
   8234     for (unsigned i = 0; i != 8; ++i) {
   8235       int idx = MaskVals[i];
   8236       if (idx != (int)i)
   8237         InOrder[i/4] = false;
   8238       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
   8239         continue;
   8240       AllWordsInNewV = false;
   8241       break;
   8242     }
   8243 
   8244     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
   8245     if (AllWordsInNewV) {
   8246       for (int i = 0; i != 8; ++i) {
   8247         int idx = MaskVals[i];
   8248         if (idx < 0)
   8249           continue;
   8250         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
   8251         if ((idx != i) && idx < 4)
   8252           pshufhw = false;
   8253         if ((idx != i) && idx > 3)
   8254           pshuflw = false;
   8255       }
   8256       V1 = NewV;
   8257       V2Used = false;
   8258       BestLoQuad = 0;
   8259       BestHiQuad = 1;
   8260     }
   8261 
   8262     // If we've eliminated the use of V2, and the new mask is a pshuflw or
   8263     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
   8264     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
   8265       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
   8266       unsigned TargetMask = 0;
   8267       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
   8268                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
   8269       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   8270       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
   8271                              getShufflePSHUFLWImmediate(SVOp);
   8272       V1 = NewV.getOperand(0);
   8273       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
   8274     }
   8275   }
   8276 
   8277   // Promote splats to a larger type which usually leads to more efficient code.
   8278   // FIXME: Is this true if pshufb is available?
   8279   if (SVOp->isSplat())
   8280     return PromoteSplat(SVOp, DAG);
   8281 
   8282   // If we have SSSE3, and all words of the result are from 1 input vector,
   8283   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
   8284   // is present, fall back to case 4.
   8285   if (Subtarget->hasSSSE3()) {
   8286     SmallVector<SDValue,16> pshufbMask;
   8287 
   8288     // If we have elements from both input vectors, set the high bit of the
   8289     // shuffle mask element to zero out elements that come from V2 in the V1
   8290     // mask, and elements that come from V1 in the V2 mask, so that the two
   8291     // results can be OR'd together.
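    // For example, given the two-input word mask <0,8,1,9,4,12,5,13>, the
    // pshufb byte mask for V1 zeroes the V2 slots (<0,1,0x80,0x80,2,3,...>)
    // and, after commuting the mask, the byte mask for V2 zeroes the V1
    // slots, so OR'ing the two shuffled results interleaves words from both.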
   8292     bool TwoInputs = V1Used && V2Used;
   8293     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
   8294     if (!TwoInputs)
   8295       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   8296 
   8297     // Calculate the shuffle mask for the second input, shuffle it, and
   8298     // OR it with the first shuffled input.
   8299     CommuteVectorShuffleMask(MaskVals, 8);
   8300     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
   8301     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   8302     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   8303   }
   8304 
   8305   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
   8306   // and update MaskVals with new element order.
   8307   std::bitset<8> InOrder;
   8308   if (BestLoQuad >= 0) {
   8309     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
   8310     for (int i = 0; i != 4; ++i) {
   8311       int idx = MaskVals[i];
   8312       if (idx < 0) {
   8313         InOrder.set(i);
   8314       } else if ((idx / 4) == BestLoQuad) {
   8315         MaskV[i] = idx & 3;
   8316         InOrder.set(i);
   8317       }
   8318     }
   8319     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
   8320                                 &MaskV[0]);
   8321 
   8322     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
   8323       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   8324       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
   8325                                   NewV.getOperand(0),
   8326                                   getShufflePSHUFLWImmediate(SVOp), DAG);
   8327     }
   8328   }
   8329 
  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
  // and update MaskVals with the new element order.
   8332   if (BestHiQuad >= 0) {
   8333     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
   8334     for (unsigned i = 4; i != 8; ++i) {
   8335       int idx = MaskVals[i];
   8336       if (idx < 0) {
   8337         InOrder.set(i);
   8338       } else if ((idx / 4) == BestHiQuad) {
   8339         MaskV[i] = (idx & 3) + 4;
   8340         InOrder.set(i);
   8341       }
   8342     }
   8343     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
   8344                                 &MaskV[0]);
   8345 
   8346     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
   8347       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   8348       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
   8349                                   NewV.getOperand(0),
   8350                                   getShufflePSHUFHWImmediate(SVOp), DAG);
   8351     }
   8352   }
   8353 
  // In case BestHiQuad and BestLoQuad are both -1, which means each quadword
  // has a word from each of the four input quadwords, calculate the InOrder
  // bitvector now before falling through to the insert/extract cleanup.
   8357   if (BestLoQuad == -1 && BestHiQuad == -1) {
   8358     NewV = V1;
   8359     for (int i = 0; i != 8; ++i)
   8360       if (MaskVals[i] < 0 || MaskVals[i] == i)
   8361         InOrder.set(i);
   8362   }
   8363 
   8364   // The other elements are put in the right place using pextrw and pinsrw.
   8365   for (unsigned i = 0; i != 8; ++i) {
   8366     if (InOrder[i])
   8367       continue;
   8368     int EltIdx = MaskVals[i];
   8369     if (EltIdx < 0)
   8370       continue;
   8371     SDValue ExtOp = (EltIdx < 8) ?
   8372       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
   8373                   DAG.getIntPtrConstant(EltIdx)) :
   8374       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
   8375                   DAG.getIntPtrConstant(EltIdx - 8));
   8376     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
   8377                        DAG.getIntPtrConstant(i));
   8378   }
   8379   return NewV;
   8380 }
   8381 
   8382 /// \brief v16i16 shuffles
   8383 ///
   8384 /// FIXME: We only support generation of a single pshufb currently.  We can
   8385 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
/// well (e.g. 2 x pshufb + 1 x por).
   8387 static SDValue
   8388 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
   8389   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8390   SDValue V1 = SVOp->getOperand(0);
   8391   SDValue V2 = SVOp->getOperand(1);
   8392   SDLoc dl(SVOp);
   8393 
   8394   if (V2.getOpcode() != ISD::UNDEF)
   8395     return SDValue();
   8396 
   8397   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
   8398   return getPSHUFB(MaskVals, V1, dl, DAG);
   8399 }
   8400 
   8401 // v16i8 shuffles - Prefer shuffles in the following order:
   8402 // 1. [ssse3] 1 x pshufb
   8403 // 2. [ssse3] 2 x pshufb + 1 x por
   8404 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
   8405 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   8406                                         const X86Subtarget* Subtarget,
   8407                                         SelectionDAG &DAG) {
   8408   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   8409   SDValue V1 = SVOp->getOperand(0);
   8410   SDValue V2 = SVOp->getOperand(1);
   8411   SDLoc dl(SVOp);
   8412   ArrayRef<int> MaskVals = SVOp->getMask();
   8413 
   8414   // Promote splats to a larger type which usually leads to more efficient code.
   8415   // FIXME: Is this true if pshufb is available?
   8416   if (SVOp->isSplat())
   8417     return PromoteSplat(SVOp, DAG);
   8418 
   8419   // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
   8421   // present, fall back to case 3.
   8422 
   8423   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
   8424   if (Subtarget->hasSSSE3()) {
   8425     SmallVector<SDValue,16> pshufbMask;
   8426 
   8427     // If all result elements are from one input vector, then only translate
   8428     // undef mask values to 0x80 (zero out result) in the pshufb mask.
   8429     //
   8430     // Otherwise, we have elements from both input vectors, and must zero out
   8431     // elements that come from V2 in the first mask, and V1 in the second mask
   8432     // so that we can OR them together.
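    // For example, a mask element of 20 (a byte from V2) becomes 0x80 in the
    // first pshufb mask below and 20 - 16 == 4 in the second one.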
   8433     for (unsigned i = 0; i != 16; ++i) {
   8434       int EltIdx = MaskVals[i];
   8435       if (EltIdx < 0 || EltIdx >= 16)
   8436         EltIdx = 0x80;
   8437       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
   8438     }
   8439     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
   8440                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   8441                                  MVT::v16i8, pshufbMask));
   8442 
   8443     // As PSHUFB will zero elements with negative indices, it's safe to ignore
   8444     // the 2nd operand if it's undefined or zero.
   8445     if (V2.getOpcode() == ISD::UNDEF ||
   8446         ISD::isBuildVectorAllZeros(V2.getNode()))
   8447       return V1;
   8448 
   8449     // Calculate the shuffle mask for the second input, shuffle it, and
   8450     // OR it with the first shuffled input.
   8451     pshufbMask.clear();
   8452     for (unsigned i = 0; i != 16; ++i) {
   8453       int EltIdx = MaskVals[i];
   8454       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
   8455       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
   8456     }
   8457     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
   8458                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   8459                                  MVT::v16i8, pshufbMask));
   8460     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   8461   }
   8462 
  // No SSSE3 - Calculate in-place words and then fix all out-of-place words
  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
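  // For example, if MaskVals[0..1] == {2, 3}, result word 0 is word 1 of V1
  // and moves with a single extract/insert pair in the loop below.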
   8466   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   8467   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
   8468   SDValue NewV = V1;
   8469   for (int i = 0; i != 8; ++i) {
   8470     int Elt0 = MaskVals[i*2];
   8471     int Elt1 = MaskVals[i*2+1];
   8472 
   8473     // This word of the result is all undef, skip it.
   8474     if (Elt0 < 0 && Elt1 < 0)
   8475       continue;
   8476 
   8477     // This word of the result is already in the correct place, skip it.
   8478     if ((Elt0 == i*2) && (Elt1 == i*2+1))
   8479       continue;
   8480 
   8481     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
   8482     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
   8483     SDValue InsElt;
   8484 
    // If Elt0 and Elt1 are defined, consecutive, and word-aligned, the pair
    // can be moved with a single extract and insert.
   8487     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
   8488       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
   8489                            DAG.getIntPtrConstant(Elt1 / 2));
   8490       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
   8491                         DAG.getIntPtrConstant(i));
   8492       continue;
   8493     }
   8494 
    // If Elt1 is defined, extract it from the appropriate source.  If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an OR later.
   8498     if (Elt1 >= 0) {
   8499       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
   8500                            DAG.getIntPtrConstant(Elt1 / 2));
   8501       if ((Elt1 & 1) == 0)
   8502         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
   8503                              DAG.getConstant(8,
   8504                                   TLI.getShiftAmountTy(InsElt.getValueType())));
   8505       else if (Elt0 >= 0)
   8506         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
   8507                              DAG.getConstant(0xFF00, MVT::i16));
   8508     }
   8509     // If Elt0 is defined, extract it from the appropriate source.  If the
   8510     // source byte is not also even, shift the extracted word right 8 bits. If
   8511     // Elt1 was also defined, OR the extracted values together before
   8512     // inserting them in the result.
   8513     if (Elt0 >= 0) {
   8514       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
   8515                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
   8516       if ((Elt0 & 1) != 0)
   8517         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
   8518                               DAG.getConstant(8,
   8519                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
   8520       else if (Elt1 >= 0)
   8521         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
   8522                              DAG.getConstant(0x00FF, MVT::i16));
   8523       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
   8524                          : InsElt0;
   8525     }
   8526     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
   8527                        DAG.getIntPtrConstant(i));
   8528   }
   8529   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
   8530 }
   8531 
   8532 // v32i8 shuffles - Translate to VPSHUFB if possible.
   8533 static
   8534 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
   8535                                  const X86Subtarget *Subtarget,
   8536                                  SelectionDAG &DAG) {
   8537   MVT VT = SVOp->getSimpleValueType(0);
   8538   SDValue V1 = SVOp->getOperand(0);
   8539   SDValue V2 = SVOp->getOperand(1);
   8540   SDLoc dl(SVOp);
   8541   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
   8542 
   8543   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   8544   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
   8545   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
   8546 
  // VPSHUFB may be generated if:
  // (1) one of the input vectors is undefined or a zeroinitializer.
  //     The mask value 0x80 puts 0 in the corresponding slot of the vector.
  // (2) the mask indexes don't cross the 128-bit lane boundary.
   8551   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
   8552       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
   8553     return SDValue();
   8554 
   8555   if (V1IsAllZero && !V2IsAllZero) {
   8556     CommuteVectorShuffleMask(MaskVals, 32);
   8557     V1 = V2;
   8558   }
   8559   return getPSHUFB(MaskVals, V1, dl, DAG);
   8560 }
   8561 
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4-wide
/// ones, or rewriting v4i32 / v4f32 as 2-wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements points to elements in
/// the right sequence. e.g.
   8566 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
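/// which, for v8i16, is rewritten as the v4i32 shuffle <1, 5, 0, 7> of the
/// bitcast operands.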
   8567 static
   8568 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
   8569                                  SelectionDAG &DAG) {
   8570   MVT VT = SVOp->getSimpleValueType(0);
   8571   SDLoc dl(SVOp);
   8572   unsigned NumElems = VT.getVectorNumElements();
   8573   MVT NewVT;
   8574   unsigned Scale;
   8575   switch (VT.SimpleTy) {
   8576   default: llvm_unreachable("Unexpected!");
   8577   case MVT::v2i64:
   8578   case MVT::v2f64:
   8579            return SDValue(SVOp, 0);
   8580   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
   8581   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
   8582   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
   8583   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
   8584   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
   8585   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
   8586   }
   8587 
   8588   SmallVector<int, 8> MaskVec;
   8589   for (unsigned i = 0; i != NumElems; i += Scale) {
   8590     int StartIdx = -1;
   8591     for (unsigned j = 0; j != Scale; ++j) {
   8592       int EltIdx = SVOp->getMaskElt(i+j);
   8593       if (EltIdx < 0)
   8594         continue;
   8595       if (StartIdx < 0)
   8596         StartIdx = (EltIdx / Scale);
   8597       if (EltIdx != (int)(StartIdx*Scale + j))
   8598         return SDValue();
   8599     }
   8600     MaskVec.push_back(StartIdx);
   8601   }
   8602 
   8603   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
   8604   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
   8605   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
   8606 }
   8607 
   8608 /// getVZextMovL - Return a zero-extending vector move low node.
   8609 ///
   8610 static SDValue getVZextMovL(MVT VT, MVT OpVT,
   8611                             SDValue SrcOp, SelectionDAG &DAG,
   8612                             const X86Subtarget *Subtarget, SDLoc dl) {
   8613   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
   8614     LoadSDNode *LD = nullptr;
   8615     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
   8616       LD = dyn_cast<LoadSDNode>(SrcOp);
   8617     if (!LD) {
   8618       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
   8619       // instead.
   8620       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
   8621       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
   8622           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   8623           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
   8624           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
   8625         // PR2108
   8626         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
   8627         return DAG.getNode(ISD::BITCAST, dl, VT,
   8628                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
   8629                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   8630                                                    OpVT,
   8631                                                    SrcOp.getOperand(0)
   8632                                                           .getOperand(0))));
   8633       }
   8634     }
   8635   }
   8636 
   8637   return DAG.getNode(ISD::BITCAST, dl, VT,
   8638                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
   8639                                  DAG.getNode(ISD::BITCAST, dl,
   8640                                              OpVT, SrcOp)));
   8641 }
   8642 
/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
/// which could not be matched by any known target specific shuffle.
   8645 static SDValue
   8646 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   8647 
   8648   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
   8649   if (NewOp.getNode())
   8650     return NewOp;
   8651 
   8652   MVT VT = SVOp->getSimpleValueType(0);
   8653 
   8654   unsigned NumElems = VT.getVectorNumElements();
   8655   unsigned NumLaneElems = NumElems / 2;
   8656 
   8657   SDLoc dl(SVOp);
   8658   MVT EltVT = VT.getVectorElementType();
   8659   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
   8660   SDValue Output[2];
   8661 
   8662   SmallVector<int, 16> Mask;
   8663   for (unsigned l = 0; l < 2; ++l) {
   8664     // Build a shuffle mask for the output, discovering on the fly which
   8665     // input vectors to use as shuffle operands (recorded in InputUsed).
   8666     // If building a suitable shuffle vector proves too hard, then bail
   8667     // out with UseBuildVector set.
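    // For example, for lane 0 of a v8f32 mask <0, 8, 1, 9, ...>, the inputs
    // are the low halves of V1 and V2 and the per-lane v4f32 mask built here
    // is <0, 4, 1, 5>.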
   8668     bool UseBuildVector = false;
   8669     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
   8670     unsigned LaneStart = l * NumLaneElems;
   8671     for (unsigned i = 0; i != NumLaneElems; ++i) {
   8672       // The mask element.  This indexes into the input.
   8673       int Idx = SVOp->getMaskElt(i+LaneStart);
   8674       if (Idx < 0) {
   8675         // the mask element does not index into any input vector.
   8676         Mask.push_back(-1);
   8677         continue;
   8678       }
   8679 
   8680       // The input vector this mask element indexes into.
   8681       int Input = Idx / NumLaneElems;
   8682 
   8683       // Turn the index into an offset from the start of the input vector.
   8684       Idx -= Input * NumLaneElems;
   8685 
   8686       // Find or create a shuffle vector operand to hold this input.
   8687       unsigned OpNo;
   8688       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
   8689         if (InputUsed[OpNo] == Input)
   8690           // This input vector is already an operand.
   8691           break;
   8692         if (InputUsed[OpNo] < 0) {
   8693           // Create a new operand for this input vector.
   8694           InputUsed[OpNo] = Input;
   8695           break;
   8696         }
   8697       }
   8698 
   8699       if (OpNo >= array_lengthof(InputUsed)) {
   8700         // More than two input vectors used!  Give up on trying to create a
   8701         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
   8702         UseBuildVector = true;
   8703         break;
   8704       }
   8705 
   8706       // Add the mask index for the new shuffle vector.
   8707       Mask.push_back(Idx + OpNo * NumLaneElems);
   8708     }
   8709 
   8710     if (UseBuildVector) {
   8711       SmallVector<SDValue, 16> SVOps;
   8712       for (unsigned i = 0; i != NumLaneElems; ++i) {
   8713         // The mask element.  This indexes into the input.
   8714         int Idx = SVOp->getMaskElt(i+LaneStart);
   8715         if (Idx < 0) {
   8716           SVOps.push_back(DAG.getUNDEF(EltVT));
   8717           continue;
   8718         }
   8719 
   8720         // The input vector this mask element indexes into.
   8721         int Input = Idx / NumElems;
   8722 
   8723         // Turn the index into an offset from the start of the input vector.
   8724         Idx -= Input * NumElems;
   8725 
   8726         // Extract the vector element by hand.
   8727         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
   8728                                     SVOp->getOperand(Input),
   8729                                     DAG.getIntPtrConstant(Idx)));
   8730       }
   8731 
   8732       // Construct the output using a BUILD_VECTOR.
   8733       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
   8734     } else if (InputUsed[0] < 0) {
   8735       // No input vectors were used! The result is undefined.
   8736       Output[l] = DAG.getUNDEF(NVT);
   8737     } else {
   8738       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
   8739                                         (InputUsed[0] % 2) * NumLaneElems,
   8740                                         DAG, dl);
   8741       // If only one input was used, use an undefined vector for the other.
   8742       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
   8743         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
   8744                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
   8745       // At least one input vector was used. Create a new shuffle vector.
   8746       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
   8747     }
   8748 
   8749     Mask.clear();
   8750   }
   8751 
   8752   // Concatenate the result back
   8753   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
   8754 }
   8755 
   8756 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
   8757 /// 4 elements, and match them with several different shuffle types.
   8758 static SDValue
   8759 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   8760   SDValue V1 = SVOp->getOperand(0);
   8761   SDValue V2 = SVOp->getOperand(1);
   8762   SDLoc dl(SVOp);
   8763   MVT VT = SVOp->getSimpleValueType(0);
   8764 
   8765   assert(VT.is128BitVector() && "Unsupported vector size");
   8766 
   8767   std::pair<int, int> Locs[4];
   8768   int Mask1[] = { -1, -1, -1, -1 };
   8769   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
   8770 
   8771   unsigned NumHi = 0;
   8772   unsigned NumLo = 0;
   8773   for (unsigned i = 0; i != 4; ++i) {
   8774     int Idx = PermMask[i];
   8775     if (Idx < 0) {
   8776       Locs[i] = std::make_pair(-1, -1);
   8777     } else {
   8778       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
   8779       if (Idx < 4) {
   8780         Locs[i] = std::make_pair(0, NumLo);
   8781         Mask1[NumLo] = Idx;
   8782         NumLo++;
   8783       } else {
   8784         Locs[i] = std::make_pair(1, NumHi);
   8785         if (2+NumHi < 4)
   8786           Mask1[2+NumHi] = Idx;
   8787         NumHi++;
   8788       }
   8789     }
   8790   }
   8791 
   8792   if (NumLo <= 2 && NumHi <= 2) {
    // No more than two elements come from either vector. This can be
    // implemented with two shuffles. The first shuffle gathers the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, puts the elements into the right order.
   8797     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   8798 
   8799     int Mask2[] = { -1, -1, -1, -1 };
   8800 
   8801     for (unsigned i = 0; i != 4; ++i)
   8802       if (Locs[i].first != -1) {
   8803         unsigned Idx = (i < 2) ? 0 : 4;
   8804         Idx += Locs[i].first * 2 + Locs[i].second;
   8805         Mask2[i] = Idx;
   8806       }
   8807 
   8808     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
   8809   }
   8810 
   8811   if (NumLo == 3 || NumHi == 3) {
   8812     // Otherwise, we must have three elements from one vector, call it X, and
   8813     // one element from the other, call it Y.  First, use a shufps to build an
   8814     // intermediate vector with the one element from Y and the element from X
   8815     // that will be in the same half in the final destination (the indexes don't
   8816     // matter). Then, use a shufps to build the final vector, taking the half
   8817     // containing the element from Y from the intermediate, and the other half
   8818     // from X.
   8819     if (NumHi == 3) {
   8820       // Normalize it so the 3 elements come from V1.
   8821       CommuteVectorShuffleMask(PermMask, 4);
   8822       std::swap(V1, V2);
   8823     }
   8824 
   8825     // Find the element from V2.
   8826     unsigned HiIndex;
   8827     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
   8828       int Val = PermMask[HiIndex];
   8829       if (Val < 0)
   8830         continue;
   8831       if (Val >= 4)
   8832         break;
   8833     }
   8834 
   8835     Mask1[0] = PermMask[HiIndex];
   8836     Mask1[1] = -1;
   8837     Mask1[2] = PermMask[HiIndex^1];
   8838     Mask1[3] = -1;
   8839     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   8840 
   8841     if (HiIndex >= 2) {
   8842       Mask1[0] = PermMask[0];
   8843       Mask1[1] = PermMask[1];
   8844       Mask1[2] = HiIndex & 1 ? 6 : 4;
   8845       Mask1[3] = HiIndex & 1 ? 4 : 6;
   8846       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   8847     }
   8848 
   8849     Mask1[0] = HiIndex & 1 ? 2 : 0;
   8850     Mask1[1] = HiIndex & 1 ? 0 : 2;
   8851     Mask1[2] = PermMask[2];
   8852     Mask1[3] = PermMask[3];
   8853     if (Mask1[2] >= 0)
   8854       Mask1[2] += 4;
   8855     if (Mask1[3] >= 0)
   8856       Mask1[3] += 4;
   8857     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
   8858   }
   8859 
   8860   // Break it into (shuffle shuffle_hi, shuffle_lo).
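  // For example, the mask <1, 0, 3, 2> is split into LoMask <1, 0, -1, -1>
  // and HiMask <3, 2, -1, -1>, and the two partial shuffles are recombined
  // with the final mask <0, 1, 4, 5>.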
   8861   int LoMask[] = { -1, -1, -1, -1 };
   8862   int HiMask[] = { -1, -1, -1, -1 };
   8863 
   8864   int *MaskPtr = LoMask;
   8865   unsigned MaskIdx = 0;
   8866   unsigned LoIdx = 0;
   8867   unsigned HiIdx = 2;
   8868   for (unsigned i = 0; i != 4; ++i) {
   8869     if (i == 2) {
   8870       MaskPtr = HiMask;
   8871       MaskIdx = 1;
   8872       LoIdx = 0;
   8873       HiIdx = 2;
   8874     }
   8875     int Idx = PermMask[i];
   8876     if (Idx < 0) {
   8877       Locs[i] = std::make_pair(-1, -1);
   8878     } else if (Idx < 4) {
   8879       Locs[i] = std::make_pair(MaskIdx, LoIdx);
   8880       MaskPtr[LoIdx] = Idx;
   8881       LoIdx++;
   8882     } else {
   8883       Locs[i] = std::make_pair(MaskIdx, HiIdx);
   8884       MaskPtr[HiIdx] = Idx;
   8885       HiIdx++;
   8886     }
   8887   }
   8888 
   8889   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
   8890   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
   8891   int MaskOps[] = { -1, -1, -1, -1 };
   8892   for (unsigned i = 0; i != 4; ++i)
   8893     if (Locs[i].first != -1)
   8894       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
   8895   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
   8896 }
   8897 
   8898 static bool MayFoldVectorLoad(SDValue V) {
   8899   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
   8900     V = V.getOperand(0);
   8901 
   8902   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   8903     V = V.getOperand(0);
   8904   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
   8905       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
   8906     // BUILD_VECTOR (load), undef
   8907     V = V.getOperand(0);
   8908 
   8909   return MayFoldLoad(V);
   8910 }
   8911 
   8912 static
   8913 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
   8914   MVT VT = Op.getSimpleValueType();
   8915 
  // Canonicalize to v2f64.
   8917   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
   8918   return DAG.getNode(ISD::BITCAST, dl, VT,
   8919                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
   8920                                           V1, DAG));
   8921 }
   8922 
   8923 static
   8924 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
   8925                         bool HasSSE2) {
   8926   SDValue V1 = Op.getOperand(0);
   8927   SDValue V2 = Op.getOperand(1);
   8928   MVT VT = Op.getSimpleValueType();
   8929 
   8930   assert(VT != MVT::v2i64 && "unsupported shuffle type");
   8931 
   8932   if (HasSSE2 && VT == MVT::v2f64)
   8933     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
   8934 
  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
   8936   return DAG.getNode(ISD::BITCAST, dl, VT,
   8937                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
   8938                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
   8939                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
   8940 }
   8941 
   8942 static
   8943 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
   8944   SDValue V1 = Op.getOperand(0);
   8945   SDValue V2 = Op.getOperand(1);
   8946   MVT VT = Op.getSimpleValueType();
   8947 
   8948   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
   8949          "unsupported shuffle type");
   8950 
   8951   if (V2.getOpcode() == ISD::UNDEF)
   8952     V2 = V1;
   8953 
   8954   // v4i32 or v4f32
   8955   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
   8956 }
   8957 
   8958 static
   8959 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   8960   SDValue V1 = Op.getOperand(0);
   8961   SDValue V2 = Op.getOperand(1);
   8962   MVT VT = Op.getSimpleValueType();
   8963   unsigned NumElems = VT.getVectorNumElements();
   8964 
  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
  // operand of these instructions can only be memory, so check if there's a
  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
  // same masks.
   8969   bool CanFoldLoad = false;
   8970 
   8971   // Trivial case, when V2 comes from a load.
   8972   if (MayFoldVectorLoad(V2))
   8973     CanFoldLoad = true;
   8974 
   8975   // When V1 is a load, it can be folded later into a store in isel, example:
   8976   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
   8977   //    turns into:
   8978   //  (MOVLPSmr addr:$src1, VR128:$src2)
   8979   // So, recognize this potential and also use MOVLPS or MOVLPD
   8980   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
   8981     CanFoldLoad = true;
   8982 
   8983   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8984   if (CanFoldLoad) {
   8985     if (HasSSE2 && NumElems == 2)
   8986       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
   8987 
   8988     if (NumElems == 4)
      // If we don't care about the second element, fall through and use
      // movss/movsd below instead.
   8990       if (SVOp->getMaskElt(1) != -1)
   8991         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
   8992   }
   8993 
  // movl and movlp will both match v2i64, but v2i64 is never matched by
  // movl earlier because we make it strict to avoid messing with the movlp
  // load folding logic (see the code above the getMOVLP call). Match it here
  // instead; this is horrible, but will stay like this until we move all
  // shuffle matching to x86 specific nodes. Note that for the 1st condition
  // all types are matched with movsd.
   9000   if (HasSSE2) {
   9001     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
   9002     // as to remove this logic from here, as much as possible
   9003     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
   9004       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
   9005     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
   9006   }
   9007 
   9008   assert(VT != MVT::v4i32 && "unsupported shuffle type");
   9009 
   9010   // Invert the operand order and use SHUFPS to match it.
   9011   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
   9012                               getShuffleSHUFImmediate(SVOp), DAG);
   9013 }
   9014 
   9015 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
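// Narrow a vector load down to a load of the single element at Index.
// For example, element 2 of a v4f32 load becomes an f32 load from the base
// address plus 8 (2 * sizeof(float)).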
   9016                                          SelectionDAG &DAG) {
   9017   SDLoc dl(Load);
   9018   MVT VT = Load->getSimpleValueType(0);
   9019   MVT EVT = VT.getVectorElementType();
   9020   SDValue Addr = Load->getOperand(1);
   9021   SDValue NewAddr = DAG.getNode(
   9022       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
   9023       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
   9024 
   9025   SDValue NewLoad =
   9026       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
   9027                   DAG.getMachineFunction().getMachineMemOperand(
   9028                       Load->getMemOperand(), 0, EVT.getStoreSize()));
   9029   return NewLoad;
   9030 }
   9031 
   9032 // It is only safe to call this function if isINSERTPSMask is true for
   9033 // this shufflevector mask.
   9034 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
   9035                            SelectionDAG &DAG) {
  // Generate an insertps instruction when inserting an f32 from memory into a
  // v4f32 or when copying an element from one v4f32 to another.
   9038   // We also use it for transferring i32 from one register to another,
   9039   // since it simply copies the same bits.
   9040   // If we're transferring an i32 from memory to a specific element in a
   9041   // register, we output a generic DAG that will match the PINSRD
   9042   // instruction.
   9043   MVT VT = SVOp->getSimpleValueType(0);
   9044   MVT EVT = VT.getVectorElementType();
   9045   SDValue V1 = SVOp->getOperand(0);
   9046   SDValue V2 = SVOp->getOperand(1);
   9047   auto Mask = SVOp->getMask();
   9048   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
   9049          "unsupported vector type for insertps/pinsrd");
   9050 
   9051   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
   9052   auto FromV2Predicate = [](const int &i) { return i >= 4; };
   9053   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
   9054 
   9055   SDValue From;
   9056   SDValue To;
   9057   unsigned DestIndex;
   9058   if (FromV1 == 1) {
   9059     From = V1;
   9060     To = V2;
   9061     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
   9062                 Mask.begin();
   9063   } else {
   9064     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
   9065            "More than one element from V1 and from V2, or no elements from one "
   9066            "of the vectors. This case should not have returned true from "
   9067            "isINSERTPSMask");
   9068     From = V2;
   9069     To = V1;
   9070     DestIndex =
   9071         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
   9072   }
   9073 
   9074   unsigned SrcIndex = Mask[DestIndex] % 4;
   9075   if (MayFoldLoad(From)) {
   9076     // Trivial case, when From comes from a load and is only used by the
   9077     // shuffle. Make it use insertps from the vector that we need from that
   9078     // load.
   9079     SDValue NewLoad =
   9080         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
   9081     if (!NewLoad.getNode())
   9082       return SDValue();
   9083 
   9084     if (EVT == MVT::f32) {
   9085       // Create this as a scalar to vector to match the instruction pattern.
   9086       SDValue LoadScalarToVector =
   9087           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
   9088       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
   9089       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
   9090                          InsertpsMask);
   9091     } else { // EVT == MVT::i32
   9092       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
   9093       // instruction, to match the PINSRD instruction, which loads an i32 to a
   9094       // certain vector element.
   9095       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
   9096                          DAG.getConstant(DestIndex, MVT::i32));
   9097     }
   9098   }
   9099 
   9100   // Vector-element-to-vector
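  // The insertps immediate encodes the source element in bits 7:6 and the
  // destination element in bits 5:4; e.g. copying element 2 of From into
  // element 1 of To yields the immediate (1 << 4) | (2 << 6) == 0x90.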
   9101   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
   9102   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
   9103 }
   9104 
   9105 // Reduce a vector shuffle to zext.
   9106 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
   9107                                     SelectionDAG &DAG) {
   9108   // PMOVZX is only available from SSE41.
   9109   if (!Subtarget->hasSSE41())
   9110     return SDValue();
   9111 
   9112   MVT VT = Op.getSimpleValueType();
   9113 
  // Only AVX2 supports 256-bit vector integer extension.
   9115   if (!Subtarget->hasInt256() && VT.is256BitVector())
   9116     return SDValue();
   9117 
   9118   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9119   SDLoc DL(Op);
   9120   SDValue V1 = Op.getOperand(0);
   9121   SDValue V2 = Op.getOperand(1);
   9122   unsigned NumElems = VT.getVectorNumElements();
   9123 
  // Extension is a unary operation, and the element type of the source vector
  // must not be equal to or larger than i64.
   9126   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
   9127       VT.getVectorElementType() == MVT::i64)
   9128     return SDValue();
   9129 
   9130   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
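  // For example, the v8i16 mask <0, -1, 1, -1, 2, -1, 3, -1> has Shift == 1
  // (ratio 2) and is lowered below as a VZEXT of V1 from v8i16 to v4i32,
  // bitcast back to v8i16.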
   9131   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
   9132   while ((1U << Shift) < NumElems) {
   9133     if (SVOp->getMaskElt(1U << Shift) == 1)
   9134       break;
   9135     Shift += 1;
   9136     // The maximal ratio is 8, i.e. from i8 to i64.
   9137     if (Shift > 3)
   9138       return SDValue();
   9139   }
   9140 
   9141   // Check the shuffle mask.
   9142   unsigned Mask = (1U << Shift) - 1;
   9143   for (unsigned i = 0; i != NumElems; ++i) {
   9144     int EltIdx = SVOp->getMaskElt(i);
   9145     if ((i & Mask) != 0 && EltIdx != -1)
   9146       return SDValue();
   9147     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
   9148       return SDValue();
   9149   }
   9150 
   9151   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
   9152   MVT NeVT = MVT::getIntegerVT(NBits);
   9153   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
   9154 
   9155   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
   9156     return SDValue();
   9157 
  // Simplify the operand as it's prepared to be fed into the shuffle.
   9159   unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
   9160   if (V1.getOpcode() == ISD::BITCAST &&
   9161       V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
   9162       V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   9163       V1.getOperand(0).getOperand(0)
   9164         .getSimpleValueType().getSizeInBits() == SignificantBits) {
   9165     // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
   9166     SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
   9167     ConstantSDNode *CIdx =
   9168       dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
    // If it's foldable, i.e. a normal load with a single use, we will let
    // code selection fold it. Otherwise, we will shorten the conversion
    // sequence.
   9171     if (CIdx && CIdx->getZExtValue() == 0 &&
   9172         (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
   9173       MVT FullVT = V.getSimpleValueType();
   9174       MVT V1VT = V1.getSimpleValueType();
   9175       if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
   9176         // The "ext_vec_elt" node is wider than the result node.
   9177         // In this case we should extract subvector from V.
   9178         // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
   9179         unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
   9180         MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
   9181                                         FullVT.getVectorNumElements()/Ratio);
   9182         V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
   9183                         DAG.getIntPtrConstant(0));
   9184       }
   9185       V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
   9186     }
   9187   }
   9188 
   9189   return DAG.getNode(ISD::BITCAST, DL, VT,
   9190                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
   9191 }
   9192 
   9193 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   9194                                       SelectionDAG &DAG) {
   9195   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9196   MVT VT = Op.getSimpleValueType();
   9197   SDLoc dl(Op);
   9198   SDValue V1 = Op.getOperand(0);
   9199   SDValue V2 = Op.getOperand(1);
   9200 
   9201   if (isZeroShuffle(SVOp))
   9202     return getZeroVector(VT, Subtarget, DAG, dl);
   9203 
   9204   // Handle splat operations
   9205   if (SVOp->isSplat()) {
   9206     // Use vbroadcast whenever the splat comes from a foldable load
   9207     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
   9208     if (Broadcast.getNode())
   9209       return Broadcast;
   9210   }
   9211 
   9212   // Check integer expanding shuffles.
   9213   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
   9214   if (NewOp.getNode())
   9215     return NewOp;
   9216 
   9217   // If the shuffle can be profitably rewritten as a narrower shuffle, then
   9218   // do it!
   9219   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
   9220       VT == MVT::v32i8) {
   9221     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
   9222     if (NewOp.getNode())
   9223       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
   9224   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
   9225     // FIXME: Figure out a cleaner way to do this.
   9226     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
   9227       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
   9228       if (NewOp.getNode()) {
   9229         MVT NewVT = NewOp.getSimpleValueType();
   9230         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
   9231                                NewVT, true, false))
   9232           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
   9233                               dl);
   9234       }
   9235     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
   9236       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
   9237       if (NewOp.getNode()) {
   9238         MVT NewVT = NewOp.getSimpleValueType();
   9239         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
   9240           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
   9241                               dl);
   9242       }
   9243     }
   9244   }
   9245   return SDValue();
   9246 }
   9247 
   9248 SDValue
   9249 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   9250   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9251   SDValue V1 = Op.getOperand(0);
   9252   SDValue V2 = Op.getOperand(1);
   9253   MVT VT = Op.getSimpleValueType();
   9254   SDLoc dl(Op);
   9255   unsigned NumElems = VT.getVectorNumElements();
   9256   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   9257   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   9258   bool V1IsSplat = false;
   9259   bool V2IsSplat = false;
   9260   bool HasSSE2 = Subtarget->hasSSE2();
   9261   bool HasFp256    = Subtarget->hasFp256();
   9262   bool HasInt256   = Subtarget->hasInt256();
   9263   MachineFunction &MF = DAG.getMachineFunction();
   9264   bool OptForSize = MF.getFunction()->getAttributes().
   9265     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
   9266 
   9267   // Check if we should use the experimental vector shuffle lowering. If so,
   9268   // delegate completely to that code path.
   9269   if (ExperimentalVectorShuffleLowering)
   9270     return lowerVectorShuffle(Op, Subtarget, DAG);
   9271 
   9272   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
   9273 
   9274   if (V1IsUndef && V2IsUndef)
   9275     return DAG.getUNDEF(VT);
   9276 
  // When we create a shuffle node we put the UNDEF node as the second
  // operand, but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
   9280   if (V1IsUndef)
   9281     return CommuteVectorShuffle(SVOp, DAG);
   9282 
   9283   // Vector shuffle lowering takes 3 steps:
   9284   //
   9285   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
   9286   //    narrowing and commutation of operands should be handled.
   9287   // 2) Matching of shuffles with known shuffle masks to x86 target specific
   9288   //    shuffle nodes.
   9289   // 3) Rewriting of unmatched masks into new generic shuffle operations,
   9290   //    so the shuffle can be broken into other shuffles and the legalizer can
   9291   //    try the lowering again.
   9292   //
   9293   // The general idea is that no vector_shuffle operation should be left to
   9294   // be matched during isel, all of them must be converted to a target specific
   9295   // node here.
   9296 
   9297   // Normalize the input vectors. Here splats, zeroed vectors, profitable
   9298   // narrowing and commutation of operands should be handled. The actual code
   9299   // doesn't include all of those, work in progress...
   9300   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
   9301   if (NewOp.getNode())
   9302     return NewOp;
   9303 
   9304   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
   9305 
   9306   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
   9307   // unpckh_undef). Only use pshufd if speed is more important than size.
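  // (The unpck forms are smaller: e.g. unpcklps xmm, xmm encodes in 3 bytes
  // versus 5 for pshufd with its immediate.)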
   9308   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
   9309     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   9310   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
   9311     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   9312 
   9313   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
   9314       V2IsUndef && MayFoldVectorLoad(V1))
   9315     return getMOVDDup(Op, dl, V1, DAG);
   9316 
   9317   if (isMOVHLPS_v_undef_Mask(M, VT))
   9318     return getMOVHighToLow(Op, dl, DAG);
   9319 
  // Used to match splats
   9321   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
   9322       (VT == MVT::v2f64 || VT == MVT::v2i64))
   9323     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   9324 
   9325   if (isPSHUFDMask(M, VT)) {
   9326     // The actual implementation will match the mask in the if above and then
   9327     // during isel it can match several different instructions, not only pshufd
   9328     // as its name says, sad but true, emulate the behavior for now...
   9329     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
   9330       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
   9331 
   9332     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
   9333 
   9334     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
   9335       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
   9336 
   9337     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
   9338       return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
   9339                                   DAG);
   9340 
   9341     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
   9342                                 TargetMask, DAG);
   9343   }
   9344 
   9345   if (isPALIGNRMask(M, VT, Subtarget))
   9346     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
   9347                                 getShufflePALIGNRImmediate(SVOp),
   9348                                 DAG);
   9349 
   9350   // Check if this can be converted into a logical shift.
   9351   bool isLeft = false;
   9352   unsigned ShAmt = 0;
   9353   SDValue ShVal;
   9354   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
   9355   if (isShift && ShVal.hasOneUse()) {
   9356     // If the shifted value has multiple uses, it may be cheaper to use
   9357     // v_set0 + movlhps or movhlps, etc.
   9358     MVT EltVT = VT.getVectorElementType();
   9359     ShAmt *= EltVT.getSizeInBits();
   9360     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
   9361   }
   9362 
   9363   if (isMOVLMask(M, VT)) {
   9364     if (ISD::isBuildVectorAllZeros(V1.getNode()))
   9365       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
   9366     if (!isMOVLPMask(M, VT)) {
   9367       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
   9368         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
   9369 
   9370       if (VT == MVT::v4i32 || VT == MVT::v4f32)
   9371         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
   9372     }
   9373   }
   9374 
   9375   // FIXME: fold these into legal mask.
   9376   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
   9377     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
   9378 
   9379   if (isMOVHLPSMask(M, VT))
   9380     return getMOVHighToLow(Op, dl, DAG);
   9381 
   9382   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
   9383     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
   9384 
   9385   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
   9386     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
   9387 
   9388   if (isMOVLPMask(M, VT))
   9389     return getMOVLP(Op, dl, DAG, HasSSE2);
   9390 
   9391   if (ShouldXformToMOVHLPS(M, VT) ||
   9392       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
   9393     return CommuteVectorShuffle(SVOp, DAG);
   9394 
   9395   if (isShift) {
   9396     // No better options. Use a vshldq / vsrldq.
   9397     MVT EltVT = VT.getVectorElementType();
   9398     ShAmt *= EltVT.getSizeInBits();
   9399     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
   9400   }
   9401 
   9402   bool Commuted = false;
   9403   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
   9404   // 1,1,1,1 -> v8i16 though.
   9405   BitVector UndefElements;
   9406   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
   9407     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
   9408       V1IsSplat = true;
   9409   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
   9410     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
   9411       V2IsSplat = true;
   9412 
   9413   // Canonicalize the splat or undef, if present, to be on the RHS.
   9414   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
   9415     CommuteVectorShuffleMask(M, NumElems);
   9416     std::swap(V1, V2);
   9417     std::swap(V1IsSplat, V2IsSplat);
   9418     Commuted = true;
   9419   }
   9420 
   9421   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
   9422     // Shuffling low element of v1 into undef, just return v1.
   9423     if (V2IsUndef)
   9424       return V1;
   9425     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
   9426     // the instruction selector will not match, so get a canonical MOVL with
   9427     // swapped operands to undo the commute.
   9428     return getMOVL(DAG, dl, VT, V2, V1);
   9429   }
   9430 
   9431   if (isUNPCKLMask(M, VT, HasInt256))
   9432     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   9433 
   9434   if (isUNPCKHMask(M, VT, HasInt256))
   9435     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   9436 
   9437   if (V2IsSplat) {
    // Normalize the mask so all entries that point to V2 point to its first
    // element, then try to match unpck{h|l} again. If it matches, return a
    // new vector_shuffle with the corrected mask.
   9441     SmallVector<int, 8> NewMask(M.begin(), M.end());
   9442     NormalizeMask(NewMask, NumElems);
   9443     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
   9444       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   9445     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
   9446       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   9447   }
   9448 
   9449   if (Commuted) {
    // Commute it back and try unpck* again.
   9451     // FIXME: this seems wrong.
   9452     CommuteVectorShuffleMask(M, NumElems);
   9453     std::swap(V1, V2);
   9454     std::swap(V1IsSplat, V2IsSplat);
   9455 
   9456     if (isUNPCKLMask(M, VT, HasInt256))
   9457       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   9458 
   9459     if (isUNPCKHMask(M, VT, HasInt256))
   9460       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   9461   }
   9462 
   9463   // Normalize the node to match x86 shuffle ops if needed
   9464   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
   9465     return CommuteVectorShuffle(SVOp, DAG);
   9466 
  // The checks below are all present in isShuffleMaskLegal, but they are
  // inlined here right now to enable us to directly emit target specific
  // nodes, and remove them one by one until they don't return Op anymore.
   9470 
   9471   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
   9472       SVOp->getSplatIndex() == 0 && V2IsUndef) {
   9473     if (VT == MVT::v2f64 || VT == MVT::v2i64)
   9474       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   9475   }
   9476 
   9477   if (isPSHUFHWMask(M, VT, HasInt256))
   9478     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
   9479                                 getShufflePSHUFHWImmediate(SVOp),
   9480                                 DAG);
   9481 
   9482   if (isPSHUFLWMask(M, VT, HasInt256))
   9483     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
   9484                                 getShufflePSHUFLWImmediate(SVOp),
   9485                                 DAG);
   9486 
   9487   unsigned MaskValue;
   9488   if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
   9489                   &MaskValue))
   9490     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
   9491 
   9492   if (isSHUFPMask(M, VT))
   9493     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
   9494                                 getShuffleSHUFImmediate(SVOp), DAG);
   9495 
   9496   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
   9497     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   9498   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
   9499     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   9500 
   9501   //===--------------------------------------------------------------------===//
    9502   // Generate target-specific nodes for 128- or 256-bit shuffles that are
    9503   // only supported in the AVX instruction set.
   9504   //
   9505 
   9506   // Handle VMOVDDUPY permutations
   9507   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
   9508     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
   9509 
   9510   // Handle VPERMILPS/D* permutations
   9511   if (isVPERMILPMask(M, VT)) {
   9512     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
   9513       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
   9514                                   getShuffleSHUFImmediate(SVOp), DAG);
   9515     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
   9516                                 getShuffleSHUFImmediate(SVOp), DAG);
   9517   }
   9518 
   9519   unsigned Idx;
   9520   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
   9521     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
   9522                               Idx*(NumElems/2), DAG, dl);
   9523 
   9524   // Handle VPERM2F128/VPERM2I128 permutations
   9525   if (isVPERM2X128Mask(M, VT, HasFp256))
   9526     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
   9527                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
   9528 
   9529   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
   9530     return getINSERTPS(SVOp, dl, DAG);
   9531 
   9532   unsigned Imm8;
   9533   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
   9534     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
   9535 
   9536   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
   9537       VT.is512BitVector()) {
   9538     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
   9539     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
   9540     SmallVector<SDValue, 16> permclMask;
   9541     for (unsigned i = 0; i != NumElems; ++i) {
   9542       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
   9543     }
   9544 
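         // Each constant in permclMask selects, per result lane, which source
         // element VPERMV/VPERMV3 places there; undef mask entries were mapped
         // to 0 above. A sketch with illustrative values: for v8i32, the mask
         // <7,6,5,4,3,2,1,0> makes VPERMV emit V1 reversed.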
   9545     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
   9546     if (V2IsUndef)
   9547       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
   9548       return DAG.getNode(X86ISD::VPERMV, dl, VT,
   9549                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
   9550     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
   9551                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
   9552   }
   9553 
   9554   //===--------------------------------------------------------------------===//
   9555   // Since no target specific shuffle was selected for this generic one,
   9556   // lower it into other known shuffles. FIXME: this isn't true yet, but
   9557   // this is the plan.
   9558   //
   9559 
   9560   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
   9561   if (VT == MVT::v8i16) {
   9562     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
   9563     if (NewOp.getNode())
   9564       return NewOp;
   9565   }
   9566 
   9567   if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
   9568     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
   9569     if (NewOp.getNode())
   9570       return NewOp;
   9571   }
   9572 
   9573   if (VT == MVT::v16i8) {
   9574     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
   9575     if (NewOp.getNode())
   9576       return NewOp;
   9577   }
   9578 
   9579   if (VT == MVT::v32i8) {
   9580     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
   9581     if (NewOp.getNode())
   9582       return NewOp;
   9583   }
   9584 
   9585   // Handle all 128-bit wide vectors with 4 elements, and match them with
   9586   // several different shuffle types.
   9587   if (NumElems == 4 && VT.is128BitVector())
   9588     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
   9589 
   9590   // Handle general 256-bit shuffles
   9591   if (VT.is256BitVector())
   9592     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
   9593 
   9594   return SDValue();
   9595 }
   9596 
   9597 // This function assumes its argument is a BUILD_VECTOR of constants or
    9598 // undef SDNodes, i.e., ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
   9599 // true.
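         // A worked example (an illustration, not part of the original comment):
         // for a v4i32 condition <-1,0,0,-1>, lanes 0 and 3 select the first
         // operand (encoded as 0) and lanes 1 and 2 select the second (encoded
         // as 1), so MaskValue comes out as 0b0110.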
   9600 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
   9601                                     unsigned &MaskValue) {
   9602   MaskValue = 0;
   9603   unsigned NumElems = BuildVector->getNumOperands();
   9604   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
   9605   unsigned NumLanes = (NumElems - 1) / 8 + 1;
   9606   unsigned NumElemsInLane = NumElems / NumLanes;
   9607 
    9608   // The blend for v16i16 must be symmetric for both lanes.
   9609   for (unsigned i = 0; i < NumElemsInLane; ++i) {
   9610     SDValue EltCond = BuildVector->getOperand(i);
   9611     SDValue SndLaneEltCond =
   9612         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
   9613 
   9614     int Lane1Cond = -1, Lane2Cond = -1;
   9615     if (isa<ConstantSDNode>(EltCond))
   9616       Lane1Cond = !isZero(EltCond);
   9617     if (isa<ConstantSDNode>(SndLaneEltCond))
   9618       Lane2Cond = !isZero(SndLaneEltCond);
   9619 
   9620     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
   9621       // Lane1Cond != 0, means we want the first argument.
   9622       // Lane1Cond == 0, means we want the second argument.
   9623       // The encoding of this argument is 0 for the first argument, 1
   9624       // for the second. Therefore, invert the condition.
   9625       MaskValue |= !Lane1Cond << i;
   9626     else if (Lane1Cond < 0)
   9627       MaskValue |= !Lane2Cond << i;
   9628     else
   9629       return false;
   9630   }
   9631   return true;
   9632 }
   9633 
   9634 // Try to lower a vselect node into a simple blend instruction.
   9635 static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
   9636                                    SelectionDAG &DAG) {
   9637   SDValue Cond = Op.getOperand(0);
   9638   SDValue LHS = Op.getOperand(1);
   9639   SDValue RHS = Op.getOperand(2);
   9640   SDLoc dl(Op);
   9641   MVT VT = Op.getSimpleValueType();
   9642   MVT EltVT = VT.getVectorElementType();
   9643   unsigned NumElems = VT.getVectorNumElements();
   9644 
   9645   // There is no blend with immediate in AVX-512.
   9646   if (VT.is512BitVector())
   9647     return SDValue();
   9648 
   9649   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
   9650     return SDValue();
   9651   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
   9652     return SDValue();
   9653 
   9654   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   9655     return SDValue();
   9656 
   9657   // Check the mask for BLEND and build the value.
   9658   unsigned MaskValue = 0;
   9659   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
   9660     return SDValue();
   9661 
    9662   // Convert i32 vectors to floating point if AVX2 is not available.
    9663   // AVX2 introduced the VPBLENDD instruction for 128 and 256-bit vectors.
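           // For instance (a sketch, not from the original source): without
           // AVX2 a v4i32 vselect with a constant condition becomes a BLENDPS
           // on v4f32 operands with the same immediate, and the result is
           // bitcast back to v4i32 below.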
   9664   MVT BlendVT = VT;
   9665   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
   9666     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
   9667                                NumElems);
    9668     LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
    9669     RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
   9670   }
   9671 
   9672   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
   9673                             DAG.getConstant(MaskValue, MVT::i32));
   9674   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
   9675 }
   9676 
   9677 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   9678   SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
   9679   if (BlendOp.getNode())
   9680     return BlendOp;
   9681 
   9682   // Some types for vselect were previously set to Expand, not Legal or
    9683   // Custom. Return an empty SDValue so we fall through to Expand after
   9684   // the Custom lowering phase.
   9685   MVT VT = Op.getSimpleValueType();
   9686   switch (VT.SimpleTy) {
   9687   default:
   9688     break;
   9689   case MVT::v8i16:
   9690   case MVT::v16i16:
   9691     return SDValue();
   9692   }
   9693 
   9694   // We couldn't create a "Blend with immediate" node.
   9695   // This node should still be legal, but we'll have to emit a blendv*
   9696   // instruction.
   9697   return Op;
   9698 }
   9699 
   9700 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   9701   MVT VT = Op.getSimpleValueType();
   9702   SDLoc dl(Op);
   9703 
   9704   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
   9705     return SDValue();
   9706 
   9707   if (VT.getSizeInBits() == 8) {
   9708     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
   9709                                   Op.getOperand(0), Op.getOperand(1));
   9710     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   9711                                   DAG.getValueType(VT));
   9712     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   9713   }
   9714 
   9715   if (VT.getSizeInBits() == 16) {
   9716     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   9717     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
   9718     if (Idx == 0)
   9719       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   9720                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   9721                                      DAG.getNode(ISD::BITCAST, dl,
   9722                                                  MVT::v4i32,
   9723                                                  Op.getOperand(0)),
   9724                                      Op.getOperand(1)));
   9725     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
   9726                                   Op.getOperand(0), Op.getOperand(1));
   9727     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   9728                                   DAG.getValueType(VT));
   9729     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   9730   }
   9731 
   9732   if (VT == MVT::f32) {
   9733     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
   9734     // the result back to FR32 register. It's only worth matching if the
   9735     // result has a single use which is a store or a bitcast to i32.  And in
   9736     // the case of a store, it's not worth it if the index is a constant 0,
   9737     // because a MOVSSmr can be used instead, which is smaller and faster.
   9738     if (!Op.hasOneUse())
   9739       return SDValue();
   9740     SDNode *User = *Op.getNode()->use_begin();
   9741     if ((User->getOpcode() != ISD::STORE ||
   9742          (isa<ConstantSDNode>(Op.getOperand(1)) &&
   9743           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
   9744         (User->getOpcode() != ISD::BITCAST ||
   9745          User->getValueType(0) != MVT::i32))
   9746       return SDValue();
   9747     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   9748                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
   9749                                               Op.getOperand(0)),
   9750                                               Op.getOperand(1));
   9751     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
   9752   }
   9753 
   9754   if (VT == MVT::i32 || VT == MVT::i64) {
    9755     // EXTRACTPS/PEXTRQ work with a constant index.
   9756     if (isa<ConstantSDNode>(Op.getOperand(1)))
   9757       return Op;
   9758   }
   9759   return SDValue();
   9760 }
   9761 
   9762 /// Extract one bit from mask vector, like v16i1 or v8i1.
   9763 /// AVX-512 feature.
   9764 SDValue
   9765 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
   9766   SDValue Vec = Op.getOperand(0);
   9767   SDLoc dl(Vec);
   9768   MVT VecVT = Vec.getSimpleValueType();
   9769   SDValue Idx = Op.getOperand(1);
   9770   MVT EltVT = Op.getSimpleValueType();
   9771 
   9772   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
   9773 
    9774   // A variable index can't be handled in mask registers;
    9775   // extend the vector to VR512 instead.
   9776   if (!isa<ConstantSDNode>(Idx)) {
   9777     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
   9778     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
   9779     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   9780                               ExtVT.getVectorElementType(), Ext, Idx);
   9781     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
   9782   }
   9783 
   9784   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   9785   const TargetRegisterClass* rc = getRegClassFor(VecVT);
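           // Isolate bit IdxVal by shifting it up into the MSB and back down to
           // bit 0. A worked example (an illustration, not from the original
           // source): for v16i1 the mask register is 16 bits, so MaxShift == 15;
           // extracting bit 5 shifts left by 10 (bit 5 -> bit 15) and then right
           // by 15 (bit 15 -> bit 0), clearing every other bit.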
    9786   unsigned MaxShift = rc->getSize()*8 - 1;
    9787   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
    9788                     DAG.getConstant(MaxShift - IdxVal, MVT::i8));
    9789   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
    9790                     DAG.getConstant(MaxShift, MVT::i8));
   9791   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
   9792                        DAG.getIntPtrConstant(0));
   9793 }
   9794 
   9795 SDValue
   9796 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   9797                                            SelectionDAG &DAG) const {
   9798   SDLoc dl(Op);
   9799   SDValue Vec = Op.getOperand(0);
   9800   MVT VecVT = Vec.getSimpleValueType();
   9801   SDValue Idx = Op.getOperand(1);
   9802 
   9803   if (Op.getSimpleValueType() == MVT::i1)
   9804     return ExtractBitFromMaskVector(Op, DAG);
   9805 
   9806   if (!isa<ConstantSDNode>(Idx)) {
   9807     if (VecVT.is512BitVector() ||
   9808         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
   9809          VecVT.getVectorElementType().getSizeInBits() == 32)) {
   9810 
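               // The approach (described here as a sketch): build a mask vector
               // whose lane 0 holds Idx, let VPERMV move element Idx of Vec into
               // lane 0, and then extract lane 0 with a constant index.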
   9811       MVT MaskEltVT =
   9812         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
   9813       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
   9814                                     MaskEltVT.getSizeInBits());
   9815 
   9816       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
   9817       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
   9818                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
   9819                                 Idx, DAG.getConstant(0, getPointerTy()));
   9820       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
   9821       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
   9822                         Perm, DAG.getConstant(0, getPointerTy()));
   9823     }
   9824     return SDValue();
   9825   }
   9826 
    9827   // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
    9828   // chunk containing the element and then extract the element from it.
   9829   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
   9830 
   9831     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   9832     // Get the 128-bit vector.
   9833     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
   9834     MVT EltVT = VecVT.getVectorElementType();
   9835 
   9836     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
   9837 
    9838     // Reduce IdxVal to the element's index within the extracted 128-bit
    9839     // chunk (e.g. element 5 of a v8i32 becomes index 1 of the upper chunk).
    9840     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
   9841     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
   9842                        DAG.getConstant(IdxVal, MVT::i32));
   9843   }
   9844 
   9845   assert(VecVT.is128BitVector() && "Unexpected vector length");
   9846 
   9847   if (Subtarget->hasSSE41()) {
   9848     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
   9849     if (Res.getNode())
   9850       return Res;
   9851   }
   9852 
   9853   MVT VT = Op.getSimpleValueType();
   9854   // TODO: handle v16i8.
   9855   if (VT.getSizeInBits() == 16) {
   9856     SDValue Vec = Op.getOperand(0);
   9857     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   9858     if (Idx == 0)
   9859       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   9860                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   9861                                      DAG.getNode(ISD::BITCAST, dl,
   9862                                                  MVT::v4i32, Vec),
   9863                                      Op.getOperand(1)));
    9864     // Transform it so it matches pextrw, which produces a 32-bit result.
   9865     MVT EltVT = MVT::i32;
   9866     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
   9867                                   Op.getOperand(0), Op.getOperand(1));
   9868     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
   9869                                   DAG.getValueType(VT));
   9870     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   9871   }
   9872 
   9873   if (VT.getSizeInBits() == 32) {
   9874     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   9875     if (Idx == 0)
   9876       return Op;
   9877 
   9878     // SHUFPS the element to the lowest double word, then movss.
   9879     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
   9880     MVT VVT = Op.getOperand(0).getSimpleValueType();
   9881     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   9882                                        DAG.getUNDEF(VVT), Mask);
   9883     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   9884                        DAG.getIntPtrConstant(0));
   9885   }
   9886 
   9887   if (VT.getSizeInBits() == 64) {
   9888     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   9889     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   9890     //        to match extract_elt for f64.
   9891     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   9892     if (Idx == 0)
   9893       return Op;
   9894 
   9895     // UNPCKHPD the element to the lowest double word, then movsd.
   9896     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
   9897     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
   9898     int Mask[2] = { 1, -1 };
   9899     MVT VVT = Op.getOperand(0).getSimpleValueType();
   9900     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   9901                                        DAG.getUNDEF(VVT), Mask);
   9902     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   9903                        DAG.getIntPtrConstant(0));
   9904   }
   9905 
   9906   return SDValue();
   9907 }
   9908 
   9909 static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   9910   MVT VT = Op.getSimpleValueType();
   9911   MVT EltVT = VT.getVectorElementType();
   9912   SDLoc dl(Op);
   9913 
   9914   SDValue N0 = Op.getOperand(0);
   9915   SDValue N1 = Op.getOperand(1);
   9916   SDValue N2 = Op.getOperand(2);
   9917 
   9918   if (!VT.is128BitVector())
   9919     return SDValue();
   9920 
   9921   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
   9922       isa<ConstantSDNode>(N2)) {
    9923     unsigned Opc;
    9924     if (VT == MVT::v8i16)
    9925       Opc = X86ISD::PINSRW;
    9926     else // v16i8 and any other vector of 8-bit elements
    9927       Opc = X86ISD::PINSRB;
   9930 
    9931     // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    9932     // second argument.
   9933     if (N1.getValueType() != MVT::i32)
   9934       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   9935     if (N2.getValueType() != MVT::i32)
   9936       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
   9937     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
   9938   }
   9939 
   9940   if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
   9941     // Bits [7:6] of the constant are the source select.  This will always be
   9942     //  zero here.  The DAG Combiner may combine an extract_elt index into these
   9943     //  bits.  For example (insert (extract, 3), 2) could be matched by putting
   9944     //  the '3' into bits [7:6] of X86ISD::INSERTPS.
   9945     // Bits [5:4] of the constant are the destination select.  This is the
   9946     //  value of the incoming immediate.
   9947     // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
   9948     //   combine either bitwise AND or insert of float 0.0 to set these bits.
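             // A worked example (illustrative, not from the original comment):
             // lowering (insert (extract V2, 2), 1) as INSERTPS would use source
             // select 2 in bits [7:6], destination select 1 in bits [5:4], and a
             // zero mask of 0, i.e. an immediate of 0b10010000 = 0x90.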
   9949     N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    9950     // Create this as a scalar-to-vector node.
   9951     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   9952     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
   9953   }
   9954 
   9955   if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
    9956     // PINSR* works with a constant index.
   9957     return Op;
   9958   }
   9959   return SDValue();
   9960 }
   9961 
   9962 /// Insert one bit to mask vector, like v16i1 or v8i1.
   9963 /// AVX-512 feature.
   9964 SDValue
   9965 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
   9966   SDLoc dl(Op);
   9967   SDValue Vec = Op.getOperand(0);
   9968   SDValue Elt = Op.getOperand(1);
   9969   SDValue Idx = Op.getOperand(2);
   9970   MVT VecVT = Vec.getSimpleValueType();
   9971 
   9972   if (!isa<ConstantSDNode>(Idx)) {
    9973     // Non-constant index: extend the source and destination,
    9974     // insert the element, and then truncate the result.
   9975     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
   9976     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
   9977     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
   9978       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
   9979       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
   9980     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
   9981   }
   9982 
   9983   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   9984   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
   9985   if (Vec.getOpcode() == ISD::UNDEF)
   9986     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
   9987                        DAG.getConstant(IdxVal, MVT::i8));
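           // Merge the new bit into Vec: shift it up to the MSB and back down to
           // position IdxVal so that every other lane of EltInVec is cleared,
           // then OR the two vectors. A sketch: for v16i1 (MaxShift == 15) and
           // IdxVal == 3, this is shl 15 followed by srl 12, leaving the bit at
           // position 3.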
   9988   const TargetRegisterClass* rc = getRegClassFor(VecVT);
    9989   unsigned MaxShift = rc->getSize()*8 - 1;
    9990   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
    9991                     DAG.getConstant(MaxShift, MVT::i8));
    9992   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
    9993                     DAG.getConstant(MaxShift - IdxVal, MVT::i8));
   9994   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
    9995 }

    9996 SDValue
   9997 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
   9998   MVT VT = Op.getSimpleValueType();
   9999   MVT EltVT = VT.getVectorElementType();
   10000 
   10001   if (EltVT == MVT::i1)
   10002     return InsertBitToMaskVector(Op, DAG);
   10003 
   10004   SDLoc dl(Op);
   10005   SDValue N0 = Op.getOperand(0);
   10006   SDValue N1 = Op.getOperand(1);
   10007   SDValue N2 = Op.getOperand(2);
   10008 
    10009   // If this is a 256-bit or 512-bit vector result, first extract the
    10010   // 128-bit chunk, insert the element into it, and then place it back.
   10011   if (VT.is256BitVector() || VT.is512BitVector()) {
   10012     if (!isa<ConstantSDNode>(N2))
   10013       return SDValue();
   10014 
   10015     // Get the desired 128-bit vector half.
   10016     unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
   10017     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
   10018 
   10019     // Insert the element into the desired half.
   10020     unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
   10021     unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
   10022 
   10023     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
   10024                     DAG.getConstant(IdxIn128, MVT::i32));
   10025 
   10026     // Insert the changed part back to the 256-bit vector
   10027     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
   10028   }
   10029 
   10030   if (Subtarget->hasSSE41())
   10031     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
   10032 
   10033   if (EltVT == MVT::i8)
   10034     return SDValue();
   10035 
   10036   if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
   10037     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
   10038     // as its second argument.
   10039     if (N1.getValueType() != MVT::i32)
   10040       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   10041     if (N2.getValueType() != MVT::i32)
   10042       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
   10043     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   10044   }
   10045   return SDValue();
   10046 }
   10047 
   10048 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   10049   SDLoc dl(Op);
   10050   MVT OpVT = Op.getSimpleValueType();
   10051 
    10052   // If this is a wider-than-128-bit vector result, first insert into a
    10053   // 128-bit vector and then insert that into the full-width vector.
   10054   if (!OpVT.is128BitVector()) {
   10055     // Insert into a 128-bit vector.
   10056     unsigned SizeFactor = OpVT.getSizeInBits()/128;
   10057     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
   10058                                  OpVT.getVectorNumElements() / SizeFactor);
   10059 
   10060     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
   10061 
   10062     // Insert the 128-bit vector.
   10063     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
   10064   }
   10065 
   10066   if (OpVT == MVT::v1i64 &&
   10067       Op.getOperand(0).getValueType() == MVT::i64)
   10068     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
   10069 
   10070   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   10071   assert(OpVT.is128BitVector() && "Expected an SSE type!");
   10072   return DAG.getNode(ISD::BITCAST, dl, OpVT,
   10073                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
   10074 }
   10075 
   10076 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
   10077 // a simple subregister reference or explicit instructions to grab
   10078 // upper bits of a vector.
   10079 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   10080                                       SelectionDAG &DAG) {
   10081   SDLoc dl(Op);
   10082   SDValue In =  Op.getOperand(0);
   10083   SDValue Idx = Op.getOperand(1);
   10084   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   10085   MVT ResVT   = Op.getSimpleValueType();
   10086   MVT InVT    = In.getSimpleValueType();
   10087 
   10088   if (Subtarget->hasFp256()) {
   10089     if (ResVT.is128BitVector() &&
   10090         (InVT.is256BitVector() || InVT.is512BitVector()) &&
   10091         isa<ConstantSDNode>(Idx)) {
   10092       return Extract128BitVector(In, IdxVal, DAG, dl);
   10093     }
   10094     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
   10095         isa<ConstantSDNode>(Idx)) {
   10096       return Extract256BitVector(In, IdxVal, DAG, dl);
   10097     }
   10098   }
   10099   return SDValue();
   10100 }
   10101 
   10102 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
   10103 // simple superregister reference or explicit instructions to insert
   10104 // the upper bits of a vector.
   10105 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   10106                                      SelectionDAG &DAG) {
   10107   if (Subtarget->hasFp256()) {
   10108     SDLoc dl(Op.getNode());
   10109     SDValue Vec = Op.getNode()->getOperand(0);
   10110     SDValue SubVec = Op.getNode()->getOperand(1);
   10111     SDValue Idx = Op.getNode()->getOperand(2);
   10112 
   10113     if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
   10114          Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
   10115         SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
   10116         isa<ConstantSDNode>(Idx)) {
   10117       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   10118       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
   10119     }
   10120 
   10121     if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
   10122         SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
   10123         isa<ConstantSDNode>(Idx)) {
   10124       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   10125       return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
   10126     }
   10127   }
   10128   return SDValue();
   10129 }
   10130 
   10131 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
    10132 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
    10133 // one of the above-mentioned nodes. It has to be wrapped because otherwise
    10134 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
    10135 // be used to form an addressing mode. These wrapped nodes will be selected
   10136 // into MOV32ri.
   10137 SDValue
   10138 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   10139   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   10140 
   10141   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   10142   // global base reg.
   10143   unsigned char OpFlag = 0;
   10144   unsigned WrapperKind = X86ISD::Wrapper;
   10145   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10146 
   10147   if (Subtarget->isPICStyleRIPRel() &&
   10148       (M == CodeModel::Small || M == CodeModel::Kernel))
   10149     WrapperKind = X86ISD::WrapperRIP;
   10150   else if (Subtarget->isPICStyleGOT())
   10151     OpFlag = X86II::MO_GOTOFF;
   10152   else if (Subtarget->isPICStyleStubPIC())
   10153     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   10154 
   10155   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
   10156                                              CP->getAlignment(),
   10157                                              CP->getOffset(), OpFlag);
   10158   SDLoc DL(CP);
   10159   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   10160   // With PIC, the address is actually $g + Offset.
   10161   if (OpFlag) {
   10162     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   10163                          DAG.getNode(X86ISD::GlobalBaseReg,
   10164                                      SDLoc(), getPointerTy()),
   10165                          Result);
   10166   }
   10167 
   10168   return Result;
   10169 }
   10170 
   10171 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   10172   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   10173 
   10174   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   10175   // global base reg.
   10176   unsigned char OpFlag = 0;
   10177   unsigned WrapperKind = X86ISD::Wrapper;
   10178   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10179 
   10180   if (Subtarget->isPICStyleRIPRel() &&
   10181       (M == CodeModel::Small || M == CodeModel::Kernel))
   10182     WrapperKind = X86ISD::WrapperRIP;
   10183   else if (Subtarget->isPICStyleGOT())
   10184     OpFlag = X86II::MO_GOTOFF;
   10185   else if (Subtarget->isPICStyleStubPIC())
   10186     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   10187 
   10188   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
   10189                                           OpFlag);
   10190   SDLoc DL(JT);
   10191   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   10192 
   10193   // With PIC, the address is actually $g + Offset.
   10194   if (OpFlag)
   10195     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   10196                          DAG.getNode(X86ISD::GlobalBaseReg,
   10197                                      SDLoc(), getPointerTy()),
   10198                          Result);
   10199 
   10200   return Result;
   10201 }
   10202 
   10203 SDValue
   10204 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   10205   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   10206 
   10207   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   10208   // global base reg.
   10209   unsigned char OpFlag = 0;
   10210   unsigned WrapperKind = X86ISD::Wrapper;
   10211   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10212 
   10213   if (Subtarget->isPICStyleRIPRel() &&
   10214       (M == CodeModel::Small || M == CodeModel::Kernel)) {
   10215     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
   10216       OpFlag = X86II::MO_GOTPCREL;
   10217     WrapperKind = X86ISD::WrapperRIP;
   10218   } else if (Subtarget->isPICStyleGOT()) {
   10219     OpFlag = X86II::MO_GOT;
   10220   } else if (Subtarget->isPICStyleStubPIC()) {
   10221     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
   10222   } else if (Subtarget->isPICStyleStubNoDynamic()) {
   10223     OpFlag = X86II::MO_DARWIN_NONLAZY;
   10224   }
   10225 
   10226   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
   10227 
   10228   SDLoc DL(Op);
   10229   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   10230 
   10231   // With PIC, the address is actually $g + Offset.
   10232   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
   10233       !Subtarget->is64Bit()) {
   10234     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   10235                          DAG.getNode(X86ISD::GlobalBaseReg,
   10236                                      SDLoc(), getPointerTy()),
   10237                          Result);
   10238   }
   10239 
   10240   // For symbols that require a load from a stub to get the address, emit the
   10241   // load.
   10242   if (isGlobalStubReference(OpFlag))
   10243     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
   10244                          MachinePointerInfo::getGOT(), false, false, false, 0);
   10245 
   10246   return Result;
   10247 }
   10248 
   10249 SDValue
   10250 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   10251   // Create the TargetBlockAddressAddress node.
   10252   unsigned char OpFlags =
   10253     Subtarget->ClassifyBlockAddressReference();
   10254   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10255   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   10256   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   10257   SDLoc dl(Op);
   10258   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
   10259                                              OpFlags);
   10260 
   10261   if (Subtarget->isPICStyleRIPRel() &&
   10262       (M == CodeModel::Small || M == CodeModel::Kernel))
   10263     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   10264   else
   10265     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   10266 
   10267   // With PIC, the address is actually $g + Offset.
   10268   if (isGlobalRelativeToPICBase(OpFlags)) {
   10269     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   10270                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   10271                          Result);
   10272   }
   10273 
   10274   return Result;
   10275 }
   10276 
   10277 SDValue
   10278 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
   10279                                       int64_t Offset, SelectionDAG &DAG) const {
   10280   // Create the TargetGlobalAddress node, folding in the constant
   10281   // offset if it is legal.
   10282   unsigned char OpFlags =
   10283       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
   10284   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10285   SDValue Result;
   10286   if (OpFlags == X86II::MO_NO_FLAG &&
   10287       X86::isOffsetSuitableForCodeModel(Offset, M)) {
   10288     // A direct static reference to a global.
   10289     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
   10290     Offset = 0;
   10291   } else {
   10292     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
   10293   }
   10294 
   10295   if (Subtarget->isPICStyleRIPRel() &&
   10296       (M == CodeModel::Small || M == CodeModel::Kernel))
   10297     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   10298   else
   10299     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   10300 
   10301   // With PIC, the address is actually $g + Offset.
   10302   if (isGlobalRelativeToPICBase(OpFlags)) {
   10303     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   10304                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   10305                          Result);
   10306   }
   10307 
   10308   // For globals that require a load from a stub to get the address, emit the
   10309   // load.
   10310   if (isGlobalStubReference(OpFlags))
   10311     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
   10312                          MachinePointerInfo::getGOT(), false, false, false, 0);
   10313 
   10314   // If there was a non-zero offset that we didn't fold, create an explicit
   10315   // addition for it.
   10316   if (Offset != 0)
   10317     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
   10318                          DAG.getConstant(Offset, getPointerTy()));
   10319 
   10320   return Result;
   10321 }
   10322 
   10323 SDValue
   10324 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   10325   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   10326   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
   10327   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
   10328 }
   10329 
   10330 static SDValue
   10331 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   10332            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
   10333            unsigned char OperandFlags, bool LocalDynamic = false) {
   10334   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   10335   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   10336   SDLoc dl(GA);
   10337   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   10338                                            GA->getValueType(0),
   10339                                            GA->getOffset(),
   10340                                            OperandFlags);
   10341 
   10342   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
   10343                                            : X86ISD::TLSADDR;
   10344 
   10345   if (InFlag) {
   10346     SDValue Ops[] = { Chain,  TGA, *InFlag };
   10347     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   10348   } else {
   10349     SDValue Ops[]  = { Chain, TGA };
   10350     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   10351   }
   10352 
    10353   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
   10354   MFI->setAdjustsStack(true);
   10355 
   10356   SDValue Flag = Chain.getValue(1);
   10357   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
   10358 }
   10359 
   10360 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
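          // Roughly, this produces the standard 32-bit general-dynamic sequence
          // (a sketch; the exact registers are assigned later):
          //   leal x@TLSGD(,%ebx,1), %eax
          //   call ___tls_get_addr@PLT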
   10361 static SDValue
   10362 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   10363                                 const EVT PtrVT) {
   10364   SDValue InFlag;
    10365   SDValue InFlag;
    10366   SDLoc dl(GA);  // FIXME: the function entry point might be a better SDLoc.
   10366   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   10367                                    DAG.getNode(X86ISD::GlobalBaseReg,
   10368                                                SDLoc(), PtrVT), InFlag);
   10369   InFlag = Chain.getValue(1);
   10370 
   10371   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
   10372 }
   10373 
   10374 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
   10375 static SDValue
   10376 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   10377                                 const EVT PtrVT) {
   10378   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
   10379                     X86::RAX, X86II::MO_TLSGD);
   10380 }
   10381 
   10382 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   10383                                            SelectionDAG &DAG,
   10384                                            const EVT PtrVT,
   10385                                            bool is64Bit) {
   10386   SDLoc dl(GA);
   10387 
   10388   // Get the start address of the TLS block for this module.
   10389   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
   10390       .getInfo<X86MachineFunctionInfo>();
   10391   MFI->incNumLocalDynamicTLSAccesses();
   10392 
   10393   SDValue Base;
   10394   if (is64Bit) {
   10395     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
   10396                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
   10397   } else {
   10398     SDValue InFlag;
   10399     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   10400         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
   10401     InFlag = Chain.getValue(1);
   10402     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
   10403                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
   10404   }
   10405 
   10406   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
   10407   // of Base.
   10408 
   10409   // Build x@dtpoff.
   10410   unsigned char OperandFlags = X86II::MO_DTPOFF;
   10411   unsigned WrapperKind = X86ISD::Wrapper;
   10412   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   10413                                            GA->getValueType(0),
   10414                                            GA->getOffset(), OperandFlags);
   10415   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   10416 
   10417   // Add x@dtpoff with the base.
   10418   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
   10419 }
   10420 
   10421 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
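          // As a sketch of the 32-bit local-exec case (illustrative only; see
          // the comments below for the other variants):
          //   movl %gs:0, %eax
          //   addl x@ntpoff, %eax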
   10422 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   10423                                    const EVT PtrVT, TLSModel::Model model,
   10424                                    bool is64Bit, bool isPIC) {
   10425   SDLoc dl(GA);
   10426 
   10427   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
   10428   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
   10429                                                          is64Bit ? 257 : 256));
   10430 
   10431   SDValue ThreadPointer =
   10432       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
   10433                   MachinePointerInfo(Ptr), false, false, false, 0);
   10434 
   10435   unsigned char OperandFlags = 0;
    10436   // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
    10437   // initial exec.
   10438   unsigned WrapperKind = X86ISD::Wrapper;
   10439   if (model == TLSModel::LocalExec) {
   10440     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
   10441   } else if (model == TLSModel::InitialExec) {
   10442     if (is64Bit) {
   10443       OperandFlags = X86II::MO_GOTTPOFF;
   10444       WrapperKind = X86ISD::WrapperRIP;
   10445     } else {
   10446       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
   10447     }
   10448   } else {
   10449     llvm_unreachable("Unexpected model");
   10450   }
   10451 
    10452   // emit "addl x@ntpoff,%eax"           (local exec)
    10453   // or   "addl x@indntpoff,%eax"        (initial exec)
    10454   // or   "addl x@gotntpoff(%ebx),%eax"  (initial exec, 32-bit pic)
   10455   SDValue TGA =
   10456       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
   10457                                  GA->getOffset(), OperandFlags);
   10458   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   10459 
   10460   if (model == TLSModel::InitialExec) {
   10461     if (isPIC && !is64Bit) {
   10462       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
   10463                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
   10464                            Offset);
   10465     }
   10466 
   10467     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
   10468                          MachinePointerInfo::getGOT(), false, false, false, 0);
   10469   }
   10470 
   10471   // The address of the thread local variable is the add of the thread
   10472   // pointer with the offset of the variable.
   10473   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
   10474 }
   10475 
   10476 SDValue
   10477 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   10478 
   10479   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   10480   const GlobalValue *GV = GA->getGlobal();
   10481 
   10482   if (Subtarget->isTargetELF()) {
   10483     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
   10484 
   10485     switch (model) {
   10486       case TLSModel::GeneralDynamic:
   10487         if (Subtarget->is64Bit())
   10488           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
   10489         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
   10490       case TLSModel::LocalDynamic:
   10491         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
   10492                                            Subtarget->is64Bit());
   10493       case TLSModel::InitialExec:
   10494       case TLSModel::LocalExec:
   10495         return LowerToTLSExecModel(
   10496             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
   10497             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
   10498     }
   10499     llvm_unreachable("Unknown TLS model.");
   10500   }
   10501 
   10502   if (Subtarget->isTargetDarwin()) {
   10503     // Darwin only has one model of TLS.  Lower to that.
   10504     unsigned char OpFlag = 0;
   10505     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
   10506                            X86ISD::WrapperRIP : X86ISD::Wrapper;
   10507 
   10508     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   10509     // global base reg.
   10510     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
   10511                  !Subtarget->is64Bit();
   10512     if (PIC32)
   10513       OpFlag = X86II::MO_TLVP_PIC_BASE;
   10514     else
   10515       OpFlag = X86II::MO_TLVP;
   10516     SDLoc DL(Op);
   10517     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
   10518                                                 GA->getValueType(0),
   10519                                                 GA->getOffset(), OpFlag);
   10520     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   10521 
   10522     // With PIC32, the address is actually $g + Offset.
   10523     if (PIC32)
   10524       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   10525                            DAG.getNode(X86ISD::GlobalBaseReg,
   10526                                        SDLoc(), getPointerTy()),
   10527                            Offset);
   10528 
    10529     // Lowering the machine ISD node will make sure everything is in the
    10530     // right location.
   10531     SDValue Chain = DAG.getEntryNode();
   10532     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   10533     SDValue Args[] = { Chain, Offset };
   10534     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
   10535 
    10536     // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
   10537     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   10538     MFI->setAdjustsStack(true);
   10539 
   10540     // And our return value (tls address) is in the standard call return value
   10541     // location.
   10542     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   10543     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
   10544                               Chain.getValue(1));
   10545   }
   10546 
   10547   if (Subtarget->isTargetKnownWindowsMSVC() ||
   10548       Subtarget->isTargetWindowsGNU()) {
    10549     // Just use the implicit TLS architecture.
    10550     // We need to generate something similar to:
   10551     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
   10552     //                                  ; from TEB
    10553     //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
   10554     //   mov     rcx, qword [rdx+rcx*8]
   10555     //   mov     eax, .tls$:tlsvar
   10556     //   [rax+rcx] contains the address
   10557     // Windows 64bit: gs:0x58
   10558     // Windows 32bit: fs:__tls_array
   10559 
   10560     SDLoc dl(GA);
   10561     SDValue Chain = DAG.getEntryNode();
   10562 
   10563     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
   10564     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
   10565     // use its literal value of 0x2C.
   10566     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
   10567                                         ? Type::getInt8PtrTy(*DAG.getContext(),
   10568                                                              256)
   10569                                         : Type::getInt32PtrTy(*DAG.getContext(),
   10570                                                               257));
   10571 
   10572     SDValue TlsArray =
   10573         Subtarget->is64Bit()
   10574             ? DAG.getIntPtrConstant(0x58)
   10575             : (Subtarget->isTargetWindowsGNU()
   10576                    ? DAG.getIntPtrConstant(0x2C)
   10577                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
   10578 
   10579     SDValue ThreadPointer =
   10580         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
   10581                     MachinePointerInfo(Ptr), false, false, false, 0);
   10582 
   10583     // Load the _tls_index variable
   10584     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
   10585     if (Subtarget->is64Bit())
   10586       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
   10587                            IDX, MachinePointerInfo(), MVT::i32,
   10588                            false, false, 0);
   10589     else
   10590       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
   10591                         false, false, false, 0);
   10592 
   10593     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
   10594                                     getPointerTy());
   10595     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
   10596 
   10597     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
   10598     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
   10599                       false, false, false, 0);
   10600 
   10601     // Get the offset of start of .tls section
   10602     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   10603                                              GA->getValueType(0),
   10604                                              GA->getOffset(), X86II::MO_SECREL);
   10605     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
   10606 
   10607     // The address of the thread local variable is the add of the thread
   10608     // pointer with the offset of the variable.
   10609     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
   10610   }
   10611 
   10612   llvm_unreachable("TLS not implemented for this target.");
   10613 }
   10614 
   10615 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
   10616 /// and take a 2 x i32 value to shift plus a shift amount.
   10617 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   10618   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   10619   MVT VT = Op.getSimpleValueType();
   10620   unsigned VTBits = VT.getSizeInBits();
   10621   SDLoc dl(Op);
   10622   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   10623   SDValue ShOpLo = Op.getOperand(0);
   10624   SDValue ShOpHi = Op.getOperand(1);
   10625   SDValue ShAmt  = Op.getOperand(2);
   10626   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
   10627   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
   10628   // during isel.
   10629   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   10630                                   DAG.getConstant(VTBits - 1, MVT::i8));
   10631   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
   10632                                      DAG.getConstant(VTBits - 1, MVT::i8))
   10633                        : DAG.getConstant(0, VT);
   10634 
   10635   SDValue Tmp2, Tmp3;
   10636   if (Op.getOpcode() == ISD::SHL_PARTS) {
   10637     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
   10638     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   10639   } else {
   10640     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
   10641     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   10642   }
   10643 
    10644   // If the shift amount is larger than or equal to the width of a part, we
    10645   // can't rely on the results of shld/shrd. Insert a test and select the
    10646   // appropriate values for large shift amounts.
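            // For example (an illustration): a 64-bit SHL by 40 with 32-bit parts
            // has bit 5 of the amount set, so the CMOVs below pick
            // Hi = Lo << (40 & 31) = Lo << 8 and Lo = 0 instead of the shld/shl
            // results.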
   10647   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   10648                                 DAG.getConstant(VTBits, MVT::i8));
   10649   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   10650                              AndNode, DAG.getConstant(0, MVT::i8));
   10651 
   10652   SDValue Hi, Lo;
   10653   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   10654   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
   10655   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
   10656 
   10657   if (Op.getOpcode() == ISD::SHL_PARTS) {
   10658     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
   10659     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   10660   } else {
   10661     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
   10662     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   10663   }
   10664 
   10665   SDValue Ops[2] = { Lo, Hi };
   10666   return DAG.getMergeValues(Ops, dl);
   10667 }
   10668 
   10669 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   10670                                            SelectionDAG &DAG) const {
   10671   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
   10672 
   10673   if (SrcVT.isVector())
   10674     return SDValue();
   10675 
   10676   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
   10677          "Unknown SINT_TO_FP to lower!");
   10678 
   10679   // These are really Legal; return the operand so the caller accepts it as
   10680   // Legal.
   10681   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
   10682     return Op;
   10683   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
   10684       Subtarget->is64Bit()) {
   10685     return Op;
   10686   }
   10687 
   10688   SDLoc dl(Op);
   10689   unsigned Size = SrcVT.getSizeInBits()/8;
   10690   MachineFunction &MF = DAG.getMachineFunction();
   10691   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
   10692   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   10693   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   10694                                StackSlot,
   10695                                MachinePointerInfo::getFixedStack(SSFI),
   10696                                false, false, 0);
   10697   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
   10698 }
   10699 
   10700 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
   10701                                      SDValue StackSlot,
   10702                                      SelectionDAG &DAG) const {
   10703   // Build the FILD
   10704   SDLoc DL(Op);
   10705   SDVTList Tys;
   10706   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
   10707   if (useSSE)
   10708     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
   10709   else
   10710     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
   10711 
   10712   unsigned ByteSize = SrcVT.getSizeInBits()/8;
   10713 
   10714   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
   10715   MachineMemOperand *MMO;
   10716   if (FI) {
   10717     int SSFI = FI->getIndex();
   10718     MMO =
   10719       DAG.getMachineFunction()
   10720       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   10721                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
   10722   } else {
   10723     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
   10724     StackSlot = StackSlot.getOperand(1);
   10725   }
   10726   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   10727   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
   10728                                            X86ISD::FILD, DL,
   10729                                            Tys, Ops, SrcVT, MMO);
   10730 
   10731   if (useSSE) {
   10732     Chain = Result.getValue(1);
   10733     SDValue InFlag = Result.getValue(2);
   10734 
   10735     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
   10736     // shouldn't be necessary except that RFP cannot be live across
   10737     // multiple blocks. When stackifier is fixed, they can be uncoupled.
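             // Roughly, the chain built here is (an illustrative sketch):
             //   fild  <slot>        ; x87 load of the integer (FILD_FLAG)
             //   fstp  <tmp>         ; store the x87 result to a fresh slot
             //   movsd <tmp>, %xmm0  ; reload the value as an SSE register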
   10738     MachineFunction &MF = DAG.getMachineFunction();
   10739     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
   10740     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
   10741     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   10742     Tys = DAG.getVTList(MVT::Other);
   10743     SDValue Ops[] = {
   10744       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
   10745     };
   10746     MachineMemOperand *MMO =
   10747       DAG.getMachineFunction()
   10748       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   10749                             MachineMemOperand::MOStore, SSFISize, SSFISize);
   10750 
   10751     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
   10752                                     Ops, Op.getValueType(), MMO);
   10753     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
   10754                          MachinePointerInfo::getFixedStack(SSFI),
   10755                          false, false, false, 0);
   10756   }
   10757 
   10758   return Result;
   10759 }
   10760 
   10761 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
   10762 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
   10763                                                SelectionDAG &DAG) const {
   10764   // This algorithm is not obvious. Here is what we're trying to output:
   10765   /*
   10766      movq       %rax,  %xmm0
   10767      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
   10768      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
   10769      #ifdef __SSE3__
   10770        haddpd   %xmm0, %xmm0
   10771      #else
   10772        pshufd   $0x4e, %xmm0, %xmm1
   10773        addpd    %xmm1, %xmm0
   10774      #endif
   10775   */
   10776 
   10777   SDLoc dl(Op);
   10778   LLVMContext *Context = DAG.getContext();
   10779 
   10780   // Build some magic constants.
   10781   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
   10782   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   10783   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
   10784 
   10785   SmallVector<Constant*,2> CV1;
   10786   CV1.push_back(
   10787     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   10788                                       APInt(64, 0x4330000000000000ULL))));
   10789   CV1.push_back(
   10790     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   10791                                       APInt(64, 0x4530000000000000ULL))));
   10792   Constant *C1 = ConstantVector::get(CV1);
   10793   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
   10794 
   10795   // Load the 64-bit value into an XMM register.
   10796   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   10797                             Op.getOperand(0));
   10798   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
   10799                               MachinePointerInfo::getConstantPool(),
   10800                               false, false, false, 16);
   10801   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
   10802                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
   10803                               CLod0);
   10804 
   10805   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
   10806                               MachinePointerInfo::getConstantPool(),
   10807                               false, false, false, 16);
   10808   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
   10809   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   10810   SDValue Result;
   10811 
   10812   if (Subtarget->hasSSE3()) {
   10813     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
   10814     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   10815   } else {
   10816     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
   10817     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
   10818                                            S2F, 0x4E, DAG);
   10819     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
   10820                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
   10821                          Sub);
   10822   }
   10823 
   10824   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
   10825                      DAG.getIntPtrConstant(0));
   10826 }
   10827 
   10828 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
   10829 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
   10830                                                SelectionDAG &DAG) const {
   10831   SDLoc dl(Op);
   10832   // FP constant to bias correct the final result.
   10833   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
   10834                                    MVT::f64);
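           // 0x4330000000000000 is the bit pattern of 2^52 as an IEEE double.
           // OR'ing a 32-bit integer x into its low mantissa bits yields exactly
           // the value 2^52 + x, so subtracting the bias below recovers x.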
   10835 
   10836   // Load the 32-bit value into an XMM register.
   10837   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
   10838                              Op.getOperand(0));
   10839 
   10840   // Zero out the upper parts of the register.
   10841   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
   10842 
   10843   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   10844                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
   10845                      DAG.getIntPtrConstant(0));
   10846 
   10847   // Or the load with the bias.
   10848   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
   10849                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
   10850                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   10851                                                    MVT::v2f64, Load)),
   10852                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
   10853                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   10854                                                    MVT::v2f64, Bias)));
   10855   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   10856                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
   10857                    DAG.getIntPtrConstant(0));
   10858 
   10859   // Subtract the bias.
   10860   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
   10861 
   10862   // Handle final rounding.
   10863   EVT DestVT = Op.getValueType();
   10864 
   10865   if (DestVT.bitsLT(MVT::f64))
   10866     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
   10867                        DAG.getIntPtrConstant(0));
   10868   if (DestVT.bitsGT(MVT::f64))
   10869     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
   10870 
   10871   // No rounding needed: the destination is already f64.
   10872   return Sub;
   10873 }
   10874 
   10875 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
   10876                                                SelectionDAG &DAG) const {
   10877   SDValue N0 = Op.getOperand(0);
   10878   MVT SVT = N0.getSimpleValueType();
   10879   SDLoc dl(Op);
   10880 
   10881   assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
   10882           SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
   10883          "Custom UINT_TO_FP is not supported!");
   10884 
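           // Zero-extending to i32 lanes keeps every value below 2^16, so the
           // widened lanes never have the sign bit set and a signed conversion
           // gives the same result an unsigned one would.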
   10885   MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
   10886   return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
   10887                      DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
   10888 }
   10889 
   10890 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   10891                                            SelectionDAG &DAG) const {
   10892   SDValue N0 = Op.getOperand(0);
   10893   SDLoc dl(Op);
   10894 
   10895   if (Op.getValueType().isVector())
   10896     return lowerUINT_TO_FP_vec(Op, DAG);
   10897 
   10898   // Since UINT_TO_FP is treated as Legal (it's marked Custom), the DAG
   10899   // combiner won't optimize it to a SINT_TO_FP when the sign bit is known
   10900   // zero, so perform the optimization here.
   10901   if (DAG.SignBitIsZero(N0))
   10902     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
   10903 
   10904   MVT SrcVT = N0.getSimpleValueType();
   10905   MVT DstVT = Op.getSimpleValueType();
   10906   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
   10907     return LowerUINT_TO_FP_i64(Op, DAG);
   10908   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
   10909     return LowerUINT_TO_FP_i32(Op, DAG);
   10910   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
   10911     return SDValue();
   10912 
   10913   // Make a 64-bit buffer, and use it to build an FILD.
   10914   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
   10915   if (SrcVT == MVT::i32) {
   10916     SDValue WordOff = DAG.getConstant(4, getPointerTy());
   10917     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
   10918                                      getPointerTy(), StackSlot, WordOff);
   10919     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   10920                                   StackSlot, MachinePointerInfo(),
   10921                                   false, false, 0);
   10922     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
   10923                                   OffsetSlot, MachinePointerInfo(),
   10924                                   false, false, 0);
   10925     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
   10926     return Fild;
   10927   }
   10928 
   10929   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
   10930   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   10931                                StackSlot, MachinePointerInfo(),
   10932                                false, false, 0);
   10933   // For i64 source, we need to add the appropriate power of 2 if the input
   10934   // was negative.  This is the same as the optimization in
   10935   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
   10936   // we must be careful to do the computation in x87 extended precision, not
   10937   // in SSE. (The generic code can't know it's OK to do this, or how to.)
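           // (0x5F800000 is 2^64 as an IEEE single. FILD reads the buffer as a
           // signed i64, so inputs with the top bit set come out 2^64 too small;
           // the fudge factor re-adds that amount.)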
   10938   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
   10939   MachineMemOperand *MMO =
   10940     DAG.getMachineFunction()
   10941     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   10942                           MachineMemOperand::MOLoad, 8, 8);
   10943 
   10944   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
   10945   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
   10946   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
   10947                                          MVT::i64, MMO);
   10948 
   10949   APInt FF(32, 0x5F800000ULL);
   10950 
   10951   // Check whether the sign bit is set.
   10952   SDValue SignSet = DAG.getSetCC(dl,
   10953                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
   10954                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
   10955                                  ISD::SETLT);
   10956 
   10957   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
   10958   SDValue FudgePtr = DAG.getConstantPool(
   10959                          ConstantInt::get(*DAG.getContext(), FF.zext(64)),
   10960                          getPointerTy());
   10961 
   10962   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   10963   SDValue Zero = DAG.getIntPtrConstant(0);
   10964   SDValue Four = DAG.getIntPtrConstant(4);
   10965   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
   10966                                Zero, Four);
   10967   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
   10968 
   10969   // Load the value out, extending it from f32 to f80.
   10970   // FIXME: Avoid the extend by constructing the right constant pool?
   10971   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
   10972                                  FudgePtr, MachinePointerInfo::getConstantPool(),
   10973                                  MVT::f32, false, false, 4);
   10974   // Extend everything to 80 bits to force it to be done on x87.
   10975   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   10976   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
   10977 }
   10978 
   10979 std::pair<SDValue,SDValue>
   10980 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   10981                                     bool IsSigned, bool IsReplace) const {
   10982   SDLoc DL(Op);
   10983 
   10984   EVT DstTy = Op.getValueType();
   10985 
   10986   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
   10987     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
   10988     DstTy = MVT::i64;
   10989   }
   10990 
   10991   assert(DstTy.getSimpleVT() <= MVT::i64 &&
   10992          DstTy.getSimpleVT() >= MVT::i16 &&
   10993          "Unknown FP_TO_INT to lower!");
   10994 
   10995   // These are really Legal.
   10996   if (DstTy == MVT::i32 &&
   10997       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   10998     return std::make_pair(SDValue(), SDValue());
   10999   if (Subtarget->is64Bit() &&
   11000       DstTy == MVT::i64 &&
   11001       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   11002     return std::make_pair(SDValue(), SDValue());
   11003 
   11004   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
   11005   // stack slot, or into the FTOL runtime function.
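           // (The FTOL runtime call returns the truncated result in EDX:EAX,
           // which is why the WIN_FTOL path below copies those two registers.)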
   11006   MachineFunction &MF = DAG.getMachineFunction();
   11007   unsigned MemSize = DstTy.getSizeInBits()/8;
   11008   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   11009   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   11010 
   11011   unsigned Opc;
   11012   if (!IsSigned && isIntegerTypeFTOL(DstTy))
   11013     Opc = X86ISD::WIN_FTOL;
   11014   else
   11015     switch (DstTy.getSimpleVT().SimpleTy) {
   11016     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
   11017     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
   11018     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
   11019     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
   11020     }
   11021 
   11022   SDValue Chain = DAG.getEntryNode();
   11023   SDValue Value = Op.getOperand(0);
   11024   EVT TheVT = Op.getOperand(0).getValueType();
   11025   // FIXME: This causes a redundant load/store if the SSE-class value is
   11026   // already in memory, such as if it is on the call stack.
   11027   if (isScalarFPTypeInSSEReg(TheVT)) {
   11028     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
   11029     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
   11030                          MachinePointerInfo::getFixedStack(SSFI),
   11031                          false, false, 0);
   11032     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
   11033     SDValue Ops[] = {
   11034       Chain, StackSlot, DAG.getValueType(TheVT)
   11035     };
   11036 
   11037     MachineMemOperand *MMO =
   11038       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   11039                               MachineMemOperand::MOLoad, MemSize, MemSize);
   11040     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
   11041     Chain = Value.getValue(1);
   11042     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   11043     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   11044   }
   11045 
   11046   MachineMemOperand *MMO =
   11047     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   11048                             MachineMemOperand::MOStore, MemSize, MemSize);
   11049 
   11050   if (Opc != X86ISD::WIN_FTOL) {
   11051     // Build the FP_TO_INT*_IN_MEM
   11052     SDValue Ops[] = { Chain, Value, StackSlot };
   11053     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   11054                                            Ops, DstTy, MMO);
   11055     return std::make_pair(FIST, StackSlot);
   11056   } else {
   11057     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
   11058                                DAG.getVTList(MVT::Other, MVT::Glue),
   11059                                Chain, Value);
   11060     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
   11061                                      MVT::i32, ftol.getValue(1));
   11062     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
   11063                                      MVT::i32, eax.getValue(2));
   11064     SDValue Ops[] = { eax, edx };
   11065     SDValue pair = IsReplace
   11066       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
   11067       : DAG.getMergeValues(Ops, DL);
   11068     return std::make_pair(pair, SDValue());
   11069   }
   11070 }
   11071 
   11072 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   11073                               const X86Subtarget *Subtarget) {
   11074   MVT VT = Op->getSimpleValueType(0);
   11075   SDValue In = Op->getOperand(0);
   11076   MVT InVT = In.getSimpleValueType();
   11077   SDLoc dl(Op);
   11078 
   11079   // Optimize vectors in AVX mode:
   11080   //
   11081   //   v8i16 -> v8i32
   11082   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
   11083   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
   11084   //   Concat upper and lower parts.
   11085   //
   11086   //   v4i32 -> v4i64
   11087   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
   11088   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
   11089   //   Concat upper and lower parts.
   11090   //
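           //   The zero-extend case works because unpacking the source with a
           //   zero vector interleaves zeros into the upper half of each widened
           //   lane, e.g. unpacklo(v8i16 <x0..x7>, 0) = <x0,0,x1,0,...>, which
           //   read as v4i32 is exactly zext of the low four elements.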
   11091 
   11092   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
   11093       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
   11094       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
   11095     return SDValue();
   11096 
   11097   if (Subtarget->hasInt256())
   11098     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
   11099 
   11100   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
   11101   SDValue Undef = DAG.getUNDEF(InVT);
   11102   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
   11103   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   11104   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   11105 
   11106   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
   11107                              VT.getVectorNumElements()/2);
   11108 
   11109   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
   11110   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
   11111 
   11112   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   11113 }
   11114 
   11115 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   11116                                        SelectionDAG &DAG) {
   11117   MVT VT = Op->getSimpleValueType(0);
   11118   SDValue In = Op->getOperand(0);
   11119   MVT InVT = In.getSimpleValueType();
   11120   SDLoc DL(Op);
   11121   unsigned NumElts = VT.getVectorNumElements();
   11122   if (NumElts != 8 && NumElts != 16)
   11123     return SDValue();
   11124 
   11125   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
   11126     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
   11127 
   11128   EVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
   11129   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   11130   // Now only the mask-extension case remains.
   11131   assert(InVT.getVectorElementType() == MVT::i1);
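           // Broadcast the constant 1 under the i1 mask: lanes whose mask bit is
           // set become 1 and the rest become 0, which is exactly the
           // zero-extension of the mask into integer lanes.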
   11132   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
   11133   const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
   11134   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
   11135   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   11136   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
   11137                            MachinePointerInfo::getConstantPool(),
   11138                            false, false, false, Alignment);
   11139 
   11140   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
   11141   if (VT.is512BitVector())
   11142     return Brcst;
   11143   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
   11144 }
   11145 
   11146 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   11147                                SelectionDAG &DAG) {
   11148   if (Subtarget->hasFp256()) {
   11149     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
   11150     if (Res.getNode())
   11151       return Res;
   11152   }
   11153 
   11154   return SDValue();
   11155 }
   11156 
   11157 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   11158                                 SelectionDAG &DAG) {
   11159   SDLoc DL(Op);
   11160   MVT VT = Op.getSimpleValueType();
   11161   SDValue In = Op.getOperand(0);
   11162   MVT SVT = In.getSimpleValueType();
   11163 
   11164   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
   11165     return LowerZERO_EXTEND_AVX512(Op, DAG);
   11166 
   11167   if (Subtarget->hasFp256()) {
   11168     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
   11169     if (Res.getNode())
   11170       return Res;
   11171   }
   11172 
   11173   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
   11174          VT.getVectorNumElements() != SVT.getVectorNumElements());
   11175   return SDValue();
   11176 }
   11177 
   11178 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   11179   SDLoc DL(Op);
   11180   MVT VT = Op.getSimpleValueType();
   11181   SDValue In = Op.getOperand(0);
   11182   MVT InVT = In.getSimpleValueType();
   11183 
   11184   if (VT == MVT::i1) {
   11185     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
   11186            "Invalid scalar TRUNCATE operation");
   11187     if (InVT == MVT::i32)
   11188       return SDValue();
   11189     if (InVT.getSizeInBits() == 64)
   11190       In = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
   11191     else if (InVT.getSizeInBits() < 32)
   11192       In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
   11193     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
   11194   }
   11195   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
   11196          "Invalid TRUNCATE operation");
   11197 
   11198   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
   11199     if (VT.getVectorElementType().getSizeInBits() >= 8)
   11200       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
   11201 
   11202     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
   11203     unsigned NumElts = InVT.getVectorNumElements();
   11204     assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
   11205     if (InVT.getSizeInBits() < 512) {
   11206       MVT ExtVT = (NumElts == 16) ? MVT::v16i32 : MVT::v8i64;
   11207       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
   11208       InVT = ExtVT;
   11209     }
   11210 
   11211     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
   11212     const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
   11213     SDValue CP = DAG.getConstantPool(C, getPointerTy());
   11214     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   11215     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
   11216                            MachinePointerInfo::getConstantPool(),
   11217                            false, false, false, Alignment);
   11218     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
   11219     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
   11220     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
   11221   }
   11222 
   11223   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
   11224     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
   11225     if (Subtarget->hasInt256()) {
   11226       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
   11227       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
   11228       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
   11229                                 ShufMask);
   11230       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
   11231                          DAG.getIntPtrConstant(0));
   11232     }
   11233 
   11234     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   11235                                DAG.getIntPtrConstant(0));
   11236     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   11237                                DAG.getIntPtrConstant(2));
   11238     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
   11239     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
   11240     static const int ShufMask[] = {0, 2, 4, 6};
   11241     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
   11242   }
   11243 
   11244   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
   11245     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
   11246     if (Subtarget->hasInt256()) {
   11247       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
   11248 
   11249       SmallVector<SDValue,32> pshufbMask;
   11250       for (unsigned i = 0; i < 2; ++i) {
   11251         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
   11252         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
   11253         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
   11254         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
   11255         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
   11256         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
   11257         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
   11258         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
   11259         for (unsigned j = 0; j < 8; ++j)
   11260           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   11261       }
   11262       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
   11263       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
   11264       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
   11265 
   11266       static const int ShufMask[] = {0,  2,  -1,  -1};
   11267       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
   11268                                 &ShufMask[0]);
   11269       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   11270                        DAG.getIntPtrConstant(0));
   11271       return DAG.getNode(ISD::BITCAST, DL, VT, In);
   11272     }
   11273 
   11274     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   11275                                DAG.getIntPtrConstant(0));
   11276 
   11277     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   11278                                DAG.getIntPtrConstant(4));
   11279 
   11280     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
   11281     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
   11282 
   11283     // The PSHUFB mask:
   11284     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
   11285                                    -1, -1, -1, -1, -1, -1, -1, -1};
   11286 
   11287     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
   11288     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
   11289     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
   11290 
   11291     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
   11292     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
   11293 
   11294     // The MOVLHPS Mask:
   11295     static const int ShufMask2[] = {0, 1, 4, 5};
   11296     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
   11297     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
   11298   }
   11299 
   11300   // Handle truncation of V256 to V128 using shuffles.
   11301   if (!VT.is128BitVector() || !InVT.is256BitVector())
   11302     return SDValue();
   11303 
   11304   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
   11305 
   11306   unsigned NumElems = VT.getVectorNumElements();
   11307   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
   11308 
   11309   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
   11310   // Prepare truncation shuffle mask
   11311   for (unsigned i = 0; i != NumElems; ++i)
   11312     MaskVec[i] = i * 2;
   11313   SDValue V = DAG.getVectorShuffle(NVT, DL,
   11314                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
   11315                                    DAG.getUNDEF(NVT), &MaskVec[0]);
   11316   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
   11317                      DAG.getIntPtrConstant(0));
   11318 }
   11319 
   11320 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
   11321                                            SelectionDAG &DAG) const {
   11322   assert(!Op.getSimpleValueType().isVector());
   11323 
   11324   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   11325     /*IsSigned=*/ true, /*IsReplace=*/ false);
   11326   SDValue FIST = Vals.first, StackSlot = Vals.second;
   11327   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   11328   if (!FIST.getNode()) return Op;
   11329 
   11330   if (StackSlot.getNode())
   11331     // Load the result.
   11332     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
   11333                        FIST, StackSlot, MachinePointerInfo(),
   11334                        false, false, false, 0);
   11335 
   11336   // The node is the result.
   11337   return FIST;
   11338 }
   11339 
   11340 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
   11341                                            SelectionDAG &DAG) const {
   11342   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   11343     /*IsSigned=*/ false, /*IsReplace=*/ false);
   11344   SDValue FIST = Vals.first, StackSlot = Vals.second;
   11345   assert(FIST.getNode() && "Unexpected failure");
   11346 
   11347   if (StackSlot.getNode())
   11348     // Load the result.
   11349     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
   11350                        FIST, StackSlot, MachinePointerInfo(),
   11351                        false, false, false, 0);
   11352 
   11353   // The node is the result.
   11354   return FIST;
   11355 }
   11356 
   11357 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
   11358   SDLoc DL(Op);
   11359   MVT VT = Op.getSimpleValueType();
   11360   SDValue In = Op.getOperand(0);
   11361   MVT SVT = In.getSimpleValueType();
   11362 
   11363   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
   11364 
   11365   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
   11366                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
   11367                                  In, DAG.getUNDEF(SVT)));
   11368 }
   11369 
   11370 static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
   11371   LLVMContext *Context = DAG.getContext();
   11372   SDLoc dl(Op);
   11373   MVT VT = Op.getSimpleValueType();
   11374   MVT EltVT = VT;
   11375   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   11376   if (VT.isVector()) {
   11377     EltVT = VT.getVectorElementType();
   11378     NumElts = VT.getVectorNumElements();
   11379   }
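           // fabs is a bitwise operation: AND away the sign bit. Build a splat
           // of ~(1 << 63) for f64 lanes (~(1 << 31) for f32) to mask with.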
   11380   Constant *C;
   11381   if (EltVT == MVT::f64)
   11382     C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   11383                                           APInt(64, ~(1ULL << 63))));
   11384   else
   11385     C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
   11386                                           APInt(32, ~(1U << 31))));
   11387   C = ConstantVector::getSplat(NumElts, C);
   11388   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   11389   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
   11390   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   11391   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   11392                              MachinePointerInfo::getConstantPool(),
   11393                              false, false, false, Alignment);
   11394   if (VT.isVector()) {
   11395     MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
   11396     return DAG.getNode(ISD::BITCAST, dl, VT,
   11397                        DAG.getNode(ISD::AND, dl, ANDVT,
   11398                                    DAG.getNode(ISD::BITCAST, dl, ANDVT,
   11399                                                Op.getOperand(0)),
   11400                                    DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
   11401   }
   11402   return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
   11403 }
   11404 
   11405 static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) {
   11406   LLVMContext *Context = DAG.getContext();
   11407   SDLoc dl(Op);
   11408   MVT VT = Op.getSimpleValueType();
   11409   MVT EltVT = VT;
   11410   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   11411   if (VT.isVector()) {
   11412     EltVT = VT.getVectorElementType();
   11413     NumElts = VT.getVectorNumElements();
   11414   }
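           // fneg just flips the sign bit: XOR with a splat of 1 << 63 for f64
           // lanes (1 << 31 for f32).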
   11415   Constant *C;
   11416   if (EltVT == MVT::f64)
   11417     C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   11418                                           APInt(64, 1ULL << 63)));
   11419   else
   11420     C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
   11421                                           APInt(32, 1U << 31)));
   11422   C = ConstantVector::getSplat(NumElts, C);
   11423   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   11424   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
   11425   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   11426   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   11427                              MachinePointerInfo::getConstantPool(),
   11428                              false, false, false, Alignment);
   11429   if (VT.isVector()) {
   11430     MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
   11431     return DAG.getNode(ISD::BITCAST, dl, VT,
   11432                        DAG.getNode(ISD::XOR, dl, XORVT,
   11433                                    DAG.getNode(ISD::BITCAST, dl, XORVT,
   11434                                                Op.getOperand(0)),
   11435                                    DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
   11436   }
   11437 
   11438   return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
   11439 }
   11440 
   11441 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   11442   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   11443   LLVMContext *Context = DAG.getContext();
   11444   SDValue Op0 = Op.getOperand(0);
   11445   SDValue Op1 = Op.getOperand(1);
   11446   SDLoc dl(Op);
   11447   MVT VT = Op.getSimpleValueType();
   11448   MVT SrcVT = Op1.getSimpleValueType();
   11449 
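           // Bitwise recipe: copysign(x, y) = (x & ~signbit) | (y & signbit).
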
   11450   // If second operand is smaller, extend it first.
   11451   if (SrcVT.bitsLT(VT)) {
   11452     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
   11453     SrcVT = VT;
   11454   }
   11455   // And if it is bigger, shrink it first.
   11456   if (SrcVT.bitsGT(VT)) {
   11457     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
   11458     SrcVT = VT;
   11459   }
   11460 
   11461   // At this point the operands and the result should have the same
   11462   // type, and that won't be f80 since that is not custom lowered.
   11463 
   11464   // First get the sign bit of second operand.
   11465   SmallVector<Constant*,4> CV;
   11466   if (SrcVT == MVT::f64) {
   11467     const fltSemantics &Sem = APFloat::IEEEdouble;
   11468     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
   11469     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
   11470   } else {
   11471     const fltSemantics &Sem = APFloat::IEEEsingle;
   11472     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
   11473     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
   11474     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
   11475     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
   11476   }
   11477   Constant *C = ConstantVector::get(CV);
   11478   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
   11479   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
   11480                               MachinePointerInfo::getConstantPool(),
   11481                               false, false, false, 16);
   11482   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
   11483 
   11484   // Shift sign bit right or left if the two operands have different types.
   11485   if (SrcVT.bitsGT(VT)) {
   11486     // Op0 is MVT::f32, Op1 is MVT::f64.
   11487     SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
   11488     SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
   11489                           DAG.getConstant(32, MVT::i32));
   11490     SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
   11491     SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
   11492                           DAG.getIntPtrConstant(0));
   11493   }
   11494 
   11495   // Clear first operand sign bit.
   11496   CV.clear();
   11497   if (VT == MVT::f64) {
   11498     const fltSemantics &Sem = APFloat::IEEEdouble;
   11499     CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
   11500                                                    APInt(64, ~(1ULL << 63)))));
   11501     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
   11502   } else {
   11503     const fltSemantics &Sem = APFloat::IEEEsingle;
   11504     CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
   11505                                                    APInt(32, ~(1U << 31)))));
   11506     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
   11507     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
   11508     CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
   11509   }
   11510   C = ConstantVector::get(CV);
   11511   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
   11512   SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   11513                               MachinePointerInfo::getConstantPool(),
   11514                               false, false, false, 16);
   11515   SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
   11516 
   11517   // Or the value with the sign bit.
   11518   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
   11519 }
   11520 
   11521 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   11522   SDValue N0 = Op.getOperand(0);
   11523   SDLoc dl(Op);
   11524   MVT VT = Op.getSimpleValueType();
   11525 
   11526   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
   11527   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
   11528                                   DAG.getConstant(1, VT));
   11529   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
   11530 }
   11531 
   11532 // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
   11533 //
   11534 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
   11535                                       SelectionDAG &DAG) {
   11536   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
   11537 
   11538   if (!Subtarget->hasSSE41())
   11539     return SDValue();
   11540 
   11541   if (!Op->hasOneUse())
   11542     return SDValue();
   11543 
   11544   SDNode *N = Op.getNode();
   11545   SDLoc DL(N);
   11546 
   11547   SmallVector<SDValue, 8> Opnds;
   11548   DenseMap<SDValue, unsigned> VecInMap;
   11549   SmallVector<SDValue, 8> VecIns;
   11550   EVT VT = MVT::Other;
   11551 
   11552   // Recognize a special case where a vector is cast into a wide integer to
   11553   // test all 0s.
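           // For example, an OR-reduction such as
           //   (or (extractelt %v, 0), (or (extractelt %v, 1), ...))
           // that covers every element of %v can be answered with a single
           // PTEST of %v against itself.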
   11554   Opnds.push_back(N->getOperand(0));
   11555   Opnds.push_back(N->getOperand(1));
   11556 
   11557   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
   11558     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
   11559     // BFS traverse all OR'd operands.
   11560     if (I->getOpcode() == ISD::OR) {
   11561       Opnds.push_back(I->getOperand(0));
   11562       Opnds.push_back(I->getOperand(1));
   11563       // Re-evaluate the number of nodes to be traversed.
   11564       e += 2; // 2 more nodes (LHS and RHS) are pushed.
   11565       continue;
   11566     }
   11567 
   11568     // Quit if this is not an EXTRACT_VECTOR_ELT.
   11569     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   11570       return SDValue();
   11571 
   11572     // Quit if the index is not a constant.
   11573     SDValue Idx = I->getOperand(1);
   11574     if (!isa<ConstantSDNode>(Idx))
   11575       return SDValue();
   11576 
   11577     SDValue ExtractedFromVec = I->getOperand(0);
   11578     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
   11579     if (M == VecInMap.end()) {
   11580       VT = ExtractedFromVec.getValueType();
   11581       // Quit if not 128/256-bit vector.
   11582       if (!VT.is128BitVector() && !VT.is256BitVector())
   11583         return SDValue();
   11584       // Quit if not the same type.
   11585       if (VecInMap.begin() != VecInMap.end() &&
   11586           VT != VecInMap.begin()->first.getValueType())
   11587         return SDValue();
   11588       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
   11589       VecIns.push_back(ExtractedFromVec);
   11590     }
   11591     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
   11592   }
   11593 
   11594   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   11595          "Not extracted from 128-/256-bit vector.");
   11596 
   11597   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
   11598 
   11599   for (DenseMap<SDValue, unsigned>::const_iterator
   11600         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
   11601     // Quit if not all elements are used.
   11602     if (I->second != FullMask)
   11603       return SDValue();
   11604   }
   11605 
   11606   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
   11607 
   11608   // Cast all vectors into TestVT for PTEST.
   11609   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
   11610     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
   11611 
   11612   // If more than one full vector is evaluated, OR them together before the PTEST.
   11613   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
   11614     // Each iteration will OR 2 nodes and append the result until there is only
   11615     // 1 node left, i.e. the final OR'd value of all vectors.
   11616     SDValue LHS = VecIns[Slot];
   11617     SDValue RHS = VecIns[Slot + 1];
   11618     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
   11619   }
   11620 
   11621   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
   11622                      VecIns.back(), VecIns.back());
   11623 }
   11624 
   11625 /// \brief return true if \c Op has a use that doesn't just read flags.
   11626 static bool hasNonFlagsUse(SDValue Op) {
   11627   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
   11628        ++UI) {
   11629     SDNode *User = *UI;
   11630     unsigned UOpNo = UI.getOperandNo();
   11631     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
   11632       // Look past the truncate.
   11633       UOpNo = User->use_begin().getOperandNo();
   11634       User = *User->use_begin();
   11635     }
   11636 
   11637     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
   11638         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
   11639       return true;
   11640   }
   11641   return false;
   11642 }
   11643 
   11644 /// Emit nodes that will be selected as "test Op0,Op0", or something
   11645 /// equivalent.
   11646 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
   11647                                     SelectionDAG &DAG) const {
   11648   if (Op.getValueType() == MVT::i1)
   11649     // KORTEST instruction should be selected
   11650     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   11651                        DAG.getConstant(0, Op.getValueType()));
   11652 
   11653   // CF and OF aren't always set the way we want. Determine which
   11654   // of these we need.
   11655   bool NeedCF = false;
   11656   bool NeedOF = false;
   11657   switch (X86CC) {
   11658   default: break;
   11659   case X86::COND_A: case X86::COND_AE:
   11660   case X86::COND_B: case X86::COND_BE:
   11661     NeedCF = true;
   11662     break;
   11663   case X86::COND_G: case X86::COND_GE:
   11664   case X86::COND_L: case X86::COND_LE:
   11665   case X86::COND_O: case X86::COND_NO: {
   11666     // Check if we really need to set the
   11667     // Overflow flag. If NoSignedWrap is present
   11668     // that is not actually needed.
   11669     switch (Op->getOpcode()) {
   11670     case ISD::ADD:
   11671     case ISD::SUB:
   11672     case ISD::MUL:
   11673     case ISD::SHL: {
   11674       const BinaryWithFlagsSDNode *BinNode =
   11675           cast<BinaryWithFlagsSDNode>(Op.getNode());
   11676       if (BinNode->hasNoSignedWrap())
   11677         break;
   11678     }
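             // FALL THROUGH: without NoSignedWrap, assume OF is needed.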
   11679     default:
   11680       NeedOF = true;
   11681       break;
   11682     }
   11683     break;
   11684   }
   11685   }
   11686   // See if we can use the EFLAGS value from the operand instead of
   11687   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   11688   // we prove that the arithmetic won't overflow, we can't use OF or CF.
   11689   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
   11690     // Emit a CMP with 0, which is the TEST pattern.
   11691     //if (Op.getValueType() == MVT::i1)
   11692     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
   11693     //                     DAG.getConstant(0, MVT::i1));
   11694     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   11695                        DAG.getConstant(0, Op.getValueType()));
   11696   }
   11697   unsigned Opcode = 0;
   11698   unsigned NumOperands = 0;
   11699 
   11700   // Truncate operations may prevent the merge of the SETCC instruction
   11701   // and the arithmetic instruction before it. Attempt to truncate the operands
   11702   // of the arithmetic instruction and use a reduced bit-width instruction.
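           // For example, (cmp (trunc (add i64 %a, %b)), 0) can become a 32-bit
           // EFLAGS-setting X86ISD::ADD of (trunc %a) and (trunc %b), letting
           // the add and the compare merge into one instruction.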
   11703   bool NeedTruncation = false;
   11704   SDValue ArithOp = Op;
   11705   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
   11706     SDValue Arith = Op->getOperand(0);
   11707     // Both the trunc and the arithmetic op need to have one user each.
   11708     if (Arith->hasOneUse())
   11709       switch (Arith.getOpcode()) {
   11710         default: break;
   11711         case ISD::ADD:
   11712         case ISD::SUB:
   11713         case ISD::AND:
   11714         case ISD::OR:
   11715         case ISD::XOR: {
   11716           NeedTruncation = true;
   11717           ArithOp = Arith;
   11718         }
   11719       }
   11720   }
   11721 
   11722   // NOTE: In the code below we use ArithOp to hold the arithmetic operation,
   11723   // which may sit behind a truncate. We use the variable 'Op', the possibly
   11724   // truncated value, when we check for possible users.
   11725   switch (ArithOp.getOpcode()) {
   11726   case ISD::ADD:
   11727     // Due to an isel shortcoming, be conservative if this add is likely to be
   11728     // selected as part of a load-modify-store instruction. When the root node
   11729     // in a match is a store, isel doesn't know how to remap non-chain non-flag
   11730     // uses of other nodes in the match, such as the ADD in this case. This
   11731     // leads to the ADD being left around and reselected, with the result being
   11732     // two adds in the output.  Alas, even if none of our users are stores, that
   11733     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
   11734     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
   11735     // climbing the DAG back to the root, and it doesn't seem to be worth the
   11736     // effort.
   11737     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   11738          UE = Op.getNode()->use_end(); UI != UE; ++UI)
   11739       if (UI->getOpcode() != ISD::CopyToReg &&
   11740           UI->getOpcode() != ISD::SETCC &&
   11741           UI->getOpcode() != ISD::STORE)
   11742         goto default_case;
   11743 
   11744     if (ConstantSDNode *C =
   11745         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
   11746       // An add of one will be selected as an INC.
   11747       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
   11748         Opcode = X86ISD::INC;
   11749         NumOperands = 1;
   11750         break;
   11751       }
   11752 
   11753       // An add of negative one (subtract of one) will be selected as a DEC.
   11754       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
   11755         Opcode = X86ISD::DEC;
   11756         NumOperands = 1;
   11757         break;
   11758       }
   11759     }
   11760 
   11761     // Otherwise use a regular EFLAGS-setting add.
   11762     Opcode = X86ISD::ADD;
   11763     NumOperands = 2;
   11764     break;
   11765   case ISD::SHL:
   11766   case ISD::SRL:
   11767     // If we have a constant logical shift that's only used in a comparison
   11768     // against zero turn it into an equivalent AND. This allows turning it into
   11769     // a TEST instruction later.
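             // For example, (srl %x, 3) == 0 is equivalent to
             // (%x & 0xFFFFFFF8) == 0, which can be selected as a single TEST.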
   11770     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
   11771         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
   11772       EVT VT = Op.getValueType();
   11773       unsigned BitWidth = VT.getSizeInBits();
   11774       unsigned ShAmt = Op->getConstantOperandVal(1);
   11775       if (ShAmt >= BitWidth) // Avoid undefined shifts.
   11776         break;
   11777       APInt Mask = ArithOp.getOpcode() == ISD::SRL
   11778                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
   11779                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
   11780       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
   11781         break;
   11782       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
   11783                                 DAG.getConstant(Mask, VT));
   11784       DAG.ReplaceAllUsesWith(Op, New);
   11785       Op = New;
   11786     }
   11787     break;
   11788 
   11789   case ISD::AND:
   11790     // If the primary result of the AND isn't used, don't bother using
   11791     // X86ISD::AND, because a TEST instruction will be better.
   11792     if (!hasNonFlagsUse(Op))
   11793       break;
   11794     // FALL THROUGH
   11795   case ISD::SUB:
   11796   case ISD::OR:
   11797   case ISD::XOR:
   11798     // Due to the ISEL shortcoming noted above, be conservative if this op is
   11799     // likely to be selected as part of a load-modify-store instruction.
   11800     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   11801            UE = Op.getNode()->use_end(); UI != UE; ++UI)
   11802       if (UI->getOpcode() == ISD::STORE)
   11803         goto default_case;
   11804 
   11805     // Otherwise use a regular EFLAGS-setting instruction.
   11806     switch (ArithOp.getOpcode()) {
   11807     default: llvm_unreachable("unexpected operator!");
   11808     case ISD::SUB: Opcode = X86ISD::SUB; break;
   11809     case ISD::XOR: Opcode = X86ISD::XOR; break;
   11810     case ISD::AND: Opcode = X86ISD::AND; break;
   11811     case ISD::OR: {
   11812       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
   11813         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
   11814         if (EFLAGS.getNode())
   11815           return EFLAGS;
   11816       }
   11817       Opcode = X86ISD::OR;
   11818       break;
   11819     }
   11820     }
   11821 
   11822     NumOperands = 2;
   11823     break;
   11824   case X86ISD::ADD:
   11825   case X86ISD::SUB:
   11826   case X86ISD::INC:
   11827   case X86ISD::DEC:
   11828   case X86ISD::OR:
   11829   case X86ISD::XOR:
   11830   case X86ISD::AND:
   11831     return SDValue(Op.getNode(), 1);
   11832   default:
   11833   default_case:
   11834     break;
   11835   }
   11836 
   11837   // If we found that truncation is beneficial, perform the truncation and
   11838   // update 'Op'.
   11839   if (NeedTruncation) {
   11840     EVT VT = Op.getValueType();
   11841     SDValue WideVal = Op->getOperand(0);
   11842     EVT WideVT = WideVal.getValueType();
   11843     unsigned ConvertedOp = 0;
   11844     // Use a target machine opcode to prevent further DAGCombine
   11845     // optimizations that may separate the arithmetic operations
   11846     // from the setcc node.
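              // For example (illustrative): for (trunc i32 (add i64 X, Y)) with a
              // legal 32-bit ADD, emit (X86ISD::ADD (trunc X), (trunc Y)) so the
              // flag-producing add cannot be separated from the setcc.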
   11847     switch (WideVal.getOpcode()) {
   11848       default: break;
   11849       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
   11850       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
   11851       case ISD::AND: ConvertedOp = X86ISD::AND; break;
   11852       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
   11853       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
   11854     }
   11855 
   11856     if (ConvertedOp) {
   11857       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   11858       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
   11859         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
   11860         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
   11861         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
   11862       }
   11863     }
   11864   }
   11865 
   11866   if (Opcode == 0)
   11867     // Emit a CMP with 0, which is the TEST pattern.
   11868     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   11869                        DAG.getConstant(0, Op.getValueType()));
   11870 
   11871   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   11872   SmallVector<SDValue, 4> Ops;
   11873   for (unsigned i = 0; i != NumOperands; ++i)
   11874     Ops.push_back(Op.getOperand(i));
   11875 
   11876   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
   11877   DAG.ReplaceAllUsesWith(Op, New);
   11878   return SDValue(New.getNode(), 1);
   11879 }
   11880 
   11881 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
   11882 /// equivalent.
   11883 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   11884                                    SDLoc dl, SelectionDAG &DAG) const {
   11885   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
   11886     if (C->getAPIntValue() == 0)
   11887       return EmitTest(Op0, X86CC, dl, DAG);
   11888 
    11889     if (Op0.getValueType() == MVT::i1)
    11890       llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
   11891   }
   11892 
   11893   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
   11894        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    11895     // Do the comparison at i32 if it's smaller, except on Atom.
   11896     // This avoids subregister aliasing issues. Keep the smaller reference
   11897     // if we're optimizing for size, however, as that'll allow better folding
   11898     // of memory operations.
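              // For example (illustrative): a signed i8 compare of X and Y becomes
              //   (X86ISD::SUB (sign_extend X to i32), (sign_extend Y to i32))
              // with the flags taken from the 32-bit SUB.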
   11899     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
   11900         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
   11901              AttributeSet::FunctionIndex, Attribute::MinSize) &&
   11902         !Subtarget->isAtom()) {
   11903       unsigned ExtendOp =
   11904           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
   11905       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
   11906       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
   11907     }
   11908     // Use SUB instead of CMP to enable CSE between SUB and CMP.
   11909     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
   11910     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
   11911                               Op0, Op1);
   11912     return SDValue(Sub.getNode(), 1);
   11913   }
   11914   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
   11915 }
   11916 
   11917 /// Convert a comparison if required by the subtarget.
   11918 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
   11919                                                  SelectionDAG &DAG) const {
   11920   // If the subtarget does not support the FUCOMI instruction, floating-point
   11921   // comparisons have to be converted.
   11922   if (Subtarget->hasCMov() ||
   11923       Cmp.getOpcode() != X86ISD::CMP ||
   11924       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
   11925       !Cmp.getOperand(1).getValueType().isFloatingPoint())
   11926     return Cmp;
   11927 
   11928   // The instruction selector will select an FUCOM instruction instead of
   11929   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
   11930   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
   11931   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
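            // which corresponds roughly to this instruction sequence (illustrative):
            //   fucom             ; compare, result written to FPSW
            //   fnstsw %ax        ; copy FPSW into AX
            //   sahf              ; load AH (FPSW bits 8..15) into EFLAGS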
   11932   SDLoc dl(Cmp);
   11933   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
   11934   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
   11935   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
   11936                             DAG.getConstant(8, MVT::i8));
   11937   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
   11938   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
   11939 }
   11940 
   11941 static bool isAllOnes(SDValue V) {
   11942   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   11943   return C && C->isAllOnesValue();
   11944 }
   11945 
    11946 /// LowerToBT - The result of an 'and' is compared against zero. Turn it into
    11947 /// a BT node if possible.
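          /// For example (illustrative):
          ///   ((and X, (shl 1, N)) == 0)  -->  (BT X, N) + SETAE
          ///   ((and X, (shl 1, N)) != 0)  -->  (BT X, N) + SETB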
   11948 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
   11949                                      SDLoc dl, SelectionDAG &DAG) const {
   11950   SDValue Op0 = And.getOperand(0);
   11951   SDValue Op1 = And.getOperand(1);
   11952   if (Op0.getOpcode() == ISD::TRUNCATE)
   11953     Op0 = Op0.getOperand(0);
   11954   if (Op1.getOpcode() == ISD::TRUNCATE)
   11955     Op1 = Op1.getOperand(0);
   11956 
   11957   SDValue LHS, RHS;
   11958   if (Op1.getOpcode() == ISD::SHL)
   11959     std::swap(Op0, Op1);
   11960   if (Op0.getOpcode() == ISD::SHL) {
   11961     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
   11962       if (And00C->getZExtValue() == 1) {
   11963         // If we looked past a truncate, check that it's only truncating away
   11964         // known zeros.
   11965         unsigned BitWidth = Op0.getValueSizeInBits();
   11966         unsigned AndBitWidth = And.getValueSizeInBits();
   11967         if (BitWidth > AndBitWidth) {
   11968           APInt Zeros, Ones;
   11969           DAG.computeKnownBits(Op0, Zeros, Ones);
   11970           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
   11971             return SDValue();
   11972         }
   11973         LHS = Op1;
   11974         RHS = Op0.getOperand(1);
   11975       }
   11976   } else if (Op1.getOpcode() == ISD::Constant) {
   11977     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
   11978     uint64_t AndRHSVal = AndRHS->getZExtValue();
   11979     SDValue AndLHS = Op0;
   11980 
   11981     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
   11982       LHS = AndLHS.getOperand(0);
   11983       RHS = AndLHS.getOperand(1);
   11984     }
   11985 
   11986     // Use BT if the immediate can't be encoded in a TEST instruction.
   11987     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
   11988       LHS = AndLHS;
   11989       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
   11990     }
   11991   }
   11992 
   11993   if (LHS.getNode()) {
   11994     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
   11995     // instruction.  Since the shift amount is in-range-or-undefined, we know
   11996     // that doing a bittest on the i32 value is ok.  We extend to i32 because
   11997     // the encoding for the i16 version is larger than the i32 version.
    11998     // Also promote i16 to i32 for performance / code size reasons.
   11999     if (LHS.getValueType() == MVT::i8 ||
   12000         LHS.getValueType() == MVT::i16)
   12001       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
   12002 
   12003     // If the operand types disagree, extend the shift amount to match.  Since
   12004     // BT ignores high bits (like shifts) we can use anyextend.
   12005     if (LHS.getValueType() != RHS.getValueType())
   12006       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
   12007 
   12008     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
   12009     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
   12010     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   12011                        DAG.getConstant(Cond, MVT::i8), BT);
   12012   }
   12013 
   12014   return SDValue();
   12015 }
   12016 
    12017 /// \brief Turns an ISD::CondCode into a value suitable for SSE floating-point
    12018 /// mask CMPs.
   12019 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   12020                               SDValue &Op1) {
   12021   unsigned SSECC;
   12022   bool Swap = false;
   12023 
   12024   // SSE Condition code mapping:
   12025   //  0 - EQ
   12026   //  1 - LT
   12027   //  2 - LE
   12028   //  3 - UNORD
   12029   //  4 - NEQ
   12030   //  5 - NLT
   12031   //  6 - NLE
   12032   //  7 - ORD
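            //  8 - invalid; used below as a local sentinel for SETUEQ/SETONE,
            //      which need two comparisons (handled in LowerVSETCC).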
   12033   switch (SetCCOpcode) {
   12034   default: llvm_unreachable("Unexpected SETCC condition");
   12035   case ISD::SETOEQ:
   12036   case ISD::SETEQ:  SSECC = 0; break;
   12037   case ISD::SETOGT:
   12038   case ISD::SETGT:  Swap = true; // Fallthrough
   12039   case ISD::SETLT:
   12040   case ISD::SETOLT: SSECC = 1; break;
   12041   case ISD::SETOGE:
   12042   case ISD::SETGE:  Swap = true; // Fallthrough
   12043   case ISD::SETLE:
   12044   case ISD::SETOLE: SSECC = 2; break;
   12045   case ISD::SETUO:  SSECC = 3; break;
   12046   case ISD::SETUNE:
   12047   case ISD::SETNE:  SSECC = 4; break;
   12048   case ISD::SETULE: Swap = true; // Fallthrough
   12049   case ISD::SETUGE: SSECC = 5; break;
   12050   case ISD::SETULT: Swap = true; // Fallthrough
   12051   case ISD::SETUGT: SSECC = 6; break;
   12052   case ISD::SETO:   SSECC = 7; break;
   12053   case ISD::SETUEQ:
   12054   case ISD::SETONE: SSECC = 8; break;
   12055   }
   12056   if (Swap)
   12057     std::swap(Op0, Op1);
   12058 
   12059   return SSECC;
   12060 }
   12061 
    12062 // Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
    12063 // ones, and then concatenate the result back.
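          // For example (illustrative): a v8i32 compare becomes two v4i32 compares,
          // one per 128-bit half, rejoined with CONCAT_VECTORS.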
   12064 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
   12065   MVT VT = Op.getSimpleValueType();
   12066 
   12067   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
   12068          "Unsupported value type for operation");
   12069 
   12070   unsigned NumElems = VT.getVectorNumElements();
   12071   SDLoc dl(Op);
   12072   SDValue CC = Op.getOperand(2);
   12073 
   12074   // Extract the LHS vectors
   12075   SDValue LHS = Op.getOperand(0);
   12076   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
   12077   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
   12078 
   12079   // Extract the RHS vectors
   12080   SDValue RHS = Op.getOperand(1);
   12081   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
   12082   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
   12083 
   12084   // Issue the operation on the smaller types and concatenate the result back
   12085   MVT EltVT = VT.getVectorElementType();
   12086   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   12087   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   12088                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
   12089                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
   12090 }
   12091 
   12092 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
   12093                                      const X86Subtarget *Subtarget) {
   12094   SDValue Op0 = Op.getOperand(0);
   12095   SDValue Op1 = Op.getOperand(1);
   12096   SDValue CC = Op.getOperand(2);
   12097   MVT VT = Op.getSimpleValueType();
   12098   SDLoc dl(Op);
   12099 
   12100   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
   12101          Op.getValueType().getScalarType() == MVT::i1 &&
   12102          "Cannot set masked compare for this operation");
   12103 
   12104   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
    12105   unsigned Opc = 0;
   12106   bool Unsigned = false;
   12107   bool Swap = false;
   12108   unsigned SSECC;
   12109   switch (SetCCOpcode) {
   12110   default: llvm_unreachable("Unexpected SETCC condition");
   12111   case ISD::SETNE:  SSECC = 4; break;
   12112   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
   12113   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
   12114   case ISD::SETLT:  Swap = true; //fall-through
   12115   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
   12116   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
   12117   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
   12118   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
   12119   case ISD::SETULE: Unsigned = true; //fall-through
   12120   case ISD::SETLE:  SSECC = 2; break;
   12121   }
   12122 
   12123   if (Swap)
   12124     std::swap(Op0, Op1);
   12125   if (Opc)
   12126     return DAG.getNode(Opc, dl, VT, Op0, Op1);
    12127   Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
   12128   return DAG.getNode(Opc, dl, VT, Op0, Op1,
   12129                      DAG.getConstant(SSECC, MVT::i8));
   12130 }
   12131 
    12132 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
    12133 /// operand \p Op1.  If this is non-trivial (for example because the operand
    12134 /// is not a constant), return an empty value.
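          /// For example (illustrative):
          ///   X u< <i32 10, i32 20, ...>  -->  X u<= <i32 9, i32 19, ...>
          /// A zero element would underflow, so that case is rejected.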
   12135 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
   12136 {
   12137   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
   12138   if (!BV)
   12139     return SDValue();
   12140 
   12141   MVT VT = Op1.getSimpleValueType();
   12142   MVT EVT = VT.getVectorElementType();
   12143   unsigned n = VT.getVectorNumElements();
   12144   SmallVector<SDValue, 8> ULTOp1;
   12145 
   12146   for (unsigned i = 0; i < n; ++i) {
   12147     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
   12148     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
   12149       return SDValue();
   12150 
   12151     // Avoid underflow.
   12152     APInt Val = Elt->getAPIntValue();
   12153     if (Val == 0)
   12154       return SDValue();
   12155 
   12156     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
   12157   }
   12158 
   12159   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
   12160 }
   12161 
   12162 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   12163                            SelectionDAG &DAG) {
   12164   SDValue Op0 = Op.getOperand(0);
   12165   SDValue Op1 = Op.getOperand(1);
   12166   SDValue CC = Op.getOperand(2);
   12167   MVT VT = Op.getSimpleValueType();
   12168   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   12169   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
   12170   SDLoc dl(Op);
   12171 
   12172   if (isFP) {
   12173 #ifndef NDEBUG
   12174     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
   12175     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
   12176 #endif
   12177 
   12178     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
   12179     unsigned Opc = X86ISD::CMPP;
   12180     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
   12181       assert(VT.getVectorNumElements() <= 16);
   12182       Opc = X86ISD::CMPM;
   12183     }
   12184     // In the two special cases we can't handle, emit two comparisons.
   12185     if (SSECC == 8) {
   12186       unsigned CC0, CC1;
   12187       unsigned CombineOpc;
   12188       if (SetCCOpcode == ISD::SETUEQ) {
   12189         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
   12190       } else {
   12191         assert(SetCCOpcode == ISD::SETONE);
   12192         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
   12193       }
   12194 
   12195       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   12196                                  DAG.getConstant(CC0, MVT::i8));
   12197       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   12198                                  DAG.getConstant(CC1, MVT::i8));
   12199       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
   12200     }
   12201     // Handle all other FP comparisons here.
   12202     return DAG.getNode(Opc, dl, VT, Op0, Op1,
   12203                        DAG.getConstant(SSECC, MVT::i8));
   12204   }
   12205 
   12206   // Break 256-bit integer vector compare into smaller ones.
   12207   if (VT.is256BitVector() && !Subtarget->hasInt256())
   12208     return Lower256IntVSETCC(Op, DAG);
   12209 
   12210   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
   12211   EVT OpVT = Op1.getValueType();
   12212   if (Subtarget->hasAVX512()) {
   12213     if (Op1.getValueType().is512BitVector() ||
   12214         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
   12215       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
   12216 
    12217     // In the AVX-512 architecture setcc returns a mask with i1
    12218     // elements, but there is no compare instruction for i8 and i16
    12219     // elements. 512-bit operands are not involved here; those
    12220     // types are illegal.
   12221     if (MaskResult &&
   12222         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
   12223          OpVT.getVectorElementType().getSizeInBits() >= 8))
   12224       return DAG.getNode(ISD::TRUNCATE, dl, VT,
   12225                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
   12226   }
   12227 
   12228   // We are handling one of the integer comparisons here.  Since SSE only has
    12229   // GT and EQ comparisons for integers, swapping operands and multiple
   12230   // operations may be required for some comparisons.
   12231   unsigned Opc;
   12232   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
   12233   bool Subus = false;
   12234 
   12235   switch (SetCCOpcode) {
   12236   default: llvm_unreachable("Unexpected SETCC condition");
   12237   case ISD::SETNE:  Invert = true;
   12238   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
   12239   case ISD::SETLT:  Swap = true;
   12240   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
   12241   case ISD::SETGE:  Swap = true;
   12242   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
   12243                     Invert = true; break;
   12244   case ISD::SETULT: Swap = true;
   12245   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
   12246                     FlipSigns = true; break;
   12247   case ISD::SETUGE: Swap = true;
   12248   case ISD::SETULE: Opc = X86ISD::PCMPGT;
   12249                     FlipSigns = true; Invert = true; break;
   12250   }
   12251 
   12252   // Special case: Use min/max operations for SETULE/SETUGE
   12253   MVT VET = VT.getVectorElementType();
   12254   bool hasMinMax =
   12255        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
   12256     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
   12257 
   12258   if (hasMinMax) {
   12259     switch (SetCCOpcode) {
   12260     default: break;
   12261     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
   12262     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
   12263     }
   12264 
   12265     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
   12266   }
   12267 
   12268   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
   12269   if (!MinMax && hasSubus) {
   12270     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
   12271     // Op0 u<= Op1:
   12272     //   t = psubus Op0, Op1
   12273     //   pcmpeq t, <0..0>
   12274     switch (SetCCOpcode) {
   12275     default: break;
   12276     case ISD::SETULT: {
   12277       // If the comparison is against a constant we can turn this into a
   12278       // setule.  With psubus, setule does not require a swap.  This is
    12279       // beneficial because the constant in the register is no longer
    12280       // clobbered as the destination, so it can be hoisted out of a loop.
   12281       // Only do this pre-AVX since vpcmp* is no longer destructive.
   12282       if (Subtarget->hasAVX())
   12283         break;
   12284       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
   12285       if (ULEOp1.getNode()) {
   12286         Op1 = ULEOp1;
   12287         Subus = true; Invert = false; Swap = false;
   12288       }
   12289       break;
   12290     }
   12291     // Psubus is better than flip-sign because it requires no inversion.
   12292     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
   12293     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
   12294     }
   12295 
   12296     if (Subus) {
   12297       Opc = X86ISD::SUBUS;
   12298       FlipSigns = false;
   12299     }
   12300   }
   12301 
   12302   if (Swap)
   12303     std::swap(Op0, Op1);
   12304 
   12305   // Check that the operation in question is available (most are plain SSE2,
   12306   // but PCMPGTQ and PCMPEQQ have different requirements).
   12307   if (VT == MVT::v2i64) {
   12308     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
   12309       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
   12310 
   12311       // First cast everything to the right type.
   12312       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
   12313       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
   12314 
   12315       // Since SSE has no unsigned integer comparisons, we need to flip the sign
   12316       // bits of the inputs before performing those operations. The lower
   12317       // compare is always unsigned.
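                // (For a signed compare only the low dwords need the flip, via the
                //  { Sign, Zero, Sign, Zero } constant below; the high-dword compare
                //  stays signed. For an unsigned compare every dword is flipped.)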
   12318       SDValue SB;
   12319       if (FlipSigns) {
   12320         SB = DAG.getConstant(0x80000000U, MVT::v4i32);
   12321       } else {
   12322         SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
   12323         SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
   12324         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   12325                          Sign, Zero, Sign, Zero);
   12326       }
   12327       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
   12328       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
   12329 
   12330       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
   12331       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
   12332       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
   12333 
   12334       // Create masks for only the low parts/high parts of the 64 bit integers.
   12335       static const int MaskHi[] = { 1, 1, 3, 3 };
   12336       static const int MaskLo[] = { 0, 0, 2, 2 };
   12337       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
   12338       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
   12339       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
   12340 
   12341       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
   12342       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
   12343 
   12344       if (Invert)
   12345         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   12346 
   12347       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
   12348     }
   12349 
   12350     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
    12351       // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq
    12352       // with pcmpeqd + pshufd + pand.
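                // Each 64-bit lane is equal iff both of its 32-bit halves are equal;
                // the { 1, 0, 3, 2 } shuffle swaps the halves within each lane so the
                // AND below combines the two half-results.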
   12353       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
   12354 
   12355       // First cast everything to the right type.
   12356       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
   12357       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
   12358 
   12359       // Do the compare.
   12360       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
   12361 
   12362       // Make sure the lower and upper halves are both all-ones.
   12363       static const int Mask[] = { 1, 0, 3, 2 };
   12364       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
   12365       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
   12366 
   12367       if (Invert)
   12368         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   12369 
   12370       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
   12371     }
   12372   }
   12373 
   12374   // Since SSE has no unsigned integer comparisons, we need to flip the sign
   12375   // bits of the inputs before performing those operations.
   12376   if (FlipSigns) {
   12377     EVT EltVT = VT.getVectorElementType();
   12378     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
   12379     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
   12380     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
   12381   }
   12382 
   12383   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
   12384 
   12385   // If the logical-not of the result is required, perform that now.
   12386   if (Invert)
   12387     Result = DAG.getNOT(dl, Result, VT);
   12388 
   12389   if (MinMax)
   12390     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
   12391 
   12392   if (Subus)
   12393     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
   12394                          getZeroVector(VT, Subtarget, DAG, dl));
   12395 
   12396   return Result;
   12397 }
   12398 
   12399 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   12400 
   12401   MVT VT = Op.getSimpleValueType();
   12402 
   12403   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
   12404 
   12405   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
   12406          && "SetCC type must be 8-bit or 1-bit integer");
   12407   SDValue Op0 = Op.getOperand(0);
   12408   SDValue Op1 = Op.getOperand(1);
   12409   SDLoc dl(Op);
   12410   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   12411 
   12412   // Optimize to BT if possible.
   12413   // Lower (X & (1 << N)) == 0 to BT(X, N).
   12414   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
   12415   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
   12416   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
   12417       Op1.getOpcode() == ISD::Constant &&
   12418       cast<ConstantSDNode>(Op1)->isNullValue() &&
   12419       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   12420     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
   12421     if (NewSetCC.getNode())
   12422       return NewSetCC;
   12423   }
   12424 
   12425   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
   12426   // these.
   12427   if (Op1.getOpcode() == ISD::Constant &&
   12428       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
   12429        cast<ConstantSDNode>(Op1)->isNullValue()) &&
   12430       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   12431 
   12432     // If the input is a setcc, then reuse the input setcc or use a new one with
   12433     // the inverted condition.
   12434     if (Op0.getOpcode() == X86ISD::SETCC) {
   12435       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
   12436       bool Invert = (CC == ISD::SETNE) ^
   12437         cast<ConstantSDNode>(Op1)->isNullValue();
   12438       if (!Invert)
   12439         return Op0;
   12440 
   12441       CCode = X86::GetOppositeBranchCondition(CCode);
   12442       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   12443                                   DAG.getConstant(CCode, MVT::i8),
   12444                                   Op0.getOperand(1));
   12445       if (VT == MVT::i1)
   12446         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
   12447       return SetCC;
   12448     }
   12449   }
   12450   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
   12451       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
   12452       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   12453 
   12454     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
   12455     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
   12456   }
   12457 
   12458   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
   12459   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
   12460   if (X86CC == X86::COND_INVALID)
   12461     return SDValue();
   12462 
   12463   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
   12464   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
   12465   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   12466                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
   12467   if (VT == MVT::i1)
   12468     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
   12469   return SetCC;
   12470 }
   12471 
   12472 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
   12473 static bool isX86LogicalCmp(SDValue Op) {
   12474   unsigned Opc = Op.getNode()->getOpcode();
   12475   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
   12476       Opc == X86ISD::SAHF)
   12477     return true;
   12478   if (Op.getResNo() == 1 &&
   12479       (Opc == X86ISD::ADD ||
   12480        Opc == X86ISD::SUB ||
   12481        Opc == X86ISD::ADC ||
   12482        Opc == X86ISD::SBB ||
   12483        Opc == X86ISD::SMUL ||
   12484        Opc == X86ISD::UMUL ||
   12485        Opc == X86ISD::INC ||
   12486        Opc == X86ISD::DEC ||
   12487        Opc == X86ISD::OR ||
   12488        Opc == X86ISD::XOR ||
   12489        Opc == X86ISD::AND))
   12490     return true;
   12491 
   12492   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
   12493     return true;
   12494 
   12495   return false;
   12496 }
   12497 
   12498 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   12499   if (V.getOpcode() != ISD::TRUNCATE)
   12500     return false;
   12501 
   12502   SDValue VOp0 = V.getOperand(0);
   12503   unsigned InBits = VOp0.getValueSizeInBits();
   12504   unsigned Bits = V.getValueSizeInBits();
   12505   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
   12506 }
   12507 
   12508 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   12509   bool addTest = true;
   12510   SDValue Cond  = Op.getOperand(0);
   12511   SDValue Op1 = Op.getOperand(1);
   12512   SDValue Op2 = Op.getOperand(2);
   12513   SDLoc DL(Op);
   12514   EVT VT = Op1.getValueType();
   12515   SDValue CC;
   12516 
   12517   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
   12518   // are available. Otherwise fp cmovs get lowered into a less efficient branch
   12519   // sequence later on.
   12520   if (Cond.getOpcode() == ISD::SETCC &&
   12521       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
   12522        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
   12523       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
   12524     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
   12525     int SSECC = translateX86FSETCC(
   12526         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
   12527 
   12528     if (SSECC != 8) {
   12529       if (Subtarget->hasAVX512()) {
   12530         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
   12531                                   DAG.getConstant(SSECC, MVT::i8));
   12532         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
   12533       }
   12534       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
   12535                                 DAG.getConstant(SSECC, MVT::i8));
   12536       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
   12537       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
   12538       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
   12539     }
   12540   }
   12541 
   12542   if (Cond.getOpcode() == ISD::SETCC) {
   12543     SDValue NewCond = LowerSETCC(Cond, DAG);
   12544     if (NewCond.getNode())
   12545       Cond = NewCond;
   12546   }
   12547 
   12548   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
   12549   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
   12550   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
   12551   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
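            // For example (illustrative, i32): for (select (x == 0), -1, y),
            // CMP(x, 1) sets the carry flag exactly when x == 0 (unsigned x < 1),
            // SETCC_CARRY materializes all-ones or zero from CF, and OR'ing in y
            // then yields -1 when x == 0 and y otherwise.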
   12552   if (Cond.getOpcode() == X86ISD::SETCC &&
   12553       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
   12554       isZero(Cond.getOperand(1).getOperand(1))) {
   12555     SDValue Cmp = Cond.getOperand(1);
    12556     unsigned CondCode = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
   12557     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
   12558 
   12559     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
   12560         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
   12561       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
   12562 
   12563       SDValue CmpOp0 = Cmp.getOperand(0);
   12564       // Apply further optimizations for special cases
   12565       // (select (x != 0), -1, 0) -> neg & sbb
   12566       // (select (x == 0), 0, -1) -> neg & sbb
   12567       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
   12568         if (YC->isNullValue() &&
   12569             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
   12570           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
   12571           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
   12572                                     DAG.getConstant(0, CmpOp0.getValueType()),
   12573                                     CmpOp0);
   12574           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   12575                                     DAG.getConstant(X86::COND_B, MVT::i8),
   12576                                     SDValue(Neg.getNode(), 1));
   12577           return Res;
   12578         }
   12579 
   12580       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
   12581                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
   12582       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   12583 
   12584       SDValue Res =   // Res = 0 or -1.
   12585         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   12586                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
   12587 
   12588       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
   12589         Res = DAG.getNOT(DL, Res, Res.getValueType());
   12590 
   12591       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
   12592       if (!N2C || !N2C->isNullValue())
   12593         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
   12594       return Res;
   12595     }
   12596   }
   12597 
   12598   // Look past (and (setcc_carry (cmp ...)), 1).
   12599   if (Cond.getOpcode() == ISD::AND &&
   12600       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   12601     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   12602     if (C && C->getAPIntValue() == 1)
   12603       Cond = Cond.getOperand(0);
   12604   }
   12605 
   12606   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   12607   // setting operand in place of the X86ISD::SETCC.
   12608   unsigned CondOpcode = Cond.getOpcode();
   12609   if (CondOpcode == X86ISD::SETCC ||
   12610       CondOpcode == X86ISD::SETCC_CARRY) {
   12611     CC = Cond.getOperand(0);
   12612 
   12613     SDValue Cmp = Cond.getOperand(1);
   12614     unsigned Opc = Cmp.getOpcode();
   12615     MVT VT = Op.getSimpleValueType();
   12616 
   12617     bool IllegalFPCMov = false;
   12618     if (VT.isFloatingPoint() && !VT.isVector() &&
   12619         !isScalarFPTypeInSSEReg(VT))  // FPStack?
   12620       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
   12621 
   12622     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
   12623         Opc == X86ISD::BT) { // FIXME
   12624       Cond = Cmp;
   12625       addTest = false;
   12626     }
   12627   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   12628              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   12629              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   12630               Cond.getOperand(0).getValueType() != MVT::i8)) {
   12631     SDValue LHS = Cond.getOperand(0);
   12632     SDValue RHS = Cond.getOperand(1);
   12633     unsigned X86Opcode;
   12634     unsigned X86Cond;
   12635     SDVTList VTs;
   12636     switch (CondOpcode) {
   12637     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   12638     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   12639     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   12640     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   12641     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   12642     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   12643     default: llvm_unreachable("unexpected overflowing operator");
   12644     }
   12645     if (CondOpcode == ISD::UMULO)
   12646       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   12647                           MVT::i32);
   12648     else
   12649       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   12650 
   12651     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
   12652 
   12653     if (CondOpcode == ISD::UMULO)
   12654       Cond = X86Op.getValue(2);
   12655     else
   12656       Cond = X86Op.getValue(1);
   12657 
   12658     CC = DAG.getConstant(X86Cond, MVT::i8);
   12659     addTest = false;
   12660   }
   12661 
   12662   if (addTest) {
    12663     // Look past the truncate if the high bits are known zero.
    12664     if (isTruncWithZeroHighBitsInput(Cond, DAG))
    12665       Cond = Cond.getOperand(0);
   12666 
   12667     // We know the result of AND is compared against zero. Try to match
   12668     // it to BT.
   12669     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   12670       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
   12671       if (NewSetCC.getNode()) {
   12672         CC = NewSetCC.getOperand(0);
   12673         Cond = NewSetCC.getOperand(1);
   12674         addTest = false;
   12675       }
   12676     }
   12677   }
   12678 
   12679   if (addTest) {
   12680     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   12681     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
   12682   }
   12683 
   12684   // a <  b ? -1 :  0 -> RES = ~setcc_carry
   12685   // a <  b ?  0 : -1 -> RES = setcc_carry
   12686   // a >= b ? -1 :  0 -> RES = setcc_carry
   12687   // a >= b ?  0 : -1 -> RES = ~setcc_carry
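            // (setcc_carry is materialized as "sbb reg, reg", producing 0 or
            //  all-ones from the carry flag left by the SUB.)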
   12688   if (Cond.getOpcode() == X86ISD::SUB) {
   12689     Cond = ConvertCmpIfNecessary(Cond, DAG);
   12690     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
   12691 
   12692     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
   12693         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
   12694       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   12695                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
   12696       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
   12697         return DAG.getNOT(DL, Res, Res.getValueType());
   12698       return Res;
   12699     }
   12700   }
   12701 
    12702   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
    12703   // widen the cmov and push the truncate through. This avoids introducing a new
   12704   // branch during isel and doesn't add any extensions.
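            // For example (illustrative): with i32 inputs X and Y,
            //   select cc, (trunc X), (trunc Y)  -->  trunc (cmov cc, X, Y)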
   12705   if (Op.getValueType() == MVT::i8 &&
   12706       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
   12707     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
   12708     if (T1.getValueType() == T2.getValueType() &&
   12709         // Blacklist CopyFromReg to avoid partial register stalls.
    12710         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode() != ISD::CopyFromReg) {
   12711       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
   12712       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
   12713       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   12714     }
   12715   }
   12716 
   12717   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   12718   // condition is true.
   12719   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   12720   SDValue Ops[] = { Op2, Op1, CC, Cond };
   12721   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
   12722 }
   12723 
   12724 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
   12725   MVT VT = Op->getSimpleValueType(0);
   12726   SDValue In = Op->getOperand(0);
   12727   MVT InVT = In.getSimpleValueType();
   12728   SDLoc dl(Op);
   12729 
   12730   unsigned int NumElts = VT.getVectorNumElements();
   12731   if (NumElts != 8 && NumElts != 16)
   12732     return SDValue();
   12733 
   12734   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
   12735     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   12736 
   12737   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    12738   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
   12739 
   12740   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
   12741   Constant *C = ConstantInt::get(*DAG.getContext(),
   12742     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
   12743 
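            // The constant-pool load below supplies an all-ones scalar; VBROADCASTM
            // broadcasts it into the lanes selected by the i1 mask, in effect
            // sign-extending each mask bit to all-ones or zero.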
   12744   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
   12745   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   12746   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
   12747                           MachinePointerInfo::getConstantPool(),
   12748                           false, false, false, Alignment);
   12749   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
   12750   if (VT.is512BitVector())
   12751     return Brcst;
   12752   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
   12753 }
   12754 
   12755 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   12756                                 SelectionDAG &DAG) {
   12757   MVT VT = Op->getSimpleValueType(0);
   12758   SDValue In = Op->getOperand(0);
   12759   MVT InVT = In.getSimpleValueType();
   12760   SDLoc dl(Op);
   12761 
   12762   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
   12763     return LowerSIGN_EXTEND_AVX512(Op, DAG);
   12764 
   12765   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
   12766       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
   12767       (VT != MVT::v16i16 || InVT != MVT::v16i8))
   12768     return SDValue();
   12769 
   12770   if (Subtarget->hasInt256())
   12771     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   12772 
    12773   // Optimize vectors in AVX mode:
    12774   // sign-extend v8i16 to v8i32 and
    12775   //             v4i32 to v4i64.
    12776   //
    12777   // Divide the input vector into two parts; for v4i32 the shuffle masks
    12778   // will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }. Use the vpmovsx
    12779   // instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32, then
    12780   // concatenate the vectors back to the original VT.
   12781 
   12782   unsigned NumElems = InVT.getVectorNumElements();
   12783   SDValue Undef = DAG.getUNDEF(InVT);
   12784 
   12785   SmallVector<int,8> ShufMask1(NumElems, -1);
   12786   for (unsigned i = 0; i != NumElems/2; ++i)
   12787     ShufMask1[i] = i;
   12788 
   12789   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
   12790 
   12791   SmallVector<int,8> ShufMask2(NumElems, -1);
   12792   for (unsigned i = 0; i != NumElems/2; ++i)
   12793     ShufMask2[i] = i + NumElems/2;
   12794 
   12795   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
   12796 
   12797   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
   12798                                 VT.getVectorNumElements()/2);
   12799 
   12800   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
   12801   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
   12802 
   12803   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   12804 }
   12805 
    12806 // isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
    12807 // X86ISD::SETCC nodes, each of which has no other use apart from the
    12808 // AND / OR.
   12809 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
   12810   Opc = Op.getOpcode();
   12811   if (Opc != ISD::OR && Opc != ISD::AND)
   12812     return false;
   12813   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   12814           Op.getOperand(0).hasOneUse() &&
   12815           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
   12816           Op.getOperand(1).hasOneUse());
   12817 }
   12818 
    12819 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
    12820 // 1, and the SETCC node has a single use.
   12821 static bool isXor1OfSetCC(SDValue Op) {
   12822   if (Op.getOpcode() != ISD::XOR)
   12823     return false;
   12824   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   12825   if (N1C && N1C->getAPIntValue() == 1) {
   12826     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   12827       Op.getOperand(0).hasOneUse();
   12828   }
   12829   return false;
   12830 }
   12831 
   12832 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   12833   bool addTest = true;
   12834   SDValue Chain = Op.getOperand(0);
   12835   SDValue Cond  = Op.getOperand(1);
   12836   SDValue Dest  = Op.getOperand(2);
   12837   SDLoc dl(Op);
   12838   SDValue CC;
   12839   bool Inverted = false;
   12840 
   12841   if (Cond.getOpcode() == ISD::SETCC) {
   12842     // Check for setcc([su]{add,sub,mul}o == 0).
   12843     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
   12844         isa<ConstantSDNode>(Cond.getOperand(1)) &&
   12845         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
   12846         Cond.getOperand(0).getResNo() == 1 &&
   12847         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
   12848          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
   12849          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
   12850          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
   12851          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
   12852          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
   12853       Inverted = true;
   12854       Cond = Cond.getOperand(0);
   12855     } else {
   12856       SDValue NewCond = LowerSETCC(Cond, DAG);
   12857       if (NewCond.getNode())
   12858         Cond = NewCond;
   12859     }
   12860   }
   12861 #if 0
   12862   // FIXME: LowerXALUO doesn't handle these!!
   12863   else if (Cond.getOpcode() == X86ISD::ADD  ||
   12864            Cond.getOpcode() == X86ISD::SUB  ||
   12865            Cond.getOpcode() == X86ISD::SMUL ||
   12866            Cond.getOpcode() == X86ISD::UMUL)
   12867     Cond = LowerXALUO(Cond, DAG);
   12868 #endif
   12869 
    12870   // Look past (and (setcc_carry (cmp ...)), 1).
   12871   if (Cond.getOpcode() == ISD::AND &&
   12872       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   12873     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   12874     if (C && C->getAPIntValue() == 1)
   12875       Cond = Cond.getOperand(0);
   12876   }
   12877 
   12878   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   12879   // setting operand in place of the X86ISD::SETCC.
   12880   unsigned CondOpcode = Cond.getOpcode();
   12881   if (CondOpcode == X86ISD::SETCC ||
   12882       CondOpcode == X86ISD::SETCC_CARRY) {
   12883     CC = Cond.getOperand(0);
   12884 
   12885     SDValue Cmp = Cond.getOperand(1);
   12886     unsigned Opc = Cmp.getOpcode();
   12887     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
   12888     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
   12889       Cond = Cmp;
   12890       addTest = false;
   12891     } else {
   12892       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
   12893       default: break;
   12894       case X86::COND_O:
   12895       case X86::COND_B:
   12896         // These can only come from an arithmetic instruction with overflow,
   12897         // e.g. SADDO, UADDO.
   12898         Cond = Cond.getNode()->getOperand(1);
   12899         addTest = false;
   12900         break;
   12901       }
   12902     }
   12903   }
   12904   CondOpcode = Cond.getOpcode();
   12905   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   12906       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   12907       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   12908        Cond.getOperand(0).getValueType() != MVT::i8)) {
   12909     SDValue LHS = Cond.getOperand(0);
   12910     SDValue RHS = Cond.getOperand(1);
   12911     unsigned X86Opcode;
   12912     unsigned X86Cond;
   12913     SDVTList VTs;
   12914     // Keep this in sync with LowerXALUO, otherwise we might create redundant
   12915     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
   12916     // X86ISD::INC).
   12917     switch (CondOpcode) {
   12918     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   12919     case ISD::SADDO:
   12920       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   12921         if (C->isOne()) {
   12922           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
   12923           break;
   12924         }
   12925       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   12926     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   12927     case ISD::SSUBO:
   12928       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   12929         if (C->isOne()) {
   12930           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
   12931           break;
   12932         }
   12933       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   12934     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   12935     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   12936     default: llvm_unreachable("unexpected overflowing operator");
   12937     }
   12938     if (Inverted)
   12939       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
   12940     if (CondOpcode == ISD::UMULO)
   12941       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   12942                           MVT::i32);
   12943     else
   12944       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   12945 
   12946     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
   12947 
   12948     if (CondOpcode == ISD::UMULO)
   12949       Cond = X86Op.getValue(2);
   12950     else
   12951       Cond = X86Op.getValue(1);
   12952 
   12953     CC = DAG.getConstant(X86Cond, MVT::i8);
   12954     addTest = false;
   12955   } else {
   12956     unsigned CondOpc;
   12957     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
   12958       SDValue Cmp = Cond.getOperand(0).getOperand(1);
   12959       if (CondOpc == ISD::OR) {
   12960         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
   12961         // two branches instead of an explicit OR instruction with a
   12962         // separate test.
   12963         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   12964             isX86LogicalCmp(Cmp)) {
   12965           CC = Cond.getOperand(0).getOperand(0);
   12966           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   12967                               Chain, Dest, CC, Cmp);
   12968           CC = Cond.getOperand(1).getOperand(0);
   12969           Cond = Cmp;
   12970           addTest = false;
   12971         }
   12972       } else { // ISD::AND
   12973         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
   12974         // two branches instead of an explicit AND instruction with a
   12975         // separate test. However, we only do this if this block doesn't
   12976         // have a fall-through edge, because this requires an explicit
   12977         // jmp when the condition is false.
   12978         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   12979             isX86LogicalCmp(Cmp) &&
   12980             Op.getNode()->hasOneUse()) {
   12981           X86::CondCode CCode =
   12982             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   12983           CCode = X86::GetOppositeBranchCondition(CCode);
   12984           CC = DAG.getConstant(CCode, MVT::i8);
   12985           SDNode *User = *Op.getNode()->use_begin();
   12986           // Look for an unconditional branch following this conditional branch.
   12987           // We need this because we need to reverse the successors in order
   12988           // to implement FCMP_OEQ.
   12989           if (User->getOpcode() == ISD::BR) {
   12990             SDValue FalseBB = User->getOperand(1);
   12991             SDNode *NewBR =
   12992               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   12993             assert(NewBR == User);
   12994             (void)NewBR;
   12995             Dest = FalseBB;
   12996 
   12997             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   12998                                 Chain, Dest, CC, Cmp);
   12999             X86::CondCode CCode =
   13000               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
   13001             CCode = X86::GetOppositeBranchCondition(CCode);
   13002             CC = DAG.getConstant(CCode, MVT::i8);
   13003             Cond = Cmp;
   13004             addTest = false;
   13005           }
   13006         }
   13007       }
   13008     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
    13009       // Recognize "xorb (setcc), 1" patterns. The xor inverts the condition.
    13010       // It should be transformed by the DAG combiner except when the condition
    13011       // is set by an arithmetic-with-overflow node.
   13012       X86::CondCode CCode =
   13013         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   13014       CCode = X86::GetOppositeBranchCondition(CCode);
   13015       CC = DAG.getConstant(CCode, MVT::i8);
   13016       Cond = Cond.getOperand(0).getOperand(1);
   13017       addTest = false;
   13018     } else if (Cond.getOpcode() == ISD::SETCC &&
   13019                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
   13020       // For FCMP_OEQ, we can emit
   13021       // two branches instead of an explicit AND instruction with a
   13022       // separate test. However, we only do this if this block doesn't
   13023       // have a fall-through edge, because this requires an explicit
   13024       // jmp when the condition is false.
   13025       if (Op.getNode()->hasOneUse()) {
   13026         SDNode *User = *Op.getNode()->use_begin();
   13027         // Look for an unconditional branch following this conditional branch.
   13028         // We need this because we need to reverse the successors in order
   13029         // to implement FCMP_OEQ.
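                  // For example (illustrative): "br (setoeq a, b), TBB, FBB" becomes
                  //   jne FBB ; jp FBB ; jmp TBB
                  // once the successors are reversed below.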
   13030         if (User->getOpcode() == ISD::BR) {
   13031           SDValue FalseBB = User->getOperand(1);
   13032           SDNode *NewBR =
   13033             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   13034           assert(NewBR == User);
   13035           (void)NewBR;
   13036           Dest = FalseBB;
   13037 
   13038           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   13039                                     Cond.getOperand(0), Cond.getOperand(1));
   13040           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   13041           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   13042           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   13043                               Chain, Dest, CC, Cmp);
   13044           CC = DAG.getConstant(X86::COND_P, MVT::i8);
   13045           Cond = Cmp;
   13046           addTest = false;
   13047         }
   13048       }
   13049     } else if (Cond.getOpcode() == ISD::SETCC &&
   13050                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
   13051       // For FCMP_UNE, we can emit
   13052       // two branches instead of an explicit AND instruction with a
   13053       // separate test. However, we only do this if this block doesn't
   13054       // have a fall-through edge, because this requires an explicit
   13055       // jmp when the condition is false.
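      // For example, "br (fcmp une %a, %b), %T, %F" is emitted (illustratively,
      // for f32 operands) as:
      //   ucomiss %b, %a
      //   jne  %T        ; not equal (ZF == 0) -> true block
      //   jnp  %F        ; ordered and equal (PF == 0) -> false block
      //   jmp  %T        ; unordered -> true block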
   13056       if (Op.getNode()->hasOneUse()) {
   13057         SDNode *User = *Op.getNode()->use_begin();
   13058         // Look for an unconditional branch following this conditional branch.
   13059         // We need this because we need to reverse the successors in order
   13060         // to implement FCMP_UNE.
   13061         if (User->getOpcode() == ISD::BR) {
   13062           SDValue FalseBB = User->getOperand(1);
   13063           SDNode *NewBR =
   13064             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   13065           assert(NewBR == User);
   13066           (void)NewBR;
   13067 
   13068           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   13069                                     Cond.getOperand(0), Cond.getOperand(1));
   13070           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   13071           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   13072           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   13073                               Chain, Dest, CC, Cmp);
   13074           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
   13075           Cond = Cmp;
   13076           addTest = false;
   13077           Dest = FalseBB;
   13078         }
   13079       }
   13080     }
   13081   }
   13082 
   13083   if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);
   13087 
   13088     // We know the result of AND is compared against zero. Try to match
   13089     // it to BT.
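    // For example, "brcond (setne (and %x, 8), 0)" can be emitted as
    // "bt $3, %x" followed by "jb" (illustrative; LowerToBT decides whether
    // the pattern applies), avoiding a test-with-immediate.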
   13090     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   13091       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
   13092       if (NewSetCC.getNode()) {
   13093         CC = NewSetCC.getOperand(0);
   13094         Cond = NewSetCC.getOperand(1);
   13095         addTest = false;
   13096       }
   13097     }
   13098   }
   13099 
   13100   if (addTest) {
   13101     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
   13102     CC = DAG.getConstant(X86Cond, MVT::i8);
   13103     Cond = EmitTest(Cond, X86Cond, dl, DAG);
   13104   }
   13105   Cond = ConvertCmpIfNecessary(Cond, DAG);
   13106   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   13107                      Chain, Dest, CC, Cond);
   13108 }
   13109 
// Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4K
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
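// For example, a function that dynamically allocates 8192 bytes is lowered
// roughly to (illustrative; the exact probe symbol depends on the
// environment):
//   movl  $8192, %eax
//   calll _alloca          ; probes the stack one 4K page at a time
//   movl  %esp, <result>
// rather than a single sub from ESP that could jump past a guard page.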
   13115 SDValue
   13116 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   13117                                            SelectionDAG &DAG) const {
   13118   MachineFunction &MF = DAG.getMachineFunction();
   13119   bool SplitStack = MF.shouldSplitStack();
   13120   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
   13121                SplitStack;
   13122   SDLoc dl(Op);
   13123 
   13124   if (!Lower) {
   13125     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13126     SDNode* Node = Op.getNode();
   13127 
   13128     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
   13129     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
   13130         " not tell us which reg is the stack pointer!");
   13131     EVT VT = Node->getValueType(0);
   13132     SDValue Tmp1 = SDValue(Node, 0);
   13133     SDValue Tmp2 = SDValue(Node, 1);
   13134     SDValue Tmp3 = Node->getOperand(2);
   13135     SDValue Chain = Tmp1.getOperand(0);
   13136 
   13137     // Chain the dynamic stack allocation so that it doesn't modify the stack
   13138     // pointer when other instructions are using the stack.
   13139     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
   13140         SDLoc(Node));
   13141 
   13142     SDValue Size = Tmp2.getOperand(1);
   13143     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   13144     Chain = SP.getValue(1);
   13145     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
   13146     const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering();
   13147     unsigned StackAlign = TFI.getStackAlignment();
   13148     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
   13149     if (Align > StackAlign)
   13150       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
   13151           DAG.getConstant(-(uint64_t)Align, VT));
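    // Rounding down via AND with -Align relies on Align being a power of two:
    // e.g. Align == 32 clears the low five bits of the new stack pointer.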
   13152     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
   13153 
   13154     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
   13155         DAG.getIntPtrConstant(0, true), SDValue(),
   13156         SDLoc(Node));
   13157 
   13158     SDValue Ops[2] = { Tmp1, Tmp2 };
   13159     return DAG.getMergeValues(Ops, dl);
   13160   }
   13161 
   13162   // Get the inputs.
   13163   SDValue Chain = Op.getOperand(0);
   13164   SDValue Size  = Op.getOperand(1);
   13165   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   13166   EVT VT = Op.getNode()->getValueType(0);
   13167 
   13168   bool Is64Bit = Subtarget->is64Bit();
   13169   EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
   13170 
   13171   if (SplitStack) {
   13172     MachineRegisterInfo &MRI = MF.getRegInfo();
   13173 
   13174     if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both r10
      // and r11. This makes it impossible to use segmented stacks together
      // with nested parameters.
   13177       const Function *F = MF.getFunction();
   13178 
   13179       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
   13180            I != E; ++I)
   13181         if (I->hasNestAttr())
   13182           report_fatal_error("Cannot use segmented stacks with functions that "
   13183                              "have nested arguments.");
   13184     }
   13185 
    const TargetRegisterClass *AddrRegClass =
      getRegClassFor(Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
   13188     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
   13189     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
   13190     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
   13191                                 DAG.getRegister(Vreg, SPTy));
   13192     SDValue Ops1[2] = { Value, Chain };
   13193     return DAG.getMergeValues(Ops1, dl);
   13194   } else {
   13195     SDValue Flag;
   13196     unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
   13197 
   13198     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
   13199     Flag = Chain.getValue(1);
   13200     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   13201 
   13202     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
   13203 
   13204     const X86RegisterInfo *RegInfo =
   13205       static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   13206     unsigned SPReg = RegInfo->getStackRegister();
   13207     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
   13208     Chain = SP.getValue(1);
   13209 
   13210     if (Align) {
   13211       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
   13212                        DAG.getConstant(-(uint64_t)Align, VT));
   13213       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
   13214     }
   13215 
   13216     SDValue Ops1[2] = { SP, Chain };
   13217     return DAG.getMergeValues(Ops1, dl);
   13218   }
   13219 }
   13220 
   13221 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   13222   MachineFunction &MF = DAG.getMachineFunction();
   13223   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   13224 
   13225   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   13226   SDLoc DL(Op);
   13227 
   13228   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
   13229     // vastart just stores the address of the VarArgsFrameIndex slot into the
   13230     // memory location argument.
   13231     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
   13232                                    getPointerTy());
   13233     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
   13234                         MachinePointerInfo(SV), false, false, 0);
   13235   }
   13236 
   13237   // __va_list_tag:
   13238   //   gp_offset         (0 - 6 * 8)
   13239   //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (points to parameters passed in memory).
   13241   //   reg_save_area
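  //
  // In C terms, the struct being initialized below is (per the AMD64 SysV
  // ABI):
  //   typedef struct {
  //     unsigned int gp_offset;    // byte offset 0
  //     unsigned int fp_offset;    // byte offset 4
  //     void *overflow_arg_area;   // byte offset 8
  //     void *reg_save_area;       // byte offset 16
  //   } __va_list_tag;
  // which matches the 4-, 4- and 8-byte increments applied to FIN below.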
   13242   SmallVector<SDValue, 8> MemOps;
   13243   SDValue FIN = Op.getOperand(1);
   13244   // Store gp_offset
   13245   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
   13246                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
   13247                                                MVT::i32),
   13248                                FIN, MachinePointerInfo(SV), false, false, 0);
   13249   MemOps.push_back(Store);
   13250 
   13251   // Store fp_offset
   13252   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   13253                     FIN, DAG.getIntPtrConstant(4));
   13254   Store = DAG.getStore(Op.getOperand(0), DL,
   13255                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
   13256                                        MVT::i32),
   13257                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
   13258   MemOps.push_back(Store);
   13259 
   13260   // Store ptr to overflow_arg_area
   13261   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   13262                     FIN, DAG.getIntPtrConstant(4));
   13263   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
   13264                                     getPointerTy());
   13265   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
   13266                        MachinePointerInfo(SV, 8),
   13267                        false, false, 0);
   13268   MemOps.push_back(Store);
   13269 
   13270   // Store ptr to reg_save_area.
   13271   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   13272                     FIN, DAG.getIntPtrConstant(8));
   13273   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   13274                                     getPointerTy());
   13275   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
   13276                        MachinePointerInfo(SV, 16), false, false, 0);
   13277   MemOps.push_back(Store);
   13278   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
   13279 }
   13280 
   13281 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   13282   assert(Subtarget->is64Bit() &&
   13283          "LowerVAARG only handles 64-bit va_arg!");
   13284   assert((Subtarget->isTargetLinux() ||
   13285           Subtarget->isTargetDarwin()) &&
   13286           "Unhandled target in LowerVAARG");
   13287   assert(Op.getNode()->getNumOperands() == 4);
   13288   SDValue Chain = Op.getOperand(0);
   13289   SDValue SrcPtr = Op.getOperand(1);
   13290   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   13291   unsigned Align = Op.getConstantOperandVal(3);
   13292   SDLoc dl(Op);
   13293 
   13294   EVT ArgVT = Op.getNode()->getValueType(0);
   13295   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   13296   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
   13297   uint8_t ArgMode;
   13298 
   13299   // Decide which area this value should be read from.
   13300   // TODO: Implement the AMD64 ABI in its entirety. This simple
   13301   // selection mechanism works only for the basic types.
   13302   if (ArgVT == MVT::f80) {
   13303     llvm_unreachable("va_arg for f80 not yet implemented");
   13304   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
   13305     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
   13306   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
   13307     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
   13308   } else {
   13309     llvm_unreachable("Unhandled argument type in LowerVAARG");
   13310   }
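  // For example, va_arg of an i32 (ArgSize == 4) selects ArgMode = 1 and is
  // serviced from the GPR slots at reg_save_area + gp_offset, while va_arg of
  // a double (ArgSize == 8) selects ArgMode = 2 and reads from reg_save_area +
  // fp_offset.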
   13311 
   13312   if (ArgMode == 2) {
   13313     // Sanity Check: Make sure using fp_offset makes sense.
   13314     assert(!DAG.getTarget().Options.UseSoftFloat &&
   13315            !(DAG.getMachineFunction()
   13316                 .getFunction()->getAttributes()
   13317                 .hasAttribute(AttributeSet::FunctionIndex,
   13318                               Attribute::NoImplicitFloat)) &&
   13319            Subtarget->hasSSE1());
   13320   }
   13321 
  // Insert a VAARG_64 node into the DAG.
  // VAARG_64 returns two values: the variable argument address and the chain.
   13324   SmallVector<SDValue, 11> InstOps;
   13325   InstOps.push_back(Chain);
   13326   InstOps.push_back(SrcPtr);
   13327   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
   13328   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
   13329   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
   13330   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
   13331   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
   13332                                           VTs, InstOps, MVT::i64,
   13333                                           MachinePointerInfo(SV),
   13334                                           /*Align=*/0,
   13335                                           /*Volatile=*/false,
   13336                                           /*ReadMem=*/true,
   13337                                           /*WriteMem=*/true);
   13338   Chain = VAARG.getValue(1);
   13339 
   13340   // Load the next argument and return it
   13341   return DAG.getLoad(ArgVT, dl,
   13342                      Chain,
   13343                      VAARG,
   13344                      MachinePointerInfo(),
   13345                      false, false, false, 0);
   13346 }
   13347 
   13348 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
   13349                            SelectionDAG &DAG) {
   13350   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
   13351   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
   13352   SDValue Chain = Op.getOperand(0);
   13353   SDValue DstPtr = Op.getOperand(1);
   13354   SDValue SrcPtr = Op.getOperand(2);
   13355   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   13356   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   13357   SDLoc DL(Op);
   13358 
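  // The constant 24 below is the size of the { i32, i32, i8*, i8* } va_list
  // struct (4 + 4 + 8 + 8 bytes), copied with 8-byte alignment.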
   13359   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
   13360                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
   13361                        false,
   13362                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
   13363 }
   13364 
   13365 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
   13366 // amount is a constant. Takes immediate version of shift as input.
   13367 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
   13368                                           SDValue SrcOp, uint64_t ShiftAmt,
   13369                                           SelectionDAG &DAG) {
   13370   MVT ElementType = VT.getVectorElementType();
   13371 
   13372   // Fold this packed shift into its first operand if ShiftAmt is 0.
   13373   if (ShiftAmt == 0)
   13374     return SrcOp;
   13375 
   13376   // Check for ShiftAmt >= element width
   13377   if (ShiftAmt >= ElementType.getSizeInBits()) {
   13378     if (Opc == X86ISD::VSRAI)
   13379       ShiftAmt = ElementType.getSizeInBits() - 1;
   13380     else
   13381       return DAG.getConstant(0, VT);
   13382   }
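  // For example, a v4i32 logical shift by 32 or more yields all zeroes, while
  // an arithmetic shift by 31 already replicates the sign bit into every bit,
  // so clamping VSRAI to the element width minus one preserves the result.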
   13383 
   13384   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
   13385          && "Unknown target vector shift-by-constant node");
   13386 
  // Fold this packed vector shift into a build vector if SrcOp is a
  // vector of Constants or UNDEFs and SrcOp's value type is the same as VT.
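  // For example, (VSHLI (build_vector <i32 1, 2, 3, 4>), 2) folds to
  // (build_vector <i32 4, 8, 12, 16>) and no shift instruction is emitted.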
   13389   if (VT == SrcOp.getSimpleValueType() &&
   13390       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
   13391     SmallVector<SDValue, 8> Elts;
   13392     unsigned NumElts = SrcOp->getNumOperands();
   13393     ConstantSDNode *ND;
   13394 
   13395     switch(Opc) {
   13396     default: llvm_unreachable(nullptr);
   13397     case X86ISD::VSHLI:
   13398       for (unsigned i=0; i!=NumElts; ++i) {
   13399         SDValue CurrentOp = SrcOp->getOperand(i);
   13400         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   13401           Elts.push_back(CurrentOp);
   13402           continue;
   13403         }
   13404         ND = cast<ConstantSDNode>(CurrentOp);
   13405         const APInt &C = ND->getAPIntValue();
   13406         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
   13407       }
   13408       break;
   13409     case X86ISD::VSRLI:
   13410       for (unsigned i=0; i!=NumElts; ++i) {
   13411         SDValue CurrentOp = SrcOp->getOperand(i);
   13412         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   13413           Elts.push_back(CurrentOp);
   13414           continue;
   13415         }
   13416         ND = cast<ConstantSDNode>(CurrentOp);
   13417         const APInt &C = ND->getAPIntValue();
   13418         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
   13419       }
   13420       break;
   13421     case X86ISD::VSRAI:
   13422       for (unsigned i=0; i!=NumElts; ++i) {
   13423         SDValue CurrentOp = SrcOp->getOperand(i);
   13424         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   13425           Elts.push_back(CurrentOp);
   13426           continue;
   13427         }
   13428         ND = cast<ConstantSDNode>(CurrentOp);
   13429         const APInt &C = ND->getAPIntValue();
   13430         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
   13431       }
   13432       break;
   13433     }
   13434 
   13435     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
   13436   }
   13437 
   13438   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
   13439 }
   13440 
   13441 // getTargetVShiftNode - Handle vector element shifts where the shift amount
   13442 // may or may not be a constant. Takes immediate version of shift as input.
   13443 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
   13444                                    SDValue SrcOp, SDValue ShAmt,
   13445                                    SelectionDAG &DAG) {
   13446   assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
   13447 
   13448   // Catch shift-by-constant.
   13449   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
   13450     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
   13451                                       CShAmt->getZExtValue(), DAG);
   13452 
   13453   // Change opcode to non-immediate version
   13454   switch (Opc) {
   13455     default: llvm_unreachable("Unknown target vector shift node");
   13456     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
   13457     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
   13458     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   13459   }
   13460 
  // Need to build a vector containing the shift amount.
  // The shift amount is 32 bits, but the SSE instructions read 64 bits, so
  // fill the upper 32 bits with zero.
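  // For example, a v8i16 VSRLI with a non-constant amount %amt becomes
  //   (X86ISD::VSRL %src, (bitcast (build_vector i32 %amt, 0, undef, undef)))
  // since PSRLW consumes the whole low 64 bits of its shift operand, so the
  // second i32 element must be zero.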
   13463   SDValue ShOps[4];
   13464   ShOps[0] = ShAmt;
   13465   ShOps[1] = DAG.getConstant(0, MVT::i32);
   13466   ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
   13467   ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
   13468 
   13469   // The return type has to be a 128-bit type with the same element
   13470   // type as the input type.
   13471   MVT EltVT = VT.getVectorElementType();
   13472   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
   13473 
   13474   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
   13475   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   13476 }
   13477 
   13478 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   13479   SDLoc dl(Op);
   13480   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   13481   switch (IntNo) {
   13482   default: return SDValue();    // Don't custom lower most intrinsics.
   13483   // Comparison intrinsics.
   13484   case Intrinsic::x86_sse_comieq_ss:
   13485   case Intrinsic::x86_sse_comilt_ss:
   13486   case Intrinsic::x86_sse_comile_ss:
   13487   case Intrinsic::x86_sse_comigt_ss:
   13488   case Intrinsic::x86_sse_comige_ss:
   13489   case Intrinsic::x86_sse_comineq_ss:
   13490   case Intrinsic::x86_sse_ucomieq_ss:
   13491   case Intrinsic::x86_sse_ucomilt_ss:
   13492   case Intrinsic::x86_sse_ucomile_ss:
   13493   case Intrinsic::x86_sse_ucomigt_ss:
   13494   case Intrinsic::x86_sse_ucomige_ss:
   13495   case Intrinsic::x86_sse_ucomineq_ss:
   13496   case Intrinsic::x86_sse2_comieq_sd:
   13497   case Intrinsic::x86_sse2_comilt_sd:
   13498   case Intrinsic::x86_sse2_comile_sd:
   13499   case Intrinsic::x86_sse2_comigt_sd:
   13500   case Intrinsic::x86_sse2_comige_sd:
   13501   case Intrinsic::x86_sse2_comineq_sd:
   13502   case Intrinsic::x86_sse2_ucomieq_sd:
   13503   case Intrinsic::x86_sse2_ucomilt_sd:
   13504   case Intrinsic::x86_sse2_ucomile_sd:
   13505   case Intrinsic::x86_sse2_ucomigt_sd:
   13506   case Intrinsic::x86_sse2_ucomige_sd:
   13507   case Intrinsic::x86_sse2_ucomineq_sd: {
   13508     unsigned Opc;
   13509     ISD::CondCode CC;
   13510     switch (IntNo) {
   13511     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   13512     case Intrinsic::x86_sse_comieq_ss:
   13513     case Intrinsic::x86_sse2_comieq_sd:
   13514       Opc = X86ISD::COMI;
   13515       CC = ISD::SETEQ;
   13516       break;
   13517     case Intrinsic::x86_sse_comilt_ss:
   13518     case Intrinsic::x86_sse2_comilt_sd:
   13519       Opc = X86ISD::COMI;
   13520       CC = ISD::SETLT;
   13521       break;
   13522     case Intrinsic::x86_sse_comile_ss:
   13523     case Intrinsic::x86_sse2_comile_sd:
   13524       Opc = X86ISD::COMI;
   13525       CC = ISD::SETLE;
   13526       break;
   13527     case Intrinsic::x86_sse_comigt_ss:
   13528     case Intrinsic::x86_sse2_comigt_sd:
   13529       Opc = X86ISD::COMI;
   13530       CC = ISD::SETGT;
   13531       break;
   13532     case Intrinsic::x86_sse_comige_ss:
   13533     case Intrinsic::x86_sse2_comige_sd:
   13534       Opc = X86ISD::COMI;
   13535       CC = ISD::SETGE;
   13536       break;
   13537     case Intrinsic::x86_sse_comineq_ss:
   13538     case Intrinsic::x86_sse2_comineq_sd:
   13539       Opc = X86ISD::COMI;
   13540       CC = ISD::SETNE;
   13541       break;
   13542     case Intrinsic::x86_sse_ucomieq_ss:
   13543     case Intrinsic::x86_sse2_ucomieq_sd:
   13544       Opc = X86ISD::UCOMI;
   13545       CC = ISD::SETEQ;
   13546       break;
   13547     case Intrinsic::x86_sse_ucomilt_ss:
   13548     case Intrinsic::x86_sse2_ucomilt_sd:
   13549       Opc = X86ISD::UCOMI;
   13550       CC = ISD::SETLT;
   13551       break;
   13552     case Intrinsic::x86_sse_ucomile_ss:
   13553     case Intrinsic::x86_sse2_ucomile_sd:
   13554       Opc = X86ISD::UCOMI;
   13555       CC = ISD::SETLE;
   13556       break;
   13557     case Intrinsic::x86_sse_ucomigt_ss:
   13558     case Intrinsic::x86_sse2_ucomigt_sd:
   13559       Opc = X86ISD::UCOMI;
   13560       CC = ISD::SETGT;
   13561       break;
   13562     case Intrinsic::x86_sse_ucomige_ss:
   13563     case Intrinsic::x86_sse2_ucomige_sd:
   13564       Opc = X86ISD::UCOMI;
   13565       CC = ISD::SETGE;
   13566       break;
   13567     case Intrinsic::x86_sse_ucomineq_ss:
   13568     case Intrinsic::x86_sse2_ucomineq_sd:
   13569       Opc = X86ISD::UCOMI;
   13570       CC = ISD::SETNE;
   13571       break;
   13572     }
   13573 
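    // For example, @llvm.x86.sse.comieq.ss(%a, %b) is lowered (illustratively)
    // to (zext i32 (X86ISD::SETCC E, (X86ISD::COMI %a, %b))), which selects
    // to COMISS followed by SETE.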
   13574     SDValue LHS = Op.getOperand(1);
   13575     SDValue RHS = Op.getOperand(2);
   13576     unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
   13577     assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
   13578     SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
   13579     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   13580                                 DAG.getConstant(X86CC, MVT::i8), Cond);
   13581     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   13582   }
   13583 
   13584   // Arithmetic intrinsics.
   13585   case Intrinsic::x86_sse2_pmulu_dq:
   13586   case Intrinsic::x86_avx2_pmulu_dq:
   13587     return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
   13588                        Op.getOperand(1), Op.getOperand(2));
   13589 
   13590   case Intrinsic::x86_sse41_pmuldq:
   13591   case Intrinsic::x86_avx2_pmul_dq:
   13592     return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
   13593                        Op.getOperand(1), Op.getOperand(2));
   13594 
   13595   case Intrinsic::x86_sse2_pmulhu_w:
   13596   case Intrinsic::x86_avx2_pmulhu_w:
   13597     return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
   13598                        Op.getOperand(1), Op.getOperand(2));
   13599 
   13600   case Intrinsic::x86_sse2_pmulh_w:
   13601   case Intrinsic::x86_avx2_pmulh_w:
   13602     return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
   13603                        Op.getOperand(1), Op.getOperand(2));
   13604 
   13605   // SSE2/AVX2 sub with unsigned saturation intrinsics
   13606   case Intrinsic::x86_sse2_psubus_b:
   13607   case Intrinsic::x86_sse2_psubus_w:
   13608   case Intrinsic::x86_avx2_psubus_b:
   13609   case Intrinsic::x86_avx2_psubus_w:
   13610     return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
   13611                        Op.getOperand(1), Op.getOperand(2));
   13612 
   13613   // SSE3/AVX horizontal add/sub intrinsics
   13614   case Intrinsic::x86_sse3_hadd_ps:
   13615   case Intrinsic::x86_sse3_hadd_pd:
   13616   case Intrinsic::x86_avx_hadd_ps_256:
   13617   case Intrinsic::x86_avx_hadd_pd_256:
   13618   case Intrinsic::x86_sse3_hsub_ps:
   13619   case Intrinsic::x86_sse3_hsub_pd:
   13620   case Intrinsic::x86_avx_hsub_ps_256:
   13621   case Intrinsic::x86_avx_hsub_pd_256:
   13622   case Intrinsic::x86_ssse3_phadd_w_128:
   13623   case Intrinsic::x86_ssse3_phadd_d_128:
   13624   case Intrinsic::x86_avx2_phadd_w:
   13625   case Intrinsic::x86_avx2_phadd_d:
   13626   case Intrinsic::x86_ssse3_phsub_w_128:
   13627   case Intrinsic::x86_ssse3_phsub_d_128:
   13628   case Intrinsic::x86_avx2_phsub_w:
   13629   case Intrinsic::x86_avx2_phsub_d: {
   13630     unsigned Opcode;
   13631     switch (IntNo) {
   13632     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   13633     case Intrinsic::x86_sse3_hadd_ps:
   13634     case Intrinsic::x86_sse3_hadd_pd:
   13635     case Intrinsic::x86_avx_hadd_ps_256:
   13636     case Intrinsic::x86_avx_hadd_pd_256:
   13637       Opcode = X86ISD::FHADD;
   13638       break;
   13639     case Intrinsic::x86_sse3_hsub_ps:
   13640     case Intrinsic::x86_sse3_hsub_pd:
   13641     case Intrinsic::x86_avx_hsub_ps_256:
   13642     case Intrinsic::x86_avx_hsub_pd_256:
   13643       Opcode = X86ISD::FHSUB;
   13644       break;
   13645     case Intrinsic::x86_ssse3_phadd_w_128:
   13646     case Intrinsic::x86_ssse3_phadd_d_128:
   13647     case Intrinsic::x86_avx2_phadd_w:
   13648     case Intrinsic::x86_avx2_phadd_d:
   13649       Opcode = X86ISD::HADD;
   13650       break;
   13651     case Intrinsic::x86_ssse3_phsub_w_128:
   13652     case Intrinsic::x86_ssse3_phsub_d_128:
   13653     case Intrinsic::x86_avx2_phsub_w:
   13654     case Intrinsic::x86_avx2_phsub_d:
   13655       Opcode = X86ISD::HSUB;
   13656       break;
   13657     }
   13658     return DAG.getNode(Opcode, dl, Op.getValueType(),
   13659                        Op.getOperand(1), Op.getOperand(2));
   13660   }
   13661 
   13662   // SSE2/SSE41/AVX2 integer max/min intrinsics.
   13663   case Intrinsic::x86_sse2_pmaxu_b:
   13664   case Intrinsic::x86_sse41_pmaxuw:
   13665   case Intrinsic::x86_sse41_pmaxud:
   13666   case Intrinsic::x86_avx2_pmaxu_b:
   13667   case Intrinsic::x86_avx2_pmaxu_w:
   13668   case Intrinsic::x86_avx2_pmaxu_d:
   13669   case Intrinsic::x86_sse2_pminu_b:
   13670   case Intrinsic::x86_sse41_pminuw:
   13671   case Intrinsic::x86_sse41_pminud:
   13672   case Intrinsic::x86_avx2_pminu_b:
   13673   case Intrinsic::x86_avx2_pminu_w:
   13674   case Intrinsic::x86_avx2_pminu_d:
   13675   case Intrinsic::x86_sse41_pmaxsb:
   13676   case Intrinsic::x86_sse2_pmaxs_w:
   13677   case Intrinsic::x86_sse41_pmaxsd:
   13678   case Intrinsic::x86_avx2_pmaxs_b:
   13679   case Intrinsic::x86_avx2_pmaxs_w:
   13680   case Intrinsic::x86_avx2_pmaxs_d:
   13681   case Intrinsic::x86_sse41_pminsb:
   13682   case Intrinsic::x86_sse2_pmins_w:
   13683   case Intrinsic::x86_sse41_pminsd:
   13684   case Intrinsic::x86_avx2_pmins_b:
   13685   case Intrinsic::x86_avx2_pmins_w:
   13686   case Intrinsic::x86_avx2_pmins_d: {
   13687     unsigned Opcode;
   13688     switch (IntNo) {
   13689     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   13690     case Intrinsic::x86_sse2_pmaxu_b:
   13691     case Intrinsic::x86_sse41_pmaxuw:
   13692     case Intrinsic::x86_sse41_pmaxud:
   13693     case Intrinsic::x86_avx2_pmaxu_b:
   13694     case Intrinsic::x86_avx2_pmaxu_w:
   13695     case Intrinsic::x86_avx2_pmaxu_d:
   13696       Opcode = X86ISD::UMAX;
   13697       break;
   13698     case Intrinsic::x86_sse2_pminu_b:
   13699     case Intrinsic::x86_sse41_pminuw:
   13700     case Intrinsic::x86_sse41_pminud:
   13701     case Intrinsic::x86_avx2_pminu_b:
   13702     case Intrinsic::x86_avx2_pminu_w:
   13703     case Intrinsic::x86_avx2_pminu_d:
   13704       Opcode = X86ISD::UMIN;
   13705       break;
   13706     case Intrinsic::x86_sse41_pmaxsb:
   13707     case Intrinsic::x86_sse2_pmaxs_w:
   13708     case Intrinsic::x86_sse41_pmaxsd:
   13709     case Intrinsic::x86_avx2_pmaxs_b:
   13710     case Intrinsic::x86_avx2_pmaxs_w:
   13711     case Intrinsic::x86_avx2_pmaxs_d:
   13712       Opcode = X86ISD::SMAX;
   13713       break;
   13714     case Intrinsic::x86_sse41_pminsb:
   13715     case Intrinsic::x86_sse2_pmins_w:
   13716     case Intrinsic::x86_sse41_pminsd:
   13717     case Intrinsic::x86_avx2_pmins_b:
   13718     case Intrinsic::x86_avx2_pmins_w:
   13719     case Intrinsic::x86_avx2_pmins_d:
   13720       Opcode = X86ISD::SMIN;
   13721       break;
   13722     }
   13723     return DAG.getNode(Opcode, dl, Op.getValueType(),
   13724                        Op.getOperand(1), Op.getOperand(2));
   13725   }
   13726 
   13727   // SSE/SSE2/AVX floating point max/min intrinsics.
   13728   case Intrinsic::x86_sse_max_ps:
   13729   case Intrinsic::x86_sse2_max_pd:
   13730   case Intrinsic::x86_avx_max_ps_256:
   13731   case Intrinsic::x86_avx_max_pd_256:
   13732   case Intrinsic::x86_sse_min_ps:
   13733   case Intrinsic::x86_sse2_min_pd:
   13734   case Intrinsic::x86_avx_min_ps_256:
   13735   case Intrinsic::x86_avx_min_pd_256: {
   13736     unsigned Opcode;
   13737     switch (IntNo) {
   13738     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   13739     case Intrinsic::x86_sse_max_ps:
   13740     case Intrinsic::x86_sse2_max_pd:
   13741     case Intrinsic::x86_avx_max_ps_256:
   13742     case Intrinsic::x86_avx_max_pd_256:
   13743       Opcode = X86ISD::FMAX;
   13744       break;
   13745     case Intrinsic::x86_sse_min_ps:
   13746     case Intrinsic::x86_sse2_min_pd:
   13747     case Intrinsic::x86_avx_min_ps_256:
   13748     case Intrinsic::x86_avx_min_pd_256:
   13749       Opcode = X86ISD::FMIN;
   13750       break;
   13751     }
   13752     return DAG.getNode(Opcode, dl, Op.getValueType(),
   13753                        Op.getOperand(1), Op.getOperand(2));
   13754   }
   13755 
   13756   // AVX2 variable shift intrinsics
   13757   case Intrinsic::x86_avx2_psllv_d:
   13758   case Intrinsic::x86_avx2_psllv_q:
   13759   case Intrinsic::x86_avx2_psllv_d_256:
   13760   case Intrinsic::x86_avx2_psllv_q_256:
   13761   case Intrinsic::x86_avx2_psrlv_d:
   13762   case Intrinsic::x86_avx2_psrlv_q:
   13763   case Intrinsic::x86_avx2_psrlv_d_256:
   13764   case Intrinsic::x86_avx2_psrlv_q_256:
   13765   case Intrinsic::x86_avx2_psrav_d:
   13766   case Intrinsic::x86_avx2_psrav_d_256: {
   13767     unsigned Opcode;
   13768     switch (IntNo) {
   13769     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   13770     case Intrinsic::x86_avx2_psllv_d:
   13771     case Intrinsic::x86_avx2_psllv_q:
   13772     case Intrinsic::x86_avx2_psllv_d_256:
   13773     case Intrinsic::x86_avx2_psllv_q_256:
   13774       Opcode = ISD::SHL;
   13775       break;
   13776     case Intrinsic::x86_avx2_psrlv_d:
   13777     case Intrinsic::x86_avx2_psrlv_q:
   13778     case Intrinsic::x86_avx2_psrlv_d_256:
   13779     case Intrinsic::x86_avx2_psrlv_q_256:
   13780       Opcode = ISD::SRL;
   13781       break;
   13782     case Intrinsic::x86_avx2_psrav_d:
   13783     case Intrinsic::x86_avx2_psrav_d_256:
   13784       Opcode = ISD::SRA;
   13785       break;
   13786     }
   13787     return DAG.getNode(Opcode, dl, Op.getValueType(),
   13788                        Op.getOperand(1), Op.getOperand(2));
   13789   }
   13790 
   13791   case Intrinsic::x86_sse2_packssdw_128:
   13792   case Intrinsic::x86_sse2_packsswb_128:
   13793   case Intrinsic::x86_avx2_packssdw:
   13794   case Intrinsic::x86_avx2_packsswb:
   13795     return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(),
   13796                        Op.getOperand(1), Op.getOperand(2));
   13797 
   13798   case Intrinsic::x86_sse2_packuswb_128:
   13799   case Intrinsic::x86_sse41_packusdw:
   13800   case Intrinsic::x86_avx2_packuswb:
   13801   case Intrinsic::x86_avx2_packusdw:
   13802     return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(),
   13803                        Op.getOperand(1), Op.getOperand(2));
   13804 
   13805   case Intrinsic::x86_ssse3_pshuf_b_128:
   13806   case Intrinsic::x86_avx2_pshuf_b:
   13807     return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
   13808                        Op.getOperand(1), Op.getOperand(2));
   13809 
   13810   case Intrinsic::x86_sse2_pshuf_d:
   13811     return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(),
   13812                        Op.getOperand(1), Op.getOperand(2));
   13813 
   13814   case Intrinsic::x86_sse2_pshufl_w:
   13815     return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(),
   13816                        Op.getOperand(1), Op.getOperand(2));
   13817 
   13818   case Intrinsic::x86_sse2_pshufh_w:
   13819     return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(),
   13820                        Op.getOperand(1), Op.getOperand(2));
   13821 
   13822   case Intrinsic::x86_ssse3_psign_b_128:
   13823   case Intrinsic::x86_ssse3_psign_w_128:
   13824   case Intrinsic::x86_ssse3_psign_d_128:
   13825   case Intrinsic::x86_avx2_psign_b:
   13826   case Intrinsic::x86_avx2_psign_w:
   13827   case Intrinsic::x86_avx2_psign_d:
   13828     return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
   13829                        Op.getOperand(1), Op.getOperand(2));
   13830 
   13831   case Intrinsic::x86_sse41_insertps:
   13832     return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
   13833                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   13834 
   13835   case Intrinsic::x86_avx_vperm2f128_ps_256:
   13836   case Intrinsic::x86_avx_vperm2f128_pd_256:
   13837   case Intrinsic::x86_avx_vperm2f128_si_256:
   13838   case Intrinsic::x86_avx2_vperm2i128:
   13839     return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
   13840                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   13841 
   13842   case Intrinsic::x86_avx2_permd:
   13843   case Intrinsic::x86_avx2_permps:
   13844     // Operands intentionally swapped. Mask is last operand to intrinsic,
   13845     // but second operand for node/instruction.
   13846     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
   13847                        Op.getOperand(2), Op.getOperand(1));
   13848 
   13849   case Intrinsic::x86_sse_sqrt_ps:
   13850   case Intrinsic::x86_sse2_sqrt_pd:
   13851   case Intrinsic::x86_avx_sqrt_ps_256:
   13852   case Intrinsic::x86_avx_sqrt_pd_256:
   13853     return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
   13854 
  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value and not just set flags, so lower them to the ptest
  // or testp pattern plus a setcc for the result.
   13858   case Intrinsic::x86_sse41_ptestz:
   13859   case Intrinsic::x86_sse41_ptestc:
   13860   case Intrinsic::x86_sse41_ptestnzc:
   13861   case Intrinsic::x86_avx_ptestz_256:
   13862   case Intrinsic::x86_avx_ptestc_256:
   13863   case Intrinsic::x86_avx_ptestnzc_256:
   13864   case Intrinsic::x86_avx_vtestz_ps:
   13865   case Intrinsic::x86_avx_vtestc_ps:
   13866   case Intrinsic::x86_avx_vtestnzc_ps:
   13867   case Intrinsic::x86_avx_vtestz_pd:
   13868   case Intrinsic::x86_avx_vtestc_pd:
   13869   case Intrinsic::x86_avx_vtestnzc_pd:
   13870   case Intrinsic::x86_avx_vtestz_ps_256:
   13871   case Intrinsic::x86_avx_vtestc_ps_256:
   13872   case Intrinsic::x86_avx_vtestnzc_ps_256:
   13873   case Intrinsic::x86_avx_vtestz_pd_256:
   13874   case Intrinsic::x86_avx_vtestc_pd_256:
   13875   case Intrinsic::x86_avx_vtestnzc_pd_256: {
   13876     bool IsTestPacked = false;
   13877     unsigned X86CC;
   13878     switch (IntNo) {
   13879     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
   13880     case Intrinsic::x86_avx_vtestz_ps:
   13881     case Intrinsic::x86_avx_vtestz_pd:
   13882     case Intrinsic::x86_avx_vtestz_ps_256:
   13883     case Intrinsic::x86_avx_vtestz_pd_256:
   13884       IsTestPacked = true; // Fallthrough
   13885     case Intrinsic::x86_sse41_ptestz:
   13886     case Intrinsic::x86_avx_ptestz_256:
   13887       // ZF = 1
   13888       X86CC = X86::COND_E;
   13889       break;
   13890     case Intrinsic::x86_avx_vtestc_ps:
   13891     case Intrinsic::x86_avx_vtestc_pd:
   13892     case Intrinsic::x86_avx_vtestc_ps_256:
   13893     case Intrinsic::x86_avx_vtestc_pd_256:
   13894       IsTestPacked = true; // Fallthrough
   13895     case Intrinsic::x86_sse41_ptestc:
   13896     case Intrinsic::x86_avx_ptestc_256:
   13897       // CF = 1
   13898       X86CC = X86::COND_B;
   13899       break;
   13900     case Intrinsic::x86_avx_vtestnzc_ps:
   13901     case Intrinsic::x86_avx_vtestnzc_pd:
   13902     case Intrinsic::x86_avx_vtestnzc_ps_256:
   13903     case Intrinsic::x86_avx_vtestnzc_pd_256:
   13904       IsTestPacked = true; // Fallthrough
   13905     case Intrinsic::x86_sse41_ptestnzc:
   13906     case Intrinsic::x86_avx_ptestnzc_256:
   13907       // ZF and CF = 0
   13908       X86CC = X86::COND_A;
   13909       break;
   13910     }
   13911 
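    // For example, @llvm.x86.sse41.ptestz(%a, %b) lowers to PTEST followed by
    // SETE: PTEST sets ZF iff (%a & %b) is all zeroes, and the i8 setcc
    // result is zero-extended to the i32 the intrinsic returns.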
   13912     SDValue LHS = Op.getOperand(1);
   13913     SDValue RHS = Op.getOperand(2);
   13914     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
   13915     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
   13916     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
   13917     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
   13918     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   13919   }
   13920   case Intrinsic::x86_avx512_kortestz_w:
   13921   case Intrinsic::x86_avx512_kortestc_w: {
    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E
                                                                 : X86::COND_B;
   13923     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
   13924     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
   13925     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
   13926     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
   13927     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
   13928     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   13929   }
   13930 
   13931   // SSE/AVX shift intrinsics
   13932   case Intrinsic::x86_sse2_psll_w:
   13933   case Intrinsic::x86_sse2_psll_d:
   13934   case Intrinsic::x86_sse2_psll_q:
   13935   case Intrinsic::x86_avx2_psll_w:
   13936   case Intrinsic::x86_avx2_psll_d:
   13937   case Intrinsic::x86_avx2_psll_q:
   13938   case Intrinsic::x86_sse2_psrl_w:
   13939   case Intrinsic::x86_sse2_psrl_d:
   13940   case Intrinsic::x86_sse2_psrl_q:
   13941   case Intrinsic::x86_avx2_psrl_w:
   13942   case Intrinsic::x86_avx2_psrl_d:
   13943   case Intrinsic::x86_avx2_psrl_q:
   13944   case Intrinsic::x86_sse2_psra_w:
   13945   case Intrinsic::x86_sse2_psra_d:
   13946   case Intrinsic::x86_avx2_psra_w:
   13947   case Intrinsic::x86_avx2_psra_d: {
   13948     unsigned Opcode;
   13949     switch (IntNo) {
   13950     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   13951     case Intrinsic::x86_sse2_psll_w:
   13952     case Intrinsic::x86_sse2_psll_d:
   13953     case Intrinsic::x86_sse2_psll_q:
   13954     case Intrinsic::x86_avx2_psll_w:
   13955     case Intrinsic::x86_avx2_psll_d:
   13956     case Intrinsic::x86_avx2_psll_q:
   13957       Opcode = X86ISD::VSHL;
   13958       break;
   13959     case Intrinsic::x86_sse2_psrl_w:
   13960     case Intrinsic::x86_sse2_psrl_d:
   13961     case Intrinsic::x86_sse2_psrl_q:
   13962     case Intrinsic::x86_avx2_psrl_w:
   13963     case Intrinsic::x86_avx2_psrl_d:
   13964     case Intrinsic::x86_avx2_psrl_q:
   13965       Opcode = X86ISD::VSRL;
   13966       break;
   13967     case Intrinsic::x86_sse2_psra_w:
   13968     case Intrinsic::x86_sse2_psra_d:
   13969     case Intrinsic::x86_avx2_psra_w:
   13970     case Intrinsic::x86_avx2_psra_d:
   13971       Opcode = X86ISD::VSRA;
   13972       break;
   13973     }
   13974     return DAG.getNode(Opcode, dl, Op.getValueType(),
   13975                        Op.getOperand(1), Op.getOperand(2));
   13976   }
   13977 
   13978   // SSE/AVX immediate shift intrinsics
   13979   case Intrinsic::x86_sse2_pslli_w:
   13980   case Intrinsic::x86_sse2_pslli_d:
   13981   case Intrinsic::x86_sse2_pslli_q:
   13982   case Intrinsic::x86_avx2_pslli_w:
   13983   case Intrinsic::x86_avx2_pslli_d:
   13984   case Intrinsic::x86_avx2_pslli_q:
   13985   case Intrinsic::x86_sse2_psrli_w:
   13986   case Intrinsic::x86_sse2_psrli_d:
   13987   case Intrinsic::x86_sse2_psrli_q:
   13988   case Intrinsic::x86_avx2_psrli_w:
   13989   case Intrinsic::x86_avx2_psrli_d:
   13990   case Intrinsic::x86_avx2_psrli_q:
   13991   case Intrinsic::x86_sse2_psrai_w:
   13992   case Intrinsic::x86_sse2_psrai_d:
   13993   case Intrinsic::x86_avx2_psrai_w:
   13994   case Intrinsic::x86_avx2_psrai_d: {
   13995     unsigned Opcode;
   13996     switch (IntNo) {
   13997     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   13998     case Intrinsic::x86_sse2_pslli_w:
   13999     case Intrinsic::x86_sse2_pslli_d:
   14000     case Intrinsic::x86_sse2_pslli_q:
   14001     case Intrinsic::x86_avx2_pslli_w:
   14002     case Intrinsic::x86_avx2_pslli_d:
   14003     case Intrinsic::x86_avx2_pslli_q:
   14004       Opcode = X86ISD::VSHLI;
   14005       break;
   14006     case Intrinsic::x86_sse2_psrli_w:
   14007     case Intrinsic::x86_sse2_psrli_d:
   14008     case Intrinsic::x86_sse2_psrli_q:
   14009     case Intrinsic::x86_avx2_psrli_w:
   14010     case Intrinsic::x86_avx2_psrli_d:
   14011     case Intrinsic::x86_avx2_psrli_q:
   14012       Opcode = X86ISD::VSRLI;
   14013       break;
   14014     case Intrinsic::x86_sse2_psrai_w:
   14015     case Intrinsic::x86_sse2_psrai_d:
   14016     case Intrinsic::x86_avx2_psrai_w:
   14017     case Intrinsic::x86_avx2_psrai_d:
   14018       Opcode = X86ISD::VSRAI;
   14019       break;
   14020     }
   14021     return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(),
   14022                                Op.getOperand(1), Op.getOperand(2), DAG);
   14023   }
   14024 
   14025   case Intrinsic::x86_sse42_pcmpistria128:
   14026   case Intrinsic::x86_sse42_pcmpestria128:
   14027   case Intrinsic::x86_sse42_pcmpistric128:
   14028   case Intrinsic::x86_sse42_pcmpestric128:
   14029   case Intrinsic::x86_sse42_pcmpistrio128:
   14030   case Intrinsic::x86_sse42_pcmpestrio128:
   14031   case Intrinsic::x86_sse42_pcmpistris128:
   14032   case Intrinsic::x86_sse42_pcmpestris128:
   14033   case Intrinsic::x86_sse42_pcmpistriz128:
   14034   case Intrinsic::x86_sse42_pcmpestriz128: {
   14035     unsigned Opcode;
   14036     unsigned X86CC;
   14037     switch (IntNo) {
   14038     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   14039     case Intrinsic::x86_sse42_pcmpistria128:
   14040       Opcode = X86ISD::PCMPISTRI;
   14041       X86CC = X86::COND_A;
   14042       break;
   14043     case Intrinsic::x86_sse42_pcmpestria128:
   14044       Opcode = X86ISD::PCMPESTRI;
   14045       X86CC = X86::COND_A;
   14046       break;
   14047     case Intrinsic::x86_sse42_pcmpistric128:
   14048       Opcode = X86ISD::PCMPISTRI;
   14049       X86CC = X86::COND_B;
   14050       break;
   14051     case Intrinsic::x86_sse42_pcmpestric128:
   14052       Opcode = X86ISD::PCMPESTRI;
   14053       X86CC = X86::COND_B;
   14054       break;
   14055     case Intrinsic::x86_sse42_pcmpistrio128:
   14056       Opcode = X86ISD::PCMPISTRI;
   14057       X86CC = X86::COND_O;
   14058       break;
   14059     case Intrinsic::x86_sse42_pcmpestrio128:
   14060       Opcode = X86ISD::PCMPESTRI;
   14061       X86CC = X86::COND_O;
   14062       break;
   14063     case Intrinsic::x86_sse42_pcmpistris128:
   14064       Opcode = X86ISD::PCMPISTRI;
   14065       X86CC = X86::COND_S;
   14066       break;
   14067     case Intrinsic::x86_sse42_pcmpestris128:
   14068       Opcode = X86ISD::PCMPESTRI;
   14069       X86CC = X86::COND_S;
   14070       break;
   14071     case Intrinsic::x86_sse42_pcmpistriz128:
   14072       Opcode = X86ISD::PCMPISTRI;
   14073       X86CC = X86::COND_E;
   14074       break;
   14075     case Intrinsic::x86_sse42_pcmpestriz128:
   14076       Opcode = X86ISD::PCMPESTRI;
   14077       X86CC = X86::COND_E;
   14078       break;
   14079     }
   14080     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   14081     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   14082     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
   14083     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   14084                                 DAG.getConstant(X86CC, MVT::i8),
   14085                                 SDValue(PCMP.getNode(), 1));
   14086     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   14087   }
   14088 
   14089   case Intrinsic::x86_sse42_pcmpistri128:
   14090   case Intrinsic::x86_sse42_pcmpestri128: {
   14091     unsigned Opcode;
   14092     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
   14093       Opcode = X86ISD::PCMPISTRI;
   14094     else
   14095       Opcode = X86ISD::PCMPESTRI;
   14096 
   14097     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   14098     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   14099     return DAG.getNode(Opcode, dl, VTs, NewOps);
   14100   }
   14101   case Intrinsic::x86_fma_vfmadd_ps:
   14102   case Intrinsic::x86_fma_vfmadd_pd:
   14103   case Intrinsic::x86_fma_vfmsub_ps:
   14104   case Intrinsic::x86_fma_vfmsub_pd:
   14105   case Intrinsic::x86_fma_vfnmadd_ps:
   14106   case Intrinsic::x86_fma_vfnmadd_pd:
   14107   case Intrinsic::x86_fma_vfnmsub_ps:
   14108   case Intrinsic::x86_fma_vfnmsub_pd:
   14109   case Intrinsic::x86_fma_vfmaddsub_ps:
   14110   case Intrinsic::x86_fma_vfmaddsub_pd:
   14111   case Intrinsic::x86_fma_vfmsubadd_ps:
   14112   case Intrinsic::x86_fma_vfmsubadd_pd:
   14113   case Intrinsic::x86_fma_vfmadd_ps_256:
   14114   case Intrinsic::x86_fma_vfmadd_pd_256:
   14115   case Intrinsic::x86_fma_vfmsub_ps_256:
   14116   case Intrinsic::x86_fma_vfmsub_pd_256:
   14117   case Intrinsic::x86_fma_vfnmadd_ps_256:
   14118   case Intrinsic::x86_fma_vfnmadd_pd_256:
   14119   case Intrinsic::x86_fma_vfnmsub_ps_256:
   14120   case Intrinsic::x86_fma_vfnmsub_pd_256:
   14121   case Intrinsic::x86_fma_vfmaddsub_ps_256:
   14122   case Intrinsic::x86_fma_vfmaddsub_pd_256:
   14123   case Intrinsic::x86_fma_vfmsubadd_ps_256:
   14124   case Intrinsic::x86_fma_vfmsubadd_pd_256:
   14125   case Intrinsic::x86_fma_vfmadd_ps_512:
   14126   case Intrinsic::x86_fma_vfmadd_pd_512:
   14127   case Intrinsic::x86_fma_vfmsub_ps_512:
   14128   case Intrinsic::x86_fma_vfmsub_pd_512:
   14129   case Intrinsic::x86_fma_vfnmadd_ps_512:
   14130   case Intrinsic::x86_fma_vfnmadd_pd_512:
   14131   case Intrinsic::x86_fma_vfnmsub_ps_512:
   14132   case Intrinsic::x86_fma_vfnmsub_pd_512:
   14133   case Intrinsic::x86_fma_vfmaddsub_ps_512:
   14134   case Intrinsic::x86_fma_vfmaddsub_pd_512:
   14135   case Intrinsic::x86_fma_vfmsubadd_ps_512:
   14136   case Intrinsic::x86_fma_vfmsubadd_pd_512: {
   14137     unsigned Opc;
   14138     switch (IntNo) {
   14139     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   14140     case Intrinsic::x86_fma_vfmadd_ps:
   14141     case Intrinsic::x86_fma_vfmadd_pd:
   14142     case Intrinsic::x86_fma_vfmadd_ps_256:
   14143     case Intrinsic::x86_fma_vfmadd_pd_256:
   14144     case Intrinsic::x86_fma_vfmadd_ps_512:
   14145     case Intrinsic::x86_fma_vfmadd_pd_512:
   14146       Opc = X86ISD::FMADD;
   14147       break;
   14148     case Intrinsic::x86_fma_vfmsub_ps:
   14149     case Intrinsic::x86_fma_vfmsub_pd:
   14150     case Intrinsic::x86_fma_vfmsub_ps_256:
   14151     case Intrinsic::x86_fma_vfmsub_pd_256:
   14152     case Intrinsic::x86_fma_vfmsub_ps_512:
   14153     case Intrinsic::x86_fma_vfmsub_pd_512:
   14154       Opc = X86ISD::FMSUB;
   14155       break;
   14156     case Intrinsic::x86_fma_vfnmadd_ps:
   14157     case Intrinsic::x86_fma_vfnmadd_pd:
   14158     case Intrinsic::x86_fma_vfnmadd_ps_256:
   14159     case Intrinsic::x86_fma_vfnmadd_pd_256:
   14160     case Intrinsic::x86_fma_vfnmadd_ps_512:
   14161     case Intrinsic::x86_fma_vfnmadd_pd_512:
   14162       Opc = X86ISD::FNMADD;
   14163       break;
   14164     case Intrinsic::x86_fma_vfnmsub_ps:
   14165     case Intrinsic::x86_fma_vfnmsub_pd:
   14166     case Intrinsic::x86_fma_vfnmsub_ps_256:
   14167     case Intrinsic::x86_fma_vfnmsub_pd_256:
   14168     case Intrinsic::x86_fma_vfnmsub_ps_512:
   14169     case Intrinsic::x86_fma_vfnmsub_pd_512:
   14170       Opc = X86ISD::FNMSUB;
   14171       break;
   14172     case Intrinsic::x86_fma_vfmaddsub_ps:
   14173     case Intrinsic::x86_fma_vfmaddsub_pd:
   14174     case Intrinsic::x86_fma_vfmaddsub_ps_256:
   14175     case Intrinsic::x86_fma_vfmaddsub_pd_256:
   14176     case Intrinsic::x86_fma_vfmaddsub_ps_512:
   14177     case Intrinsic::x86_fma_vfmaddsub_pd_512:
   14178       Opc = X86ISD::FMADDSUB;
   14179       break;
   14180     case Intrinsic::x86_fma_vfmsubadd_ps:
   14181     case Intrinsic::x86_fma_vfmsubadd_pd:
   14182     case Intrinsic::x86_fma_vfmsubadd_ps_256:
   14183     case Intrinsic::x86_fma_vfmsubadd_pd_256:
   14184     case Intrinsic::x86_fma_vfmsubadd_ps_512:
   14185     case Intrinsic::x86_fma_vfmsubadd_pd_512:
   14186       Opc = X86ISD::FMSUBADD;
   14187       break;
   14188     }
   14189 
   14190     return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
   14191                        Op.getOperand(2), Op.getOperand(3));
   14192   }
   14193   }
   14194 }
   14195 
   14196 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   14197                               SDValue Src, SDValue Mask, SDValue Base,
   14198                               SDValue Index, SDValue ScaleOp, SDValue Chain,
   14199                               const X86Subtarget * Subtarget) {
   14200   SDLoc dl(Op);
   14201   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
   14202   assert(C && "Invalid scale type");
   14203   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
   14204   EVT MaskVT = MVT::getVectorVT(MVT::i1,
   14205                              Index.getSimpleValueType().getVectorNumElements());
   14206   SDValue MaskInReg;
   14207   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   14208   if (MaskC)
   14209     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
   14210   else
   14211     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
   14212   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   14213   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
   14214   SDValue Segment = DAG.getRegister(0, MVT::i32);
   14215   if (Src.getOpcode() == ISD::UNDEF)
   14216     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
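  // Base, Scale, Index, Disp and Segment below mirror the standard five-part
  // x86 memory operand, so the machine node can be selected directly against
  // the gather instruction's addressing mode.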
   14217   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   14218   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   14219   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
   14220   return DAG.getMergeValues(RetOps, dl);
   14221 }
   14222 
   14223 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   14224                                SDValue Src, SDValue Mask, SDValue Base,
   14225                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
   14226   SDLoc dl(Op);
   14227   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
   14228   assert(C && "Invalid scale type");
   14229   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
   14230   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
   14231   SDValue Segment = DAG.getRegister(0, MVT::i32);
   14232   EVT MaskVT = MVT::getVectorVT(MVT::i1,
   14233                              Index.getSimpleValueType().getVectorNumElements());
   14234   SDValue MaskInReg;
   14235   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   14236   if (MaskC)
   14237     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
   14238   else
   14239     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
   14240   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   14241   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
   14242   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   14243   return SDValue(Res, 1);
   14244 }
   14245 
   14246 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   14247                                SDValue Mask, SDValue Base, SDValue Index,
   14248                                SDValue ScaleOp, SDValue Chain) {
   14249   SDLoc dl(Op);
   14250   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
   14251   assert(C && "Invalid scale type");
   14252   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
   14253   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
   14254   SDValue Segment = DAG.getRegister(0, MVT::i32);
   14255   EVT MaskVT =
   14256     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
   14257   SDValue MaskInReg;
   14258   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   14259   if (MaskC)
   14260     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
   14261   else
   14262     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
   14264   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   14265   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
   14266   return SDValue(Res, 0);
   14267 }
   14268 
   14269 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
   14270 // read performance monitor counters (x86_rdpmc).
   14271 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
   14272                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
   14273                               SmallVectorImpl<SDValue> &Results) {
   14274   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   14275   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   14276   SDValue LO, HI;
   14277 
   14278   // The ECX register is used to select the index of the performance counter
   14279   // to read.
   14280   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
   14281                                    N->getOperand(2));
   14282   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
   14283 
   14284   // Reads the content of a 64-bit performance counter and returns it in the
   14285   // registers EDX:EAX.
   14286   if (Subtarget->is64Bit()) {
   14287     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   14288     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   14289                             LO.getValue(2));
   14290   } else {
   14291     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   14292     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   14293                             LO.getValue(2));
   14294   }
   14295   Chain = HI.getValue(1);
   14296 
   14297   if (Subtarget->is64Bit()) {
   14298     // The EAX register is loaded with the low-order 32 bits of the counter;
   14299     // EDX receives whatever high-order bits the processor implements.
   14300     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   14301                               DAG.getConstant(32, MVT::i8));
   14302     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   14303     Results.push_back(Chain);
   14304     return;
   14305   }
   14306 
   14307   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   14308   SDValue Ops[] = { LO, HI };
   14309   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   14310   Results.push_back(Pair);
   14311   Results.push_back(Chain);
   14312 }
   14313 
   14314 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
   14315 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
   14316 // also used to custom lower READCYCLECOUNTER nodes.
   14317 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
   14318                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
   14319                               SmallVectorImpl<SDValue> &Results) {
   14320   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   14321   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
   14322   SDValue LO, HI;
   14323 
   14324   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
   14325   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
   14326   // and the EAX register is loaded with the low-order 32 bits.
   14327   if (Subtarget->is64Bit()) {
   14328     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   14329     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   14330                             LO.getValue(2));
   14331   } else {
   14332     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   14333     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   14334                             LO.getValue(2));
   14335   }
   14336   SDValue Chain = HI.getValue(1);
   14337 
   14338   if (Opcode == X86ISD::RDTSCP_DAG) {
   14339     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   14340 
   14341     // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (at address
   14342     // C000_0103H) into the ECX register. Add ECX explicitly to the chain.
   14343     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
   14344                                      HI.getValue(2));
   14345     // Explicitly store the content of ECX at the pointer operand passed to
   14346     // the 'rdtscp' intrinsic.
   14347     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
   14348                          MachinePointerInfo(), false, false, 0);
   14349   }
   14350 
   14351   if (Subtarget->is64Bit()) {
   14352     // The EDX register is loaded with the high-order 32 bits of the MSR, and
   14353     // the EAX register is loaded with the low-order 32 bits.
   14354     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   14355                               DAG.getConstant(32, MVT::i8));
   14356     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   14357     Results.push_back(Chain);
   14358     return;
   14359   }
   14360 
   14361   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   14362   SDValue Ops[] = { LO, HI };
   14363   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   14364   Results.push_back(Pair);
   14365   Results.push_back(Chain);
   14366 }
   14367 
   14368 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
   14369                                      SelectionDAG &DAG) {
   14370   SmallVector<SDValue, 2> Results;
   14371   SDLoc DL(Op);
   14372   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
   14373                           Results);
   14374   return DAG.getMergeValues(Results, DL);
   14375 }
   14376 
   14377 enum IntrinsicType {
   14378   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST
   14379 };
   14380 
   14381 struct IntrinsicData {
   14382   IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
   14383     :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
   14384     : Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
   14385   unsigned      Opc0;
   14386   unsigned      Opc1;
   14387 };
   14388 
   14389 std::map < unsigned, IntrinsicData> IntrMap;
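         // Lazily-initialized table mapping intrinsic IDs to their lowering kind
         // and machine opcode(s). Note: a plain std::map guarded by a bool flag,
         // so the one-time initialization below is not thread-safe.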
   14390 static std::map<unsigned, IntrinsicData> IntrMap;
   14391 static void InitIntrinsicsMap() {
   14392   if (Initialized)
   14393     return;
   14394   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
   14395                                 IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
   14396   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
   14399                                 IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
   14400   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
   14401                                 IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
   14402   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
   14403                                 IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
   14404   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512,
   14405                                 IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
   14406   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512,
   14407                                 IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
   14408   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512,
   14409                                 IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
   14410   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512,
   14411                                 IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
   14412 
   14413   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
   14414                                 IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
   14415   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512,
   14416                                 IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
   14417   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512,
   14418                                 IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
   14419   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512,
   14420                                 IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
   14421   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512,
   14422                                 IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
   14423   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512,
   14424                                 IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
   14425   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512,
   14426                                 IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
   14427   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512,
   14428                                 IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
   14429 
   14430   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512,
   14431                                 IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
   14432                                                         X86::VGATHERPF1QPSm)));
   14433   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512,
   14434                                 IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
   14435                                                         X86::VGATHERPF1QPDm)));
   14436   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512,
   14437                                 IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
   14438                                                         X86::VGATHERPF1DPDm)));
   14439   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512,
   14440                                 IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
   14441                                                         X86::VGATHERPF1DPSm)));
   14442   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512,
   14443                                 IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
   14444                                                         X86::VSCATTERPF1QPSm)));
   14445   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512,
   14446                                 IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
   14447                                                         X86::VSCATTERPF1QPDm)));
   14448   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512,
   14449                                 IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
   14450                                                         X86::VSCATTERPF1DPDm)));
   14451   IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512,
   14452                                 IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
   14453                                                         X86::VSCATTERPF1DPSm)));
   14454   IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
   14455                                 IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
   14456   IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
   14457                                 IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
   14458   IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
   14459                                 IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
   14460   IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
   14461                                 IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
   14462   IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
   14463                                 IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
   14464   IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
   14465                                 IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
   14466   IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
   14467                                 IntrinsicData(XTEST,  X86ISD::XTEST,  0)));
   14468   IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
   14469                                 IntrinsicData(RDTSC,  X86ISD::RDTSC_DAG, 0)));
   14470   IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
   14471                                 IntrinsicData(RDTSC,  X86ISD::RDTSCP_DAG, 0)));
   14472   IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc,
   14473                                 IntrinsicData(RDPMC,  X86ISD::RDPMC_DAG, 0)));
   14474   Initialized = true;
   14475 }
   14476 
   14477 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   14478                                       SelectionDAG &DAG) {
   14479   InitIntrinsicsMap();
   14480   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   14481   std::map<unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
   14482   if (itr == IntrMap.end())
   14483     return SDValue();
   14484 
   14485   SDLoc dl(Op);
   14486   IntrinsicData Intr = itr->second;
   14487   switch(Intr.Type) {
   14488   case RDSEED:
   14489   case RDRAND: {
   14490     // Emit the node with the right value type.
   14491     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
   14492     SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
   14493 
   14494     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
   14495     // Otherwise return the value from the instruction, which is always 0, cast to i32.
   14496     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
   14497                       DAG.getConstant(1, Op->getValueType(1)),
   14498                       DAG.getConstant(X86::COND_B, MVT::i32),
   14499                       SDValue(Result.getNode(), 1) };
   14500     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
   14501                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
   14502                                   Ops);
   14503 
   14504     // Return { result, isValid, chain }.
   14505     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
   14506                        SDValue(Result.getNode(), 2));
   14507   }
   14508   case GATHER: {
   14509     // gather(v1, base, index, mask, scale);
   14510     SDValue Chain = Op.getOperand(0);
   14511     SDValue Src   = Op.getOperand(2);
   14512     SDValue Base  = Op.getOperand(3);
   14513     SDValue Index = Op.getOperand(4);
   14514     SDValue Mask  = Op.getOperand(5);
   14515     SDValue Scale = Op.getOperand(6);
   14516     return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
   14517                           Subtarget);
   14518   }
   14519   case SCATTER: {
   14520     // scatter(base, mask, index, v1, scale);
   14521     SDValue Chain = Op.getOperand(0);
   14522     SDValue Base  = Op.getOperand(2);
   14523     SDValue Mask  = Op.getOperand(3);
   14524     SDValue Index = Op.getOperand(4);
   14525     SDValue Src   = Op.getOperand(5);
   14526     SDValue Scale = Op.getOperand(6);
   14527     return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
   14528   }
   14529   case PREFETCH: {
   14530     SDValue Hint = Op.getOperand(6);
   14531     ConstantSDNode *HintC = dyn_cast<ConstantSDNode>(Hint);
   14532     if (!HintC || HintC->getZExtValue() > 1)
   14533       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
   14534     unsigned HintVal = HintC->getZExtValue();
   14535     unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
   14536     SDValue Chain = Op.getOperand(0);
   14537     SDValue Mask  = Op.getOperand(2);
   14538     SDValue Index = Op.getOperand(3);
   14539     SDValue Base  = Op.getOperand(4);
   14540     SDValue Scale = Op.getOperand(5);
   14541     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
   14542   }
   14543   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
   14544   case RDTSC: {
   14545     SmallVector<SDValue, 2> Results;
   14546     getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
   14547     return DAG.getMergeValues(Results, dl);
   14548   }
   14549   // Read Performance Monitoring Counters.
   14550   case RDPMC: {
   14551     SmallVector<SDValue, 2> Results;
   14552     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
   14553     return DAG.getMergeValues(Results, dl);
   14554   }
   14555   // XTEST intrinsics.
   14556   case XTEST: {
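             // XTEST sets ZF to 0 when executed inside an RTM/HLE transaction and
             // to 1 otherwise, so (ZF == 0), materialized via SETNE below, is the
             // intrinsic's "in transaction" result.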
   14557     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
   14558     SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
   14559     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   14560                                 DAG.getConstant(X86::COND_NE, MVT::i8),
   14561                                 InTrans);
   14562     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
   14563     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
   14564                        Ret, SDValue(InTrans.getNode(), 1));
   14565   }
   14566   }
   14567   llvm_unreachable("Unknown Intrinsic Type");
   14568 }
   14569 
   14570 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
   14571                                            SelectionDAG &DAG) const {
   14572   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   14573   MFI->setReturnAddressIsTaken(true);
   14574 
   14575   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
   14576     return SDValue();
   14577 
   14578   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   14579   SDLoc dl(Op);
   14580   EVT PtrVT = getPointerTy();
   14581 
   14582   if (Depth > 0) {
   14583     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
   14584     const X86RegisterInfo *RegInfo =
   14585       static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   14586     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
   14587     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   14588                        DAG.getNode(ISD::ADD, dl, PtrVT,
   14589                                    FrameAddr, Offset),
   14590                        MachinePointerInfo(), false, false, false, 0);
   14591   }
   14592 
   14593   // Just load the return address.
   14594   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
   14595   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   14596                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
   14597 }
   14598 
   14599 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   14600   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   14601   MFI->setFrameAddressIsTaken(true);
   14602 
   14603   EVT VT = Op.getValueType();
   14604   SDLoc dl(Op);  // FIXME probably not meaningful
   14605   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   14606   const X86RegisterInfo *RegInfo =
   14607     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   14608   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   14609   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
   14610           (FrameReg == X86::EBP && VT == MVT::i32)) &&
   14611          "Invalid Frame Register!");
   14612   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
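           // Walk up the frame chain: with frame pointers enabled, the caller's
           // saved EBP/RBP lives at offset 0 of the current frame.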
   14613   while (Depth--)
   14614     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
   14615                             MachinePointerInfo(),
   14616                             false, false, false, 0);
   14617   return FrameAddr;
   14618 }
   14619 
   14620 // FIXME? Maybe this could be a TableGen attribute on some registers and
   14621 // this table could be generated automatically from RegInfo.
   14622 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
   14623                                               EVT VT) const {
   14624   unsigned Reg = StringSwitch<unsigned>(RegName)
   14625                        .Case("esp", X86::ESP)
   14626                        .Case("rsp", X86::RSP)
   14627                        .Default(0);
   14628   if (Reg)
   14629     return Reg;
   14630   report_fatal_error("Invalid register name for a named register global variable");
   14631 }
   14632 
   14633 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
   14634                                                      SelectionDAG &DAG) const {
   14635   const X86RegisterInfo *RegInfo =
   14636     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   14637   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
   14638 }
   14639 
   14640 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   14641   SDValue Chain     = Op.getOperand(0);
   14642   SDValue Offset    = Op.getOperand(1);
   14643   SDValue Handler   = Op.getOperand(2);
   14644   SDLoc dl      (Op);
   14645 
   14646   EVT PtrVT = getPointerTy();
   14647   const X86RegisterInfo *RegInfo =
   14648     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   14649   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   14650   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
   14651           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
   14652          "Invalid Frame Register!");
   14653   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
   14654   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
   14655 
   14656   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
   14657                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
   14658   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
   14659   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
   14660                        false, false, 0);
   14661   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
   14662 
   14663   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
   14664                      DAG.getRegister(StoreAddrReg, PtrVT));
   14665 }
   14666 
   14667 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
   14668                                                SelectionDAG &DAG) const {
   14669   SDLoc DL(Op);
   14670   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
   14671                      DAG.getVTList(MVT::i32, MVT::Other),
   14672                      Op.getOperand(0), Op.getOperand(1));
   14673 }
   14674 
   14675 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
   14676                                                 SelectionDAG &DAG) const {
   14677   SDLoc DL(Op);
   14678   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
   14679                      Op.getOperand(0), Op.getOperand(1));
   14680 }
   14681 
   14682 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
   14683   return Op.getOperand(0);
   14684 }
   14685 
   14686 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   14687                                                 SelectionDAG &DAG) const {
   14688   SDValue Root = Op.getOperand(0);
   14689   SDValue Trmp = Op.getOperand(1); // trampoline
   14690   SDValue FPtr = Op.getOperand(2); // nested function
   14691   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
   14692   SDLoc dl (Op);
   14693 
   14694   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   14695   const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo();
   14696 
   14697   if (Subtarget->is64Bit()) {
   14698     SDValue OutChains[6];
   14699 
   14700     // Large code-model.
   14701     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
   14702     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
   14703 
   14704     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
   14705     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
   14706 
   14707     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
   14708 
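
             // The six stores below materialize this 23-byte sequence at Trmp:
             //   movabsq $<FPtr>, %r11      # bytes  0..9
             //   movabsq $<Nest>, %r10      # bytes 10..19
             //   jmpq    *%r11              # bytes 20..22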
   14709     // Load the pointer to the nested function into R11.
   14710     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
   14711     SDValue Addr = Trmp;
   14712     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   14713                                 Addr, MachinePointerInfo(TrmpAddr),
   14714                                 false, false, 0);
   14715 
   14716     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   14717                        DAG.getConstant(2, MVT::i64));
   14718     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
   14719                                 MachinePointerInfo(TrmpAddr, 2),
   14720                                 false, false, 2);
   14721 
   14722     // Load the 'nest' parameter value into R10.
   14723     // R10 is specified in X86CallingConv.td
   14724     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
   14725     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   14726                        DAG.getConstant(10, MVT::i64));
   14727     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   14728                                 Addr, MachinePointerInfo(TrmpAddr, 10),
   14729                                 false, false, 0);
   14730 
   14731     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   14732                        DAG.getConstant(12, MVT::i64));
   14733     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
   14734                                 MachinePointerInfo(TrmpAddr, 12),
   14735                                 false, false, 2);
   14736 
   14737     // Jump to the nested function.
   14738     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
   14739     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   14740                        DAG.getConstant(20, MVT::i64));
   14741     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   14742                                 Addr, MachinePointerInfo(TrmpAddr, 20),
   14743                                 false, false, 0);
   14744 
   14745     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
   14746     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   14747                        DAG.getConstant(22, MVT::i64));
   14748     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
   14749                                 MachinePointerInfo(TrmpAddr, 22),
   14750                                 false, false, 0);
   14751 
   14752     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   14753   } else {
   14754     const Function *Func =
   14755       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
   14756     CallingConv::ID CC = Func->getCallingConv();
   14757     unsigned NestReg;
   14758 
   14759     switch (CC) {
   14760     default:
   14761       llvm_unreachable("Unsupported calling convention");
   14762     case CallingConv::C:
   14763     case CallingConv::X86_StdCall: {
   14764       // Pass 'nest' parameter in ECX.
   14765       // Must be kept in sync with X86CallingConv.td
   14766       NestReg = X86::ECX;
   14767 
   14768       // Check that ECX wasn't needed by an 'inreg' parameter.
   14769       FunctionType *FTy = Func->getFunctionType();
   14770       const AttributeSet &Attrs = Func->getAttributes();
   14771 
   14772       if (!Attrs.isEmpty() && !Func->isVarArg()) {
   14773         unsigned InRegCount = 0;
   14774         unsigned Idx = 1;
   14775 
   14776         for (FunctionType::param_iterator I = FTy->param_begin(),
   14777              E = FTy->param_end(); I != E; ++I, ++Idx)
   14778           if (Attrs.hasAttribute(Idx, Attribute::InReg))
   14779             // FIXME: should only count parameters that are lowered to integers.
   14780             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
   14781 
   14782         if (InRegCount > 2) {
   14783           report_fatal_error("Nest register in use - reduce number of inreg"
   14784                              " parameters!");
   14785         }
   14786       }
   14787       break;
   14788     }
   14789     case CallingConv::X86_FastCall:
   14790     case CallingConv::X86_ThisCall:
   14791     case CallingConv::Fast:
   14792       // Pass 'nest' parameter in EAX.
   14793       // Must be kept in sync with X86CallingConv.td
   14794       NestReg = X86::EAX;
   14795       break;
   14796     }
   14797 
   14798     SDValue OutChains[4];
   14799     SDValue Addr, Disp;
   14800 
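             // The four stores below materialize this 10-byte sequence at Trmp:
             //   movl $<Nest>, %ecx|%eax    # bytes 0..4  (B8+r, imm32)
             //   jmp  <FPtr>                # bytes 5..9  (E9 rel32 from Trmp+10)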
   14801     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   14802                        DAG.getConstant(10, MVT::i32));
   14803     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
   14804 
   14805     // This is storing the opcode for MOV32ri.
   14806     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
   14807     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
   14808     OutChains[0] = DAG.getStore(Root, dl,
   14809                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
   14810                                 Trmp, MachinePointerInfo(TrmpAddr),
   14811                                 false, false, 0);
   14812 
   14813     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   14814                        DAG.getConstant(1, MVT::i32));
   14815     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
   14816                                 MachinePointerInfo(TrmpAddr, 1),
   14817                                 false, false, 1);
   14818 
   14819     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
   14820     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   14821                        DAG.getConstant(5, MVT::i32));
   14822     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
   14823                                 MachinePointerInfo(TrmpAddr, 5),
   14824                                 false, false, 1);
   14825 
   14826     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   14827                        DAG.getConstant(6, MVT::i32));
   14828     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
   14829                                 MachinePointerInfo(TrmpAddr, 6),
   14830                                 false, false, 1);
   14831 
   14832     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   14833   }
   14834 }
   14835 
   14836 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   14837                                             SelectionDAG &DAG) const {
   14838   /*
   14839    The rounding mode is in bits 11:10 of FPSR, and has the following
   14840    settings:
   14841      00 Round to nearest
   14842      01 Round to -inf
   14843      10 Round to +inf
   14844      11 Round to 0
   14845 
   14846   FLT_ROUNDS, on the other hand, expects the following:
   14847     -1 Undefined
   14848      0 Round to 0
   14849      1 Round to nearest
   14850      2 Round to +inf
   14851      3 Round to -inf
   14852 
   14853   To perform the conversion, we do:
   14854     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
   14855   */
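           // Worked example: FPSR rounding bits 11:10 == 01 (round to -inf) gives
           //   ((0 >> 11) | (0x400 >> 9)) + 1 = 3,  and  3 & 3 = 3,
           // which is FLT_ROUNDS' encoding of round to -inf.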
   14856 
   14857   MachineFunction &MF = DAG.getMachineFunction();
   14858   const TargetMachine &TM = MF.getTarget();
   14859   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   14860   unsigned StackAlignment = TFI.getStackAlignment();
   14861   MVT VT = Op.getSimpleValueType();
   14862   SDLoc DL(Op);
   14863 
   14864   // Save FP Control Word to stack slot
   14865   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
   14866   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   14867 
   14868   MachineMemOperand *MMO =
   14869    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   14870                            MachineMemOperand::MOStore, 2, 2);
   14871 
   14872   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
   14873   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
   14874                                           DAG.getVTList(MVT::Other),
   14875                                           Ops, MVT::i16, MMO);
   14876 
   14877   // Load FP Control Word from stack slot
   14878   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
   14879                             MachinePointerInfo(), false, false, false, 0);
   14880 
   14881   // Transform as necessary
   14882   SDValue CWD1 =
   14883     DAG.getNode(ISD::SRL, DL, MVT::i16,
   14884                 DAG.getNode(ISD::AND, DL, MVT::i16,
   14885                             CWD, DAG.getConstant(0x800, MVT::i16)),
   14886                 DAG.getConstant(11, MVT::i8));
   14887   SDValue CWD2 =
   14888     DAG.getNode(ISD::SRL, DL, MVT::i16,
   14889                 DAG.getNode(ISD::AND, DL, MVT::i16,
   14890                             CWD, DAG.getConstant(0x400, MVT::i16)),
   14891                 DAG.getConstant(9, MVT::i8));
   14892 
   14893   SDValue RetVal =
   14894     DAG.getNode(ISD::AND, DL, MVT::i16,
   14895                 DAG.getNode(ISD::ADD, DL, MVT::i16,
   14896                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
   14897                             DAG.getConstant(1, MVT::i16)),
   14898                 DAG.getConstant(3, MVT::i16));
   14899 
   14900   return DAG.getNode((VT.getSizeInBits() < 16 ?
   14901                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
   14902 }
   14903 
   14904 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
   14905   MVT VT = Op.getSimpleValueType();
   14906   EVT OpVT = VT;
   14907   unsigned NumBits = VT.getSizeInBits();
   14908   SDLoc dl(Op);
   14909 
   14910   Op = Op.getOperand(0);
   14911   if (VT == MVT::i8) {
   14912     // Zero extend to i32 since there is not an i8 bsr.
   14913     OpVT = MVT::i32;
   14914     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   14915   }
   14916 
   14917   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
   14918   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   14919   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   14920 
   14921   // If src is zero (i.e. bsr sets ZF), select 2*NumBits-1; the XOR below yields NumBits.
   14922   SDValue Ops[] = {
   14923     Op,
   14924     DAG.getConstant(NumBits+NumBits-1, OpVT),
   14925     DAG.getConstant(X86::COND_E, MVT::i8),
   14926     Op.getValue(1)
   14927   };
   14928   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
   14929 
   14930   // Finally xor with NumBits-1.
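           // (bsr returns the index of the highest set bit; since that index is at
           // most NumBits-1 here, (NumBits-1) - idx == (NumBits-1) ^ idx.)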
   14931   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
   14932 
   14933   if (VT == MVT::i8)
   14934     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   14935   return Op;
   14936 }
   14937 
   14938 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
   14939   MVT VT = Op.getSimpleValueType();
   14940   EVT OpVT = VT;
   14941   unsigned NumBits = VT.getSizeInBits();
   14942   SDLoc dl(Op);
   14943 
   14944   Op = Op.getOperand(0);
   14945   if (VT == MVT::i8) {
   14946     // Zero extend to i32 since there is not an i8 bsr.
   14947     OpVT = MVT::i32;
   14948     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   14949   }
   14950 
   14951   // Issue a bsr (scan bits in reverse).
   14952   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   14953   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   14954 
   14955   // And xor with NumBits-1.
   14956   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
   14957 
   14958   if (VT == MVT::i8)
   14959     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   14960   return Op;
   14961 }
   14962 
   14963 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
   14964   MVT VT = Op.getSimpleValueType();
   14965   unsigned NumBits = VT.getSizeInBits();
   14966   SDLoc dl(Op);
   14967   Op = Op.getOperand(0);
   14968 
   14969   // Issue a bsf (scan bits forward) which also sets EFLAGS.
   14970   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   14971   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
   14972 
   14973   // If src is zero (i.e. bsf sets ZF), returns NumBits.
   14974   SDValue Ops[] = {
   14975     Op,
   14976     DAG.getConstant(NumBits, VT),
   14977     DAG.getConstant(X86::COND_E, MVT::i8),
   14978     Op.getValue(1)
   14979   };
   14980   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
   14981 }
   14982 
   14983 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
   14984 // ones, and then concatenate the result back.
   14985 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
   14986   MVT VT = Op.getSimpleValueType();
   14987 
   14988   assert(VT.is256BitVector() && VT.isInteger() &&
   14989          "Unsupported value type for operation");
   14990 
   14991   unsigned NumElems = VT.getVectorNumElements();
   14992   SDLoc dl(Op);
   14993 
   14994   // Extract the LHS vectors
   14995   SDValue LHS = Op.getOperand(0);
   14996   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
   14997   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
   14998 
   14999   // Extract the RHS vectors
   15000   SDValue RHS = Op.getOperand(1);
   15001   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
   15002   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
   15003 
   15004   MVT EltVT = VT.getVectorElementType();
   15005   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   15006 
   15007   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   15008                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
   15009                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
   15010 }
   15011 
   15012 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
   15013   assert(Op.getSimpleValueType().is256BitVector() &&
   15014          Op.getSimpleValueType().isInteger() &&
   15015          "Only handle AVX 256-bit vector integer operation");
   15016   return Lower256IntArith(Op, DAG);
   15017 }
   15018 
   15019 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
   15020   assert(Op.getSimpleValueType().is256BitVector() &&
   15021          Op.getSimpleValueType().isInteger() &&
   15022          "Only handle AVX 256-bit vector integer operation");
   15023   return Lower256IntArith(Op, DAG);
   15024 }
   15025 
   15026 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   15027                         SelectionDAG &DAG) {
   15028   SDLoc dl(Op);
   15029   MVT VT = Op.getSimpleValueType();
   15030 
   15031   // Decompose 256-bit ops into smaller 128-bit ops.
   15032   if (VT.is256BitVector() && !Subtarget->hasInt256())
   15033     return Lower256IntArith(Op, DAG);
   15034 
   15035   SDValue A = Op.getOperand(0);
   15036   SDValue B = Op.getOperand(1);
   15037 
   15038   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
   15039   if (VT == MVT::v4i32) {
   15040     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
   15041            "Should not custom lower when pmuldq is available!");
   15042 
   15043     // Extract the odd parts.
   15044     static const int UnpackMask[] = { 1, -1, 3, -1 };
   15045     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
   15046     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
   15047 
   15048     // Multiply the even parts.
   15049     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
   15050     // Now multiply odd parts.
   15051     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
   15052 
   15053     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
   15054     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
   15055 
   15056     // Merge the two vectors back together with a shuffle. This expands into 2
   15057     // shuffles.
   15058     static const int ShufMask[] = { 0, 4, 2, 6 };
   15059     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   15060   }
   15061 
   15062   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
   15063          "Only know how to lower V2I64/V4I64/V8I64 multiply");
   15064 
   15065   //  Ahi = psrlqi(a, 32);
   15066   //  Bhi = psrlqi(b, 32);
   15067   //
   15068   //  AloBlo = pmuludq(a, b);
   15069   //  AloBhi = pmuludq(a, Bhi);
   15070   //  AhiBlo = pmuludq(Ahi, b);
   15071 
   15072   //  AloBhi = psllqi(AloBhi, 32);
   15073   //  AhiBlo = psllqi(AhiBlo, 32);
   15074   //  return AloBlo + AloBhi + AhiBlo;
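           //
           //  This computes a * b mod 2^64 from 32-bit halves:
           //    a*b = AloBlo + ((AloBhi + AhiBlo) << 32)  (mod 2^64);
           //  the Ahi*Bhi term is shifted out of the 64-bit result entirely.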
   15075 
   15076   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
   15077   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
   15078 
   15079   // Bit cast to 32-bit vectors for MULUDQ
   15080   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
   15081                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
   15082   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
   15083   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
   15084   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
   15085   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
   15086 
   15087   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
   15088   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
   15089   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
   15090 
   15091   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
   15092   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
   15093 
   15094   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
   15095   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
   15096 }
   15097 
   15098 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
   15099   assert(Subtarget->isTargetWin64() && "Unexpected target");
   15100   EVT VT = Op.getValueType();
   15101   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
   15102          "Unexpected return type for lowering");
   15103 
   15104   RTLIB::Libcall LC;
   15105   bool isSigned;
   15106   switch (Op->getOpcode()) {
   15107   default: llvm_unreachable("Unexpected request for libcall!");
   15108   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
   15109   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
   15110   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
   15111   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
   15112   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
   15113   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
   15114   }
   15115 
   15116   SDLoc dl(Op);
   15117   SDValue InChain = DAG.getEntryNode();
   15118 
   15119   TargetLowering::ArgListTy Args;
   15120   TargetLowering::ArgListEntry Entry;
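           // Win64 has no native 128-bit integer arguments: spill each i128 operand
           // to a 16-byte-aligned stack slot and pass a pointer to it instead.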
   15121   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
   15122     EVT ArgVT = Op->getOperand(i).getValueType();
   15123     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
   15124            "Unexpected argument type for lowering");
   15125     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
   15126     Entry.Node = StackPtr;
   15127     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
   15128                            false, false, 16);
   15129     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   15130     Entry.Ty = PointerType::get(ArgTy, 0);
   15131     Entry.isSExt = false;
   15132     Entry.isZExt = false;
   15133     Args.push_back(Entry);
   15134   }
   15135 
   15136   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
   15137                                          getPointerTy());
   15138 
   15139   TargetLowering::CallLoweringInfo CLI(DAG);
   15140   CLI.setDebugLoc(dl).setChain(InChain)
   15141     .setCallee(getLibcallCallingConv(LC),
   15142                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
   15143                Callee, std::move(Args), 0)
   15144     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
   15145 
   15146   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
   15147   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
   15148 }
   15149 
   15150 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
   15151                              SelectionDAG &DAG) {
   15152   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
   15153   EVT VT = Op0.getValueType();
   15154   SDLoc dl(Op);
   15155 
   15156   assert(((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
   15157           (VT == MVT::v8i32 && Subtarget->hasInt256())) && "Unexpected VT!");
   15158 
   15159   // Get the high parts.
   15160   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
   15161   SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
   15162   SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
   15163 
   15164   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
   15165   // ints.
   15166   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
   15167   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   15168   unsigned Opcode =
   15169       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
   15170   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
   15171                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
   15172   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
   15173                              DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
   15174 
   15175   // Shuffle it back into the right order.
   15176   SDValue Highs, Lows;
   15177   if (VT == MVT::v8i32) {
   15178     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
   15179     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   15180     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
   15181     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   15182   } else {
   15183     const int HighMask[] = {1, 5, 3, 7};
   15184     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   15185     const int LowMask[] = {0, 4, 2, 6};
   15186     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   15187   }
   15188 
   15189   // If we have a signed multiply but no PMULDQ, fix up the high parts of an
   15190   // unsigned multiply.
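           // For signed inputs, the unsigned high half Hu satisfies
           //   Hs = Hu - (a < 0 ? b : 0) - (b < 0 ? a : 0);
           // the sign-extending shifts below build exactly those correction terms.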
   15191   if (IsSigned && !Subtarget->hasSSE41()) {
   15192     SDValue ShAmt =
   15193         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
   15194     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
   15195                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
   15196     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
   15197                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
   15198 
   15199     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
   15200     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   15201   }
   15202 
   15203   return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
   15204 }
   15205 
   15206 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   15207                                          const X86Subtarget *Subtarget) {
   15208   MVT VT = Op.getSimpleValueType();
   15209   SDLoc dl(Op);
   15210   SDValue R = Op.getOperand(0);
   15211   SDValue Amt = Op.getOperand(1);
   15212 
   15213   // Optimize shl/srl/sra with constant shift amount.
   15214   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   15215     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
   15216       uint64_t ShiftAmt = ShiftConst->getZExtValue();
   15217 
   15218       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
   15219           (Subtarget->hasInt256() &&
   15220            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
   15221           (Subtarget->hasAVX512() &&
   15222            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
   15223         if (Op.getOpcode() == ISD::SHL)
   15224           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
   15225                                             DAG);
   15226         if (Op.getOpcode() == ISD::SRL)
   15227           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
   15228                                             DAG);
   15229         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
   15230           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
   15231                                             DAG);
   15232       }
   15233 
   15234       if (VT == MVT::v16i8) {
   15235         if (Op.getOpcode() == ISD::SHL) {
   15236           // Make a large shift.
   15237           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
   15238                                                    MVT::v8i16, R, ShiftAmt,
   15239                                                    DAG);
   15240           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
   15241           // Zero out the rightmost bits.
   15242           SmallVector<SDValue, 16> V(16,
   15243                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
   15244                                                      MVT::i8));
   15245           return DAG.getNode(ISD::AND, dl, VT, SHL,
   15246                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
   15247         }
   15248         if (Op.getOpcode() == ISD::SRL) {
   15249           // Make a large shift.
   15250           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
   15251                                                    MVT::v8i16, R, ShiftAmt,
   15252                                                    DAG);
   15253           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
   15254           // Zero out the leftmost bits.
   15255           SmallVector<SDValue, 16> V(16,
   15256                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
   15257                                                      MVT::i8));
   15258           return DAG.getNode(ISD::AND, dl, VT, SRL,
   15259                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
   15260         }
   15261         if (Op.getOpcode() == ISD::SRA) {
   15262           if (ShiftAmt == 7) {
   15263             // R s>> 7  ===  R s< 0
   15264             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   15265             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
   15266           }
   15267 
   15268           // R s>> a === ((R u>> a) ^ m) - m
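                   // with m = 0x80 u>> a (the shifted sign bit). E.g. for a = 2 and
                   // R = 0xF0: R u>> 2 = 0x3C, m = 0x20, and
                   // (0x3C ^ 0x20) - 0x20 = 0xFC == 0xF0 s>> 2.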
   15269           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   15270           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
   15271                                                          MVT::i8));
   15272           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
   15273           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
   15274           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
   15275           return Res;
   15276         }
   15277         llvm_unreachable("Unknown shift opcode.");
   15278       }
   15279 
   15280       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
   15281         if (Op.getOpcode() == ISD::SHL) {
   15282           // Make a large shift.
   15283           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
   15284                                                    MVT::v16i16, R, ShiftAmt,
   15285                                                    DAG);
   15286           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
   15287           // Zero out the rightmost bits.
   15288           SmallVector<SDValue, 32> V(32,
   15289                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
   15290                                                      MVT::i8));
   15291           return DAG.getNode(ISD::AND, dl, VT, SHL,
   15292                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
   15293         }
   15294         if (Op.getOpcode() == ISD::SRL) {
   15295           // Make a large shift.
   15296           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
   15297                                                    MVT::v16i16, R, ShiftAmt,
   15298                                                    DAG);
   15299           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
   15300           // Zero out the leftmost bits.
   15301           SmallVector<SDValue, 32> V(32,
   15302                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
   15303                                                      MVT::i8));
   15304           return DAG.getNode(ISD::AND, dl, VT, SRL,
   15305                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
   15306         }
   15307         if (Op.getOpcode() == ISD::SRA) {
   15308           if (ShiftAmt == 7) {
   15309             // R s>> 7  ===  R s< 0
   15310             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   15311             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
   15312           }
   15313 
   15314           // R s>> a === ((R u>> a) ^ m) - m
   15315           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   15316           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
   15317                                                          MVT::i8));
   15318           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
   15319           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
   15320           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
   15321           return Res;
   15322         }
   15323         llvm_unreachable("Unknown shift opcode.");
   15324       }
   15325     }
   15326   }
   15327 
   15328   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   15329   if (!Subtarget->is64Bit() &&
   15330       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
   15331       Amt.getOpcode() == ISD::BITCAST &&
   15332       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   15333     Amt = Amt.getOperand(0);
   15334     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   15335                      VT.getVectorNumElements();
   15336     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
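             // E.g. a v2i64 shift amount built as a v4i32 BUILD_VECTOR has Ratio = 2
             // and RatioInLog2 = 1, so each element's two 32-bit halves land at bit
             // offsets i * 32 of the reassembled 64-bit amount.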
   15337     uint64_t ShiftAmt = 0;
   15338     for (unsigned i = 0; i != Ratio; ++i) {
   15339       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
   15340       if (!C)
   15341         return SDValue();
   15342       // 6 == Log2(64)
   15343       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
   15344     }
   15345     // Check remaining shift amounts.
   15346     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   15347       uint64_t ShAmt = 0;
   15348       for (unsigned j = 0; j != Ratio; ++j) {
   15349         ConstantSDNode *C =
   15350           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
   15351         if (!C)
   15352           return SDValue();
   15353         // 6 == Log2(64)
   15354         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
   15355       }
   15356       if (ShAmt != ShiftAmt)
   15357         return SDValue();
   15358     }
   15359     switch (Op.getOpcode()) {
   15360     default:
   15361       llvm_unreachable("Unknown shift opcode!");
   15362     case ISD::SHL:
   15363       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
   15364                                         DAG);
   15365     case ISD::SRL:
   15366       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
   15367                                         DAG);
   15368     case ISD::SRA:
   15369       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
   15370                                         DAG);
   15371     }
   15372   }
   15373 
   15374   return SDValue();
   15375 }
   15376 
   15377 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   15378                                         const X86Subtarget* Subtarget) {
   15379   MVT VT = Op.getSimpleValueType();
   15380   SDLoc dl(Op);
   15381   SDValue R = Op.getOperand(0);
   15382   SDValue Amt = Op.getOperand(1);
   15383 
   15384   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
   15385       VT == MVT::v4i32 || VT == MVT::v8i16 ||
   15386       (Subtarget->hasInt256() &&
   15387        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
   15388         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
  15389       (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
   15390     SDValue BaseShAmt;
   15391     EVT EltVT = VT.getVectorElementType();
   15392 
   15393     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
   15394       unsigned NumElts = VT.getVectorNumElements();
   15395       unsigned i, j;
   15396       for (i = 0; i != NumElts; ++i) {
   15397         if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
   15398           continue;
   15399         break;
   15400       }
   15401       for (j = i; j != NumElts; ++j) {
   15402         SDValue Arg = Amt.getOperand(j);
   15403         if (Arg.getOpcode() == ISD::UNDEF) continue;
   15404         if (Arg != Amt.getOperand(i))
   15405           break;
   15406       }
   15407       if (i != NumElts && j == NumElts)
   15408         BaseShAmt = Amt.getOperand(i);
   15409     } else {
   15410       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
   15411         Amt = Amt.getOperand(0);
   15412       if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
  15413           cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
   15414         SDValue InVec = Amt.getOperand(0);
   15415         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   15416           unsigned NumElts = InVec.getValueType().getVectorNumElements();
   15417           unsigned i = 0;
   15418           for (; i != NumElts; ++i) {
   15419             SDValue Arg = InVec.getOperand(i);
   15420             if (Arg.getOpcode() == ISD::UNDEF) continue;
   15421             BaseShAmt = Arg;
   15422             break;
   15423           }
   15424         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
  15425           if (ConstantSDNode *C =
  15426                 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
  15427             unsigned SplatIdx =
  15428               cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
  15429             if (C->getZExtValue() == SplatIdx)
  15430               BaseShAmt = InVec.getOperand(1);
  15431           }
   15432         }
   15433         if (!BaseShAmt.getNode())
   15434           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
   15435                                   DAG.getIntPtrConstant(0));
   15436       }
   15437     }
   15438 
   15439     if (BaseShAmt.getNode()) {
   15440       if (EltVT.bitsGT(MVT::i32))
   15441         BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
   15442       else if (EltVT.bitsLT(MVT::i32))
   15443         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
   15444 
   15445       switch (Op.getOpcode()) {
   15446       default:
   15447         llvm_unreachable("Unknown shift opcode!");
   15448       case ISD::SHL:
   15449         switch (VT.SimpleTy) {
   15450         default: return SDValue();
   15451         case MVT::v2i64:
   15452         case MVT::v4i32:
   15453         case MVT::v8i16:
   15454         case MVT::v4i64:
   15455         case MVT::v8i32:
   15456         case MVT::v16i16:
   15457         case MVT::v16i32:
   15458         case MVT::v8i64:
   15459           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
   15460         }
   15461       case ISD::SRA:
   15462         switch (VT.SimpleTy) {
   15463         default: return SDValue();
   15464         case MVT::v4i32:
   15465         case MVT::v8i16:
   15466         case MVT::v8i32:
   15467         case MVT::v16i16:
   15468         case MVT::v16i32:
   15469         case MVT::v8i64:
   15470           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
   15471         }
   15472       case ISD::SRL:
   15473         switch (VT.SimpleTy) {
   15474         default: return SDValue();
   15475         case MVT::v2i64:
   15476         case MVT::v4i32:
   15477         case MVT::v8i16:
   15478         case MVT::v4i64:
   15479         case MVT::v8i32:
   15480         case MVT::v16i16:
   15481         case MVT::v16i32:
   15482         case MVT::v8i64:
   15483           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
   15484         }
   15485       }
   15486     }
   15487   }
   15488 
   15489   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   15490   if (!Subtarget->is64Bit() &&
   15491       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
   15492       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
   15493       Amt.getOpcode() == ISD::BITCAST &&
   15494       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   15495     Amt = Amt.getOperand(0);
   15496     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   15497                      VT.getVectorNumElements();
   15498     std::vector<SDValue> Vals(Ratio);
   15499     for (unsigned i = 0; i != Ratio; ++i)
   15500       Vals[i] = Amt.getOperand(i);
   15501     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   15502       for (unsigned j = 0; j != Ratio; ++j)
   15503         if (Vals[j] != Amt.getOperand(i + j))
   15504           return SDValue();
   15505     }
   15506     switch (Op.getOpcode()) {
   15507     default:
   15508       llvm_unreachable("Unknown shift opcode!");
   15509     case ISD::SHL:
   15510       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
   15511     case ISD::SRL:
   15512       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
   15513     case ISD::SRA:
   15514       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
   15515     }
   15516   }
   15517 
   15518   return SDValue();
   15519 }
   15520 
   15521 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
   15522                           SelectionDAG &DAG) {
   15523   MVT VT = Op.getSimpleValueType();
   15524   SDLoc dl(Op);
   15525   SDValue R = Op.getOperand(0);
   15526   SDValue Amt = Op.getOperand(1);
   15527   SDValue V;
   15528 
   15529   assert(VT.isVector() && "Custom lowering only for vector shifts!");
   15530   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
   15531 
   15532   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
   15533   if (V.getNode())
   15534     return V;
   15535 
   15536   V = LowerScalarVariableShift(Op, DAG, Subtarget);
   15537   if (V.getNode())
  15538     return V;
   15539 
   15540   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
   15541     return Op;
   15542   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
   15543   if (Subtarget->hasInt256()) {
   15544     if (Op.getOpcode() == ISD::SRL &&
   15545         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
   15546          VT == MVT::v4i64 || VT == MVT::v8i32))
   15547       return Op;
   15548     if (Op.getOpcode() == ISD::SHL &&
   15549         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
   15550          VT == MVT::v4i64 || VT == MVT::v8i32))
   15551       return Op;
   15552     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
   15553       return Op;
   15554   }
   15555 
   15556   // If possible, lower this packed shift into a vector multiply instead of
   15557   // expanding it into a sequence of scalar shifts.
   15558   // Do this only if the vector shift count is a constant build_vector.
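          // For example, (v4i32 (shl A, <1, 2, 3, 4>)) becomes
          // (v4i32 (mul A, <2, 4, 8, 16>)).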
   15559   if (Op.getOpcode() == ISD::SHL &&
   15560       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
   15561        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
   15562       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
   15563     SmallVector<SDValue, 8> Elts;
   15564     EVT SVT = VT.getScalarType();
   15565     unsigned SVTBits = SVT.getSizeInBits();
  15566     APInt One(SVTBits, 1);
   15567     unsigned NumElems = VT.getVectorNumElements();
   15568 
  15569     for (unsigned i = 0; i != NumElems; ++i) {
   15570       SDValue Op = Amt->getOperand(i);
   15571       if (Op->getOpcode() == ISD::UNDEF) {
   15572         Elts.push_back(Op);
   15573         continue;
   15574       }
   15575 
   15576       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
  15577       APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
   15578       uint64_t ShAmt = C.getZExtValue();
   15579       if (ShAmt >= SVTBits) {
   15580         Elts.push_back(DAG.getUNDEF(SVT));
   15581         continue;
   15582       }
   15583       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
   15584     }
   15585     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
   15586     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
   15587   }
   15588 
   15589   // Lower SHL with variable shift amount.
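          // The trick: build the float 2^Amt in each lane. Shifting Amt left by
          // 23 places it in the IEEE single-precision exponent field, and adding
          // 0x3f800000 (the bit pattern of 1.0f) rebiases it, so each lane holds
          // the bits of 2.0^Amt. FP_TO_SINT then yields 1 << Amt, and the final
          // multiply completes the shift.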
   15590   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
   15591     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
   15592 
   15593     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
   15594     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
   15595     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
   15596     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   15597   }
   15598 
   15599   // If possible, lower this shift as a sequence of two shifts by
   15600   // constant plus a MOVSS/MOVSD instead of scalarizing it.
   15601   // Example:
   15602   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
   15603   //
   15604   // Could be rewritten as:
   15605   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
   15606   //
   15607   // The advantage is that the two shifts from the example would be
   15608   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
   15609   // the vector shift into four scalar shifts plus four pairs of vector
   15610   // insert/extract.
   15611   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
   15612       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
   15613     unsigned TargetOpcode = X86ISD::MOVSS;
   15614     bool CanBeSimplified;
   15615     // The splat value for the first packed shift (the 'X' from the example).
   15616     SDValue Amt1 = Amt->getOperand(0);
   15617     // The splat value for the second packed shift (the 'Y' from the example).
   15618     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
   15619                                         Amt->getOperand(2);
   15620 
   15621     // See if it is possible to replace this node with a sequence of
   15622     // two shifts followed by a MOVSS/MOVSD
   15623     if (VT == MVT::v4i32) {
   15624       // Check if it is legal to use a MOVSS.
   15625       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
   15626                         Amt2 == Amt->getOperand(3);
   15627       if (!CanBeSimplified) {
   15628         // Otherwise, check if we can still simplify this node using a MOVSD.
   15629         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
   15630                           Amt->getOperand(2) == Amt->getOperand(3);
   15631         TargetOpcode = X86ISD::MOVSD;
   15632         Amt2 = Amt->getOperand(2);
   15633       }
   15634     } else {
   15635       // Do similar checks for the case where the machine value type
   15636       // is MVT::v8i16.
   15637       CanBeSimplified = Amt1 == Amt->getOperand(1);
  15638       for (unsigned i = 3; i != 8 && CanBeSimplified; ++i)
   15639         CanBeSimplified = Amt2 == Amt->getOperand(i);
   15640 
   15641       if (!CanBeSimplified) {
   15642         TargetOpcode = X86ISD::MOVSD;
   15643         CanBeSimplified = true;
   15644         Amt2 = Amt->getOperand(4);
  15645         for (unsigned i = 0; i != 4 && CanBeSimplified; ++i)
  15646           CanBeSimplified = Amt1 == Amt->getOperand(i);
  15647         for (unsigned j = 4; j != 8 && CanBeSimplified; ++j)
  15648           CanBeSimplified = Amt2 == Amt->getOperand(j);
   15649       }
   15650     }
   15651 
   15652     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
   15653         isa<ConstantSDNode>(Amt2)) {
   15654       // Replace this node with two shifts followed by a MOVSS/MOVSD.
   15655       EVT CastVT = MVT::v4i32;
   15656       SDValue Splat1 =
   15657         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
   15658       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
   15659       SDValue Splat2 =
   15660         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
   15661       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
   15662       if (TargetOpcode == X86ISD::MOVSD)
   15663         CastVT = MVT::v2i64;
   15664       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
   15665       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
   15666       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
   15667                                             BitCast1, DAG);
   15668       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
   15669     }
   15670   }
   15671 
   15672   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
   15673     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
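            // Each byte's shift amount has only 3 meaningful bits. Test them from
            // the top down: "a << 5" parks bit 2 of the amount in the byte's sign
            // bit, and each "a += a" moves the next lower bit there. At every
            // step, lanes whose current bit is set take R shifted by 4, 2, or 1.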
   15674 
   15675     // a = a << 5;
   15676     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
   15677     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
   15678 
   15679     // Turn 'a' into a mask suitable for VSELECT
   15680     SDValue VSelM = DAG.getConstant(0x80, VT);
   15681     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   15682     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   15683 
   15684     SDValue CM1 = DAG.getConstant(0x0f, VT);
   15685     SDValue CM2 = DAG.getConstant(0x3f, VT);
   15686 
   15687     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
   15688     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
   15689     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
   15690     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
   15691     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
   15692 
   15693     // a += a
   15694     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
   15695     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   15696     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   15697 
   15698     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
   15699     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
   15700     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
   15701     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
   15702     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
   15703 
   15704     // a += a
   15705     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
   15706     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   15707     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   15708 
   15709     // return VSELECT(r, r+r, a);
   15710     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
   15711                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
   15712     return R;
   15713   }
   15714 
  15715   // For 16-bit element types it's worth extending once and using the v8i32
  15716   // shifts, but for v16i8 the extra overhead of widening all the way to
  15717   // v8i32 makes the existing SSE solution better.
   15718   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
   15719     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
   15720     unsigned ExtOpc =
   15721         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
   15722     R = DAG.getNode(ExtOpc, dl, NewVT, R);
   15723     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
   15724     return DAG.getNode(ISD::TRUNCATE, dl, VT,
   15725                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
  15726   }
   15727 
   15728   // Decompose 256-bit shifts into smaller 128-bit shifts.
   15729   if (VT.is256BitVector()) {
   15730     unsigned NumElems = VT.getVectorNumElements();
   15731     MVT EltVT = VT.getVectorElementType();
   15732     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   15733 
   15734     // Extract the two vectors
   15735     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
   15736     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
   15737 
   15738     // Recreate the shift amount vectors
   15739     SDValue Amt1, Amt2;
   15740     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
   15741       // Constant shift amount
   15742       SmallVector<SDValue, 4> Amt1Csts;
   15743       SmallVector<SDValue, 4> Amt2Csts;
   15744       for (unsigned i = 0; i != NumElems/2; ++i)
   15745         Amt1Csts.push_back(Amt->getOperand(i));
   15746       for (unsigned i = NumElems/2; i != NumElems; ++i)
   15747         Amt2Csts.push_back(Amt->getOperand(i));
   15748 
   15749       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
   15750       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
   15751     } else {
   15752       // Variable shift amount
   15753       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
   15754       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
   15755     }
   15756 
   15757     // Issue new vector shifts for the smaller types
   15758     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
   15759     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
   15760 
  15761     // Concatenate the results back together.
   15762     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
   15763   }
   15764 
   15765   return SDValue();
   15766 }
   15767 
   15768 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  15769   // Lower the "add/sub/mul with overflow" instruction into a regular
  15770   // instruction plus a "setcc" that checks the overflow flag. The "brcond"
  15771   // lowering looks for this combo and may remove the "setcc" if the "setcc"
   15772   // has only one use.
   15773   SDNode *N = Op.getNode();
   15774   SDValue LHS = N->getOperand(0);
   15775   SDValue RHS = N->getOperand(1);
   15776   unsigned BaseOp = 0;
   15777   unsigned Cond = 0;
   15778   SDLoc DL(Op);
   15779   switch (Op.getOpcode()) {
   15780   default: llvm_unreachable("Unknown ovf instruction!");
   15781   case ISD::SADDO:
  15782     // An add of one will be selected as an INC. Note that INC doesn't
  15783     // set CF, so we can't do this for UADDO.
   15784     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   15785       if (C->isOne()) {
   15786         BaseOp = X86ISD::INC;
   15787         Cond = X86::COND_O;
   15788         break;
   15789       }
   15790     BaseOp = X86ISD::ADD;
   15791     Cond = X86::COND_O;
   15792     break;
   15793   case ISD::UADDO:
   15794     BaseOp = X86ISD::ADD;
   15795     Cond = X86::COND_B;
   15796     break;
   15797   case ISD::SSUBO:
   15798     // A subtract of one will be selected as a DEC. Note that DEC doesn't
   15799     // set CF, so we can't do this for USUBO.
   15800     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   15801       if (C->isOne()) {
   15802         BaseOp = X86ISD::DEC;
   15803         Cond = X86::COND_O;
   15804         break;
   15805       }
   15806     BaseOp = X86ISD::SUB;
   15807     Cond = X86::COND_O;
   15808     break;
   15809   case ISD::USUBO:
   15810     BaseOp = X86ISD::SUB;
   15811     Cond = X86::COND_B;
   15812     break;
   15813   case ISD::SMULO:
   15814     BaseOp = X86ISD::SMUL;
   15815     Cond = X86::COND_O;
   15816     break;
   15817   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
   15818     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
   15819                                  MVT::i32);
   15820     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
   15821 
   15822     SDValue SetCC =
   15823       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   15824                   DAG.getConstant(X86::COND_O, MVT::i32),
   15825                   SDValue(Sum.getNode(), 2));
   15826 
   15827     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   15828   }
   15829   }
   15830 
   15831   // Also sets EFLAGS.
   15832   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
   15833   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
   15834 
   15835   SDValue SetCC =
   15836     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
   15837                 DAG.getConstant(Cond, MVT::i32),
   15838                 SDValue(Sum.getNode(), 1));
   15839 
   15840   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   15841 }
   15842 
   15843 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
   15844                                                   SelectionDAG &DAG) const {
   15845   SDLoc dl(Op);
   15846   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
   15847   MVT VT = Op.getSimpleValueType();
   15848 
   15849   if (!Subtarget->hasSSE2() || !VT.isVector())
   15850     return SDValue();
   15851 
   15852   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
   15853                       ExtraVT.getScalarType().getSizeInBits();
   15854 
   15855   switch (VT.SimpleTy) {
   15856     default: return SDValue();
   15857     case MVT::v8i32:
   15858     case MVT::v16i16:
   15859       if (!Subtarget->hasFp256())
   15860         return SDValue();
   15861       if (!Subtarget->hasInt256()) {
  15862         // Needs to be split into two 128-bit halves.
   15863         unsigned NumElems = VT.getVectorNumElements();
   15864 
   15865         // Extract the LHS vectors
   15866         SDValue LHS = Op.getOperand(0);
   15867         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
   15868         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
   15869 
   15870         MVT EltVT = VT.getVectorElementType();
   15871         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   15872 
   15873         EVT ExtraEltVT = ExtraVT.getVectorElementType();
   15874         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
   15875         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
   15876                                    ExtraNumElems/2);
   15877         SDValue Extra = DAG.getValueType(ExtraVT);
   15878 
   15879         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
   15880         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
   15881 
   15882         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
   15883       }
   15884       // fall through
   15885     case MVT::v4i32:
   15886     case MVT::v8i16: {
   15887       SDValue Op0 = Op.getOperand(0);
   15888       SDValue Op00 = Op0.getOperand(0);
   15889       SDValue Tmp1;
   15890       // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
   15891       if (Op0.getOpcode() == ISD::BITCAST &&
   15892           Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
   15893         // (sext (vzext x)) -> (vsext x)
   15894         Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
   15895         if (Tmp1.getNode()) {
   15896           EVT ExtraEltVT = ExtraVT.getVectorElementType();
   15897           // This folding is only valid when the in-reg type is a vector of i8,
   15898           // i16, or i32.
   15899           if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
   15900               ExtraEltVT == MVT::i32) {
   15901             SDValue Tmp1Op0 = Tmp1.getOperand(0);
   15902             assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
   15903                    "This optimization is invalid without a VZEXT.");
   15904             return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
   15905           }
   15906           Op0 = Tmp1;
   15907         }
   15908       }
   15909 
   15910       // If the above didn't work, then just use Shift-Left + Shift-Right.
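              // Shifting left by BitsDiff moves the ExtraVT sign bit into the
              // element's sign bit; the arithmetic shift right then replicates
              // it back down, which is exactly sign_extend_inreg.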
   15911       Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
   15912                                         DAG);
   15913       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
   15914                                         DAG);
   15915     }
   15916   }
   15917 }
   15918 
   15919 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
   15920                                  SelectionDAG &DAG) {
   15921   SDLoc dl(Op);
   15922   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
   15923     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   15924   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
   15925     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   15926 
   15927   // The only fence that needs an instruction is a sequentially-consistent
   15928   // cross-thread fence.
   15929   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
   15930     // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
   15931     // no-sse2). There isn't any reason to disable it if the target processor
   15932     // supports it.
   15933     if (Subtarget->hasSSE2() || Subtarget->is64Bit())
   15934       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
   15935 
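            // Otherwise emit a locked OR of zero into the top of the stack; any
            // LOCKed read-modify-write acts as a full memory barrier on x86.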
   15936     SDValue Chain = Op.getOperand(0);
   15937     SDValue Zero = DAG.getConstant(0, MVT::i32);
   15938     SDValue Ops[] = {
   15939       DAG.getRegister(X86::ESP, MVT::i32), // Base
   15940       DAG.getTargetConstant(1, MVT::i8),   // Scale
   15941       DAG.getRegister(0, MVT::i32),        // Index
   15942       DAG.getTargetConstant(0, MVT::i32),  // Disp
   15943       DAG.getRegister(0, MVT::i32),        // Segment.
   15944       Zero,
   15945       Chain
   15946     };
   15947     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
   15948     return SDValue(Res, 0);
   15949   }
   15950 
   15951   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
   15952   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
   15953 }
   15954 
   15955 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
   15956                              SelectionDAG &DAG) {
   15957   MVT T = Op.getSimpleValueType();
   15958   SDLoc DL(Op);
   15959   unsigned Reg = 0;
   15960   unsigned size = 0;
   15961   switch(T.SimpleTy) {
   15962   default: llvm_unreachable("Invalid value type!");
   15963   case MVT::i8:  Reg = X86::AL;  size = 1; break;
   15964   case MVT::i16: Reg = X86::AX;  size = 2; break;
   15965   case MVT::i32: Reg = X86::EAX; size = 4; break;
   15966   case MVT::i64:
   15967     assert(Subtarget->is64Bit() && "Node not type legal!");
   15968     Reg = X86::RAX; size = 8;
   15969     break;
   15970   }
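          // CMPXCHG implicitly uses the accumulator: the expected value goes in
          // AL/AX/EAX/RAX and the old memory value comes back there, so copy the
          // comparand in first and the result out afterwards.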
   15971   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
   15972                                   Op.getOperand(2), SDValue());
   15973   SDValue Ops[] = { cpIn.getValue(0),
   15974                     Op.getOperand(1),
   15975                     Op.getOperand(3),
   15976                     DAG.getTargetConstant(size, MVT::i8),
   15977                     cpIn.getValue(1) };
   15978   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   15979   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   15980   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
   15981                                            Ops, T, MMO);
   15982 
   15983   SDValue cpOut =
   15984     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   15985   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
   15986                                       MVT::i32, cpOut.getValue(2));
   15987   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
   15988                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
   15989 
   15990   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
   15991   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
   15992   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
   15993   return SDValue();
   15994 }
   15995 
   15996 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
   15997                             SelectionDAG &DAG) {
   15998   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
   15999   MVT DstVT = Op.getSimpleValueType();
   16000 
   16001   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
   16002     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   16003     if (DstVT != MVT::f64)
   16004       // This conversion needs to be expanded.
   16005       return SDValue();
   16006 
   16007     SDValue InVec = Op->getOperand(0);
   16008     SDLoc dl(Op);
   16009     unsigned NumElts = SrcVT.getVectorNumElements();
   16010     EVT SVT = SrcVT.getVectorElementType();
   16011 
  16012     // Widen the input vector to twice its length, e.g. from MVT::v2i32
  16013     // to MVT::v4i32; the extra lanes are filled with undef below.
   16014     SmallVector<SDValue, 16> Elts;
   16015     for (unsigned i = 0, e = NumElts; i != e; ++i)
   16016       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
   16017                                  DAG.getIntPtrConstant(i)));
   16018 
   16019     // Explicitly mark the extra elements as Undef.
   16020     SDValue Undef = DAG.getUNDEF(SVT);
   16021     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
   16022       Elts.push_back(Undef);
   16023 
   16024     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   16025     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
   16026     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
   16027     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
   16028                        DAG.getIntPtrConstant(0));
   16029   }
   16030 
   16031   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
   16032          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   16033   assert((DstVT == MVT::i64 ||
   16034           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
   16035          "Unexpected custom BITCAST");
   16036   // i64 <=> MMX conversions are Legal.
   16037   if (SrcVT==MVT::i64 && DstVT.isVector())
   16038     return Op;
   16039   if (DstVT==MVT::i64 && SrcVT.isVector())
   16040     return Op;
   16041   // MMX <=> MMX conversions are Legal.
   16042   if (SrcVT.isVector() && DstVT.isVector())
   16043     return Op;
   16044   // All other conversions need to be expanded.
   16045   return SDValue();
   16046 }
   16047 
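        // Lower ATOMIC_LOAD_SUB to an ATOMIC_LOAD_ADD of the negated operand,
        // reusing the fetch-and-add lowering.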
   16048 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
   16049   SDNode *Node = Op.getNode();
   16050   SDLoc dl(Node);
   16051   EVT T = Node->getValueType(0);
   16052   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
   16053                               DAG.getConstant(0, T), Node->getOperand(2));
   16054   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
   16055                        cast<AtomicSDNode>(Node)->getMemoryVT(),
   16056                        Node->getOperand(0),
   16057                        Node->getOperand(1), negOp,
   16058                        cast<AtomicSDNode>(Node)->getMemOperand(),
   16059                        cast<AtomicSDNode>(Node)->getOrdering(),
   16060                        cast<AtomicSDNode>(Node)->getSynchScope());
   16061 }
   16062 
   16063 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
   16064   SDNode *Node = Op.getNode();
   16065   SDLoc dl(Node);
   16066   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
   16067 
   16068   // Convert seq_cst store -> xchg
   16069   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
   16070   // FIXME: On 32-bit, store -> fist or movq would be more efficient
   16071   //        (The only way to get a 16-byte store is cmpxchg16b)
   16072   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
   16073   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
   16074       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
   16075     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
   16076                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
   16077                                  Node->getOperand(0),
   16078                                  Node->getOperand(1), Node->getOperand(2),
   16079                                  cast<AtomicSDNode>(Node)->getMemOperand(),
   16080                                  cast<AtomicSDNode>(Node)->getOrdering(),
   16081                                  cast<AtomicSDNode>(Node)->getSynchScope());
   16082     return Swap.getValue(1);
   16083   }
   16084   // Other atomic stores have a simple pattern.
   16085   return Op;
   16086 }
   16087 
   16088 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   16089   EVT VT = Op.getNode()->getSimpleValueType(0);
   16090 
   16091   // Let legalize expand this if it isn't a legal type yet.
   16092   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   16093     return SDValue();
   16094 
   16095   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   16096 
   16097   unsigned Opc;
   16098   bool ExtraOp = false;
   16099   switch (Op.getOpcode()) {
   16100   default: llvm_unreachable("Invalid code");
   16101   case ISD::ADDC: Opc = X86ISD::ADD; break;
   16102   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
   16103   case ISD::SUBC: Opc = X86ISD::SUB; break;
   16104   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
   16105   }
   16106 
   16107   if (!ExtraOp)
   16108     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
   16109                        Op.getOperand(1));
   16110   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
   16111                      Op.getOperand(1), Op.getOperand(2));
   16112 }
   16113 
   16114 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
   16115                             SelectionDAG &DAG) {
   16116   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
   16117 
   16118   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
   16119   // which returns the values as { float, float } (in XMM0) or
   16120   // { double, double } (which is returned in XMM0, XMM1).
   16121   SDLoc dl(Op);
   16122   SDValue Arg = Op.getOperand(0);
   16123   EVT ArgVT = Arg.getValueType();
   16124   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   16125 
   16126   TargetLowering::ArgListTy Args;
   16127   TargetLowering::ArgListEntry Entry;
   16128 
   16129   Entry.Node = Arg;
   16130   Entry.Ty = ArgTy;
   16131   Entry.isSExt = false;
   16132   Entry.isZExt = false;
   16133   Args.push_back(Entry);
   16134 
   16135   bool isF64 = ArgVT == MVT::f64;
   16136   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
   16137   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   16138   // the results are returned via SRet in memory.
   16139   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
   16140   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   16141   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
   16142 
   16143   Type *RetTy = isF64
   16144     ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
   16145     : (Type*)VectorType::get(ArgTy, 4);
   16146 
   16147   TargetLowering::CallLoweringInfo CLI(DAG);
   16148   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
   16149     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
   16150 
   16151   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
   16152 
   16153   if (isF64)
   16154     // Returned in xmm0 and xmm1.
   16155     return CallResult.first;
   16156 
  16157   // Returned in bits 0:31 and 32:63 of xmm0.
   16158   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   16159                                CallResult.first, DAG.getIntPtrConstant(0));
   16160   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   16161                                CallResult.first, DAG.getIntPtrConstant(1));
   16162   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
   16163   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
   16164 }
   16165 
   16166 /// LowerOperation - Provide custom lowering hooks for some operations.
   16167 ///
   16168 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   16169   switch (Op.getOpcode()) {
   16170   default: llvm_unreachable("Should not custom lower this!");
   16171   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
   16172   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   16173   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
   16174     return LowerCMP_SWAP(Op, Subtarget, DAG);
   16175   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   16176   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   16177   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   16178   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   16179   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
   16180   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   16181   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   16182   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   16183   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
   16184   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
   16185   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
   16186   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
   16187   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   16188   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   16189   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
   16190   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   16191   case ISD::SHL_PARTS:
   16192   case ISD::SRA_PARTS:
   16193   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   16194   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   16195   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   16196   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
   16197   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
   16198   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
   16199   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
   16200   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   16201   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   16202   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
   16203   case ISD::FABS:               return LowerFABS(Op, DAG);
   16204   case ISD::FNEG:               return LowerFNEG(Op, DAG);
   16205   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   16206   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   16207   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   16208   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   16209   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   16210   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   16211   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   16212   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   16213   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   16214   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   16215   case ISD::INTRINSIC_VOID:
   16216   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   16217   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   16218   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   16219   case ISD::FRAME_TO_ARGS_OFFSET:
   16220                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
   16221   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   16222   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
   16223   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
   16224   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
   16225   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   16226   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   16227   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   16228   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   16229   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
   16230   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   16231   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   16232   case ISD::UMUL_LOHI:
   16233   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
   16234   case ISD::SRA:
   16235   case ISD::SRL:
   16236   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
   16237   case ISD::SADDO:
   16238   case ISD::UADDO:
   16239   case ISD::SSUBO:
   16240   case ISD::USUBO:
   16241   case ISD::SMULO:
   16242   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   16243   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
   16244   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
   16245   case ISD::ADDC:
   16246   case ISD::ADDE:
   16247   case ISD::SUBC:
   16248   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   16249   case ISD::ADD:                return LowerADD(Op, DAG);
   16250   case ISD::SUB:                return LowerSUB(Op, DAG);
   16251   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   16252   }
   16253 }
   16254 
   16255 static void ReplaceATOMIC_LOAD(SDNode *Node,
   16256                                SmallVectorImpl<SDValue> &Results,
   16257                                SelectionDAG &DAG) {
   16258   SDLoc dl(Node);
   16259   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
   16260 
   16261   // Convert wide load -> cmpxchg8b/cmpxchg16b
   16262   // FIXME: On 32-bit, load -> fild or movq would be more efficient
   16263   //        (The only way to get a 16-byte load is cmpxchg16b)
   16264   // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
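          // A compare-and-swap with expected == new == 0 never changes memory but
          // always returns the old value, which implements the atomic wide load.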
   16265   SDValue Zero = DAG.getConstant(0, VT);
   16266   SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
   16267   SDValue Swap =
   16268       DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
   16269                            Node->getOperand(0), Node->getOperand(1), Zero, Zero,
   16270                            cast<AtomicSDNode>(Node)->getMemOperand(),
   16271                            cast<AtomicSDNode>(Node)->getOrdering(),
   16272                            cast<AtomicSDNode>(Node)->getOrdering(),
   16273                            cast<AtomicSDNode>(Node)->getSynchScope());
   16274   Results.push_back(Swap.getValue(0));
   16275   Results.push_back(Swap.getValue(2));
   16276 }
   16277 
   16278 /// ReplaceNodeResults - Replace a node with an illegal result type
   16279 /// with a new node built out of custom code.
   16280 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   16281                                            SmallVectorImpl<SDValue>&Results,
   16282                                            SelectionDAG &DAG) const {
   16283   SDLoc dl(N);
   16284   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   16285   switch (N->getOpcode()) {
   16286   default:
   16287     llvm_unreachable("Do not know how to custom type legalize this operation!");
   16288   case ISD::SIGN_EXTEND_INREG:
   16289   case ISD::ADDC:
   16290   case ISD::ADDE:
   16291   case ISD::SUBC:
   16292   case ISD::SUBE:
   16293     // We don't want to expand or promote these.
   16294     return;
   16295   case ISD::SDIV:
   16296   case ISD::UDIV:
   16297   case ISD::SREM:
   16298   case ISD::UREM:
   16299   case ISD::SDIVREM:
   16300   case ISD::UDIVREM: {
   16301     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
   16302     Results.push_back(V);
   16303     return;
   16304   }
   16305   case ISD::FP_TO_SINT:
   16306   case ISD::FP_TO_UINT: {
   16307     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
   16308 
   16309     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
   16310       return;
   16311 
   16312     std::pair<SDValue,SDValue> Vals =
   16313         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
   16314     SDValue FIST = Vals.first, StackSlot = Vals.second;
   16315     if (FIST.getNode()) {
   16316       EVT VT = N->getValueType(0);
   16317       // Return a load from the stack slot.
   16318       if (StackSlot.getNode())
   16319         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
   16320                                       MachinePointerInfo(),
   16321                                       false, false, false, 0));
   16322       else
   16323         Results.push_back(FIST);
   16324     }
   16325     return;
   16326   }
   16327   case ISD::UINT_TO_FP: {
   16328     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   16329     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
   16330         N->getValueType(0) != MVT::v2f32)
   16331       return;
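            // 0x4330000000000000 is the double 2^52. OR-ing a zero-extended
            // 32-bit value into its mantissa produces exactly 2^52 + x, so
            // subtracting the bias leaves x converted to double; VFPROUND then
            // narrows the result to f32.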
   16332     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
   16333                                  N->getOperand(0));
   16334     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
   16335                                      MVT::f64);
   16336     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
   16337     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
   16338                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
   16339     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
   16340     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
   16341     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
   16342     return;
   16343   }
   16344   case ISD::FP_ROUND: {
   16345     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
   16346         return;
   16347     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
   16348     Results.push_back(V);
   16349     return;
   16350   }
   16351   case ISD::INTRINSIC_W_CHAIN: {
   16352     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   16353     switch (IntNo) {
   16354     default : llvm_unreachable("Do not know how to custom type "
   16355                                "legalize this intrinsic operation!");
   16356     case Intrinsic::x86_rdtsc:
   16357       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   16358                                      Results);
   16359     case Intrinsic::x86_rdtscp:
   16360       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
   16361                                      Results);
   16362     case Intrinsic::x86_rdpmc:
   16363       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
   16364     }
   16365   }
   16366   case ISD::READCYCLECOUNTER: {
   16367     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   16368                                    Results);
   16369   }
   16370   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
   16371     EVT T = N->getValueType(0);
   16372     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
   16373     bool Regs64bit = T == MVT::i128;
   16374     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
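            // CMPXCHG8B/CMPXCHG16B take the expected value in EDX:EAX (RDX:RAX)
            // and the replacement in ECX:EBX (RCX:RBX); the old value comes back
            // in EDX:EAX (RDX:RAX), with ZF reporting success.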
   16375     SDValue cpInL, cpInH;
   16376     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   16377                         DAG.getConstant(0, HalfT));
   16378     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   16379                         DAG.getConstant(1, HalfT));
   16380     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
   16381                              Regs64bit ? X86::RAX : X86::EAX,
   16382                              cpInL, SDValue());
   16383     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
   16384                              Regs64bit ? X86::RDX : X86::EDX,
   16385                              cpInH, cpInL.getValue(1));
   16386     SDValue swapInL, swapInH;
   16387     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   16388                           DAG.getConstant(0, HalfT));
   16389     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   16390                           DAG.getConstant(1, HalfT));
   16391     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
   16392                                Regs64bit ? X86::RBX : X86::EBX,
   16393                                swapInL, cpInH.getValue(1));
   16394     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
   16395                                Regs64bit ? X86::RCX : X86::ECX,
   16396                                swapInH, swapInL.getValue(1));
   16397     SDValue Ops[] = { swapInH.getValue(0),
   16398                       N->getOperand(1),
   16399                       swapInH.getValue(1) };
   16400     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   16401     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
   16402     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
   16403                                   X86ISD::LCMPXCHG8_DAG;
   16404     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
   16405     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
   16406                                         Regs64bit ? X86::RAX : X86::EAX,
   16407                                         HalfT, Result.getValue(1));
   16408     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
   16409                                         Regs64bit ? X86::RDX : X86::EDX,
   16410                                         HalfT, cpOutL.getValue(2));
   16411     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
   16412 
   16413     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
   16414                                         MVT::i32, cpOutH.getValue(2));
   16415     SDValue Success =
   16416         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   16417                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
   16418     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
   16419 
   16420     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
   16421     Results.push_back(Success);
   16422     Results.push_back(EFLAGS.getValue(1));
   16423     return;
   16424   }
   16425   case ISD::ATOMIC_SWAP:
   16426   case ISD::ATOMIC_LOAD_ADD:
   16427   case ISD::ATOMIC_LOAD_SUB:
   16428   case ISD::ATOMIC_LOAD_AND:
   16429   case ISD::ATOMIC_LOAD_OR:
   16430   case ISD::ATOMIC_LOAD_XOR:
   16431   case ISD::ATOMIC_LOAD_NAND:
   16432   case ISD::ATOMIC_LOAD_MIN:
   16433   case ISD::ATOMIC_LOAD_MAX:
   16434   case ISD::ATOMIC_LOAD_UMIN:
   16435   case ISD::ATOMIC_LOAD_UMAX:
   16436     // Delegate to generic TypeLegalization. Situations we can really handle
   16437     // should have already been dealt with by X86AtomicExpand.cpp.
   16438     break;
   16439   case ISD::ATOMIC_LOAD: {
   16440     ReplaceATOMIC_LOAD(N, Results, DAG);
   16441     return;
   16442   }
   16443   case ISD::BITCAST: {
   16444     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   16445     EVT DstVT = N->getValueType(0);
   16446     EVT SrcVT = N->getOperand(0)->getValueType(0);
   16447 
   16448     if (SrcVT != MVT::f64 ||
   16449         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
   16450       return;
   16451 
   16452     unsigned NumElts = DstVT.getVectorNumElements();
   16453     EVT SVT = DstVT.getVectorElementType();
   16454     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   16455     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   16456                                    MVT::v2f64, N->getOperand(0));
   16457     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
   16458 
   16459     if (ExperimentalVectorWideningLegalization) {
   16460       // If we are legalizing vectors by widening, we already have the desired
   16461       // legal vector type, just return it.
   16462       Results.push_back(ToVecInt);
   16463       return;
   16464     }
   16465 
   16466     SmallVector<SDValue, 8> Elts;
   16467     for (unsigned i = 0, e = NumElts; i != e; ++i)
   16468       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
   16469                                    ToVecInt, DAG.getIntPtrConstant(i)));
   16470 
   16471     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
   16472   }
   16473   }
   16474 }
   16475 
   16476 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   16477   switch (Opcode) {
   16478   default: return nullptr;
   16479   case X86ISD::BSF:                return "X86ISD::BSF";
   16480   case X86ISD::BSR:                return "X86ISD::BSR";
   16481   case X86ISD::SHLD:               return "X86ISD::SHLD";
   16482   case X86ISD::SHRD:               return "X86ISD::SHRD";
   16483   case X86ISD::FAND:               return "X86ISD::FAND";
   16484   case X86ISD::FANDN:              return "X86ISD::FANDN";
   16485   case X86ISD::FOR:                return "X86ISD::FOR";
   16486   case X86ISD::FXOR:               return "X86ISD::FXOR";
   16487   case X86ISD::FSRL:               return "X86ISD::FSRL";
   16488   case X86ISD::FILD:               return "X86ISD::FILD";
   16489   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
   16490   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
   16491   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
   16492   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
   16493   case X86ISD::FLD:                return "X86ISD::FLD";
   16494   case X86ISD::FST:                return "X86ISD::FST";
   16495   case X86ISD::CALL:               return "X86ISD::CALL";
   16496   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
   16497   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
   16498   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
   16499   case X86ISD::BT:                 return "X86ISD::BT";
   16500   case X86ISD::CMP:                return "X86ISD::CMP";
   16501   case X86ISD::COMI:               return "X86ISD::COMI";
   16502   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   16503   case X86ISD::CMPM:               return "X86ISD::CMPM";
   16504   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   16505   case X86ISD::SETCC:              return "X86ISD::SETCC";
   16506   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   16507   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
   16508   case X86ISD::CMOV:               return "X86ISD::CMOV";
   16509   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   16510   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
   16511   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
   16512   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
   16513   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   16514   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
   16515   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
   16516   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
   16517   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
   16518   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   16519   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   16520   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
   16521   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   16522   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   16523   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
   16524   case X86ISD::BLENDV:             return "X86ISD::BLENDV";
   16525   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
   16526   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   16527   case X86ISD::HADD:               return "X86ISD::HADD";
   16528   case X86ISD::HSUB:               return "X86ISD::HSUB";
   16529   case X86ISD::FHADD:              return "X86ISD::FHADD";
   16530   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
   16531   case X86ISD::UMAX:               return "X86ISD::UMAX";
   16532   case X86ISD::UMIN:               return "X86ISD::UMIN";
   16533   case X86ISD::SMAX:               return "X86ISD::SMAX";
   16534   case X86ISD::SMIN:               return "X86ISD::SMIN";
   16535   case X86ISD::FMAX:               return "X86ISD::FMAX";
   16536   case X86ISD::FMIN:               return "X86ISD::FMIN";
   16537   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
   16538   case X86ISD::FMINC:              return "X86ISD::FMINC";
   16539   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
   16540   case X86ISD::FRCP:               return "X86ISD::FRCP";
   16541   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   16542   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
   16543   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
   16544   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
   16545   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
   16546   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   16547   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   16548   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
   16549   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
   16550   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
   16551   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
   16552   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
   16553   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   16554   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   16555   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
   16556   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
   16557   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
   16558   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
   16559   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   16560   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   16561   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   16562   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   16563   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   16564   case X86ISD::VSHL:               return "X86ISD::VSHL";
   16565   case X86ISD::VSRL:               return "X86ISD::VSRL";
   16566   case X86ISD::VSRA:               return "X86ISD::VSRA";
   16567   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
   16568   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
   16569   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
   16570   case X86ISD::CMPP:               return "X86ISD::CMPP";
   16571   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   16572   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
   16573   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
   16574   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
   16575   case X86ISD::ADD:                return "X86ISD::ADD";
   16576   case X86ISD::SUB:                return "X86ISD::SUB";
   16577   case X86ISD::ADC:                return "X86ISD::ADC";
   16578   case X86ISD::SBB:                return "X86ISD::SBB";
   16579   case X86ISD::SMUL:               return "X86ISD::SMUL";
   16580   case X86ISD::UMUL:               return "X86ISD::UMUL";
   16581   case X86ISD::INC:                return "X86ISD::INC";
   16582   case X86ISD::DEC:                return "X86ISD::DEC";
   16583   case X86ISD::OR:                 return "X86ISD::OR";
   16584   case X86ISD::XOR:                return "X86ISD::XOR";
   16585   case X86ISD::AND:                return "X86ISD::AND";
   16586   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   16587   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   16588   case X86ISD::PTEST:              return "X86ISD::PTEST";
   16589   case X86ISD::TESTP:              return "X86ISD::TESTP";
   16590   case X86ISD::TESTM:              return "X86ISD::TESTM";
   16591   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
   16592   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
   16593   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
   16594   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
   16595   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   16596   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   16597   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
   16598   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
   16599   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
   16600   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
   16601   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
   16602   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
   16603   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
   16604   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
   16605   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
   16606   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
   16607   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
   16608   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
   16609   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
   16610   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   16611   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   16612   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   16613   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   16614   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
   16615   case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
   16616   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   16617   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
   16618   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   16619   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
   16620   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   16621   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   16622   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
   16623   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   16624   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   16625   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
   16626   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
   16627   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
   16628   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
   16629   case X86ISD::SAHF:               return "X86ISD::SAHF";
   16630   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
   16631   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
   16632   case X86ISD::FMADD:              return "X86ISD::FMADD";
   16633   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
   16634   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
   16635   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
   16636   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
   16637   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
   16638   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
   16639   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
   16640   case X86ISD::XTEST:              return "X86ISD::XTEST";
   16641   }
   16642 }
   16643 
   16644 // isLegalAddressingMode - Return true if the addressing mode represented
   16645 // by AM is legal for this target, for a load/store of the specified type.
   16646 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
   16647                                               Type *Ty) const {
   16648   // X86 supports extremely general addressing modes.
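           // The general form is base + index*scale + disp32.  For example, the load
           //   movl 4(%rdi,%rcx,8), %eax
           // folds base=RDI, index=RCX, scale=8 and disp=4 into one instruction.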
   16649   CodeModel::Model M = getTargetMachine().getCodeModel();
   16650   Reloc::Model R = getTargetMachine().getRelocationModel();
   16651 
   16652   // X86 allows a sign-extended 32-bit immediate field as a displacement.
   16653   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
   16654     return false;
   16655 
   16656   if (AM.BaseGV) {
   16657     unsigned GVFlags =
   16658       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
   16659 
   16660     // If a reference to this global requires an extra load, we can't fold it.
   16661     if (isGlobalStubReference(GVFlags))
   16662       return false;
   16663 
   16664     // If BaseGV requires a register for the PIC base, we cannot also have a
   16665     // BaseReg specified.
   16666     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
   16667       return false;
   16668 
   16669     // If lower 4G is not available, then we must use rip-relative addressing.
   16670     if ((M != CodeModel::Small || R != Reloc::Static) &&
   16671         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
   16672       return false;
   16673   }
   16674 
   16675   switch (AM.Scale) {
   16676   case 0:
   16677   case 1:
   16678   case 2:
   16679   case 4:
   16680   case 8:
   16681     // These scales always work.
   16682     break;
   16683   case 3:
   16684   case 5:
   16685   case 9:
   16686     // These scales are formed with basereg+scalereg.  Only accept if there is
   16687     // no basereg yet.
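             // (e.g. x*9 is matched as x + x*8, a single LEA, which itself uses up
             // the base-register slot).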
   16688     if (AM.HasBaseReg)
   16689       return false;
   16690     break;
   16691   default:  // Other stuff never works.
   16692     return false;
   16693   }
   16694 
   16695   return true;
   16696 }
   16697 
   16698 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
   16699   unsigned Bits = Ty->getScalarSizeInBits();
   16700 
   16701   // 8-bit shifts are always expensive, and versions with a scalar amount
   16702   // aren't noticeably cheaper than those with a vector amount.
   16703   if (Bits == 8)
   16704     return false;
   16705 
   16706   // On AVX2 there are new vpsllv[dq] instructions (and other variable-shift
   16707   // forms) that make variable shifts just as cheap as scalar ones.
   16708   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
   16709     return false;
   16710 
   16711   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
   16712   // fully general vector.
   16713   return true;
   16714 }
   16715 
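         // Integer truncation is free on x86 because the narrow value is simply the
         // low subregister of the wide one (e.g. %eax is the low half of %rax).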
   16716 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   16717   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   16718     return false;
   16719   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
   16720   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
   16721   return NumBits1 > NumBits2;
   16722 }
   16723 
   16724 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
   16725   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   16726     return false;
   16727 
   16728   if (!isTypeLegal(EVT::getEVT(Ty1)))
   16729     return false;
   16730 
   16731   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
   16732 
   16733   // Assuming the caller doesn't have a zeroext or signext return parameter,
   16734   // truncation all the way down to i1 is valid.
   16735   return true;
   16736 }
   16737 
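         // CMP (and ADD/SUB below) encode at most a sign-extended 32-bit immediate,
         // so larger 64-bit constants must first be materialized in a register.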
   16738 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   16739   return isInt<32>(Imm);
   16740 }
   16741 
   16742 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
   16743   // Can also use sub to handle negated immediates.
   16744   return isInt<32>(Imm);
   16745 }
   16746 
   16747 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   16748   if (!VT1.isInteger() || !VT2.isInteger())
   16749     return false;
   16750   unsigned NumBits1 = VT1.getSizeInBits();
   16751   unsigned NumBits2 = VT2.getSizeInBits();
   16752   return NumBits1 > NumBits2;
   16753 }
   16754 
   16755 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   16756   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
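           // (a write to a 32-bit register clears bits 63:32 of the containing
           // 64-bit register, so no separate movzx is needed).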
   16757   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
   16758 }
   16759 
   16760 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
   16761   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   16762   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
   16763 }
   16764 
   16765 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   16766   EVT VT1 = Val.getValueType();
   16767   if (isZExtFree(VT1, VT2))
   16768     return true;
   16769 
   16770   if (Val.getOpcode() != ISD::LOAD)
   16771     return false;
   16772 
   16773   if (!VT1.isSimple() || !VT1.isInteger() ||
   16774       !VT2.isSimple() || !VT2.isInteger())
   16775     return false;
   16776 
   16777   switch (VT1.getSimpleVT().SimpleTy) {
   16778   default: break;
   16779   case MVT::i8:
   16780   case MVT::i16:
   16781   case MVT::i32:
   16782     // X86 has 8, 16, and 32-bit zero-extending loads.
   16783     return true;
   16784   }
   16785 
   16786   return false;
   16787 }
   16788 
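         // Forming fma from fmul+fadd is profitable only when the target actually
         // has FMA instructions (FMA3 or FMA4), and only for f32/f64 types.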
   16789 bool
   16790 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   16791   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
   16792     return false;
   16793 
   16794   VT = VT.getScalarType();
   16795 
   16796   if (!VT.isSimple())
   16797     return false;
   16798 
   16799   switch (VT.getSimpleVT().SimpleTy) {
   16800   case MVT::f32:
   16801   case MVT::f64:
   16802     return true;
   16803   default:
   16804     break;
   16805   }
   16806 
   16807   return false;
   16808 }
   16809 
   16810 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
   16811   // i16 instructions are longer (0x66 prefix) and potentially slower.
   16812   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
   16813 }
   16814 
   16815 /// isShuffleMaskLegal - Targets can use this to indicate that they only
   16816 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
   16817 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
   16818 /// are assumed to be legal.
   16819 bool
   16820 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   16821                                       EVT VT) const {
   16822   if (!VT.isSimple())
   16823     return false;
   16824 
   16825   MVT SVT = VT.getSimpleVT();
   16826 
   16827   // Very little shuffling can be done for 64-bit vectors right now.
   16828   if (VT.getSizeInBits() == 64)
   16829     return false;
   16830 
   16831   // If this is a single-input shuffle with no 128-bit lane crossings, we can
   16832   // lower it into pshufb.
   16833   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
   16834       (SVT.is256BitVector() && Subtarget->hasInt256())) {
   16835     bool isLegal = true;
   16836     for (unsigned I = 0, E = M.size(); I != E; ++I) {
   16837       if (M[I] >= (int)SVT.getVectorNumElements() ||
   16838           ShuffleCrosses128bitLane(SVT, I, M[I])) {
   16839         isLegal = false;
   16840         break;
   16841       }
   16842     }
   16843     if (isLegal)
   16844       return true;
   16845   }
   16846 
   16847   // FIXME: blends, shifts.
   16848   return (SVT.getVectorNumElements() == 2 ||
   16849           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
   16850           isMOVLMask(M, SVT) ||
   16851           isSHUFPMask(M, SVT) ||
   16852           isPSHUFDMask(M, SVT) ||
   16853           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
   16854           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
   16855           isPALIGNRMask(M, SVT, Subtarget) ||
   16856           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
   16857           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
   16858           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
   16859           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
   16860           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()));
   16861 }
   16862 
   16863 bool
   16864 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
   16865                                           EVT VT) const {
   16866   if (!VT.isSimple())
   16867     return false;
   16868 
   16869   MVT SVT = VT.getSimpleVT();
   16870   unsigned NumElts = SVT.getVectorNumElements();
   16871   // FIXME: This collection of masks seems suspect.
   16872   if (NumElts == 2)
   16873     return true;
   16874   if (NumElts == 4 && SVT.is128BitVector()) {
   16875     return (isMOVLMask(Mask, SVT)  ||
   16876             isCommutedMOVLMask(Mask, SVT, true) ||
   16877             isSHUFPMask(Mask, SVT) ||
   16878             isSHUFPMask(Mask, SVT, /* Commuted */ true));
   16879   }
   16880   return false;
   16881 }
   16882 
   16883 //===----------------------------------------------------------------------===//
   16884 //                           X86 Scheduler Hooks
   16885 //===----------------------------------------------------------------------===//
   16886 
   16887 /// Utility function to emit xbegin specifying the start of an RTM region.
   16888 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
   16889                                      const TargetInstrInfo *TII) {
   16890   DebugLoc DL = MI->getDebugLoc();
   16891 
   16892   const BasicBlock *BB = MBB->getBasicBlock();
   16893   MachineFunction::iterator I = MBB;
   16894   ++I;
   16895 
   16896   // For v = xbegin(), we generate
   16897   //
   16898   // thisMBB:
   16899   //  xbegin sinkMBB
   16900   //
   16901   // mainMBB:
   16902   //  eax = -1
   16903   //
   16904   // sinkMBB:
   16905   //  v = eax
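           //
           // On a transactional abort the CPU transfers control to the fallback
           // label (sinkMBB) with the abort status already in EAX; on successful
           // entry we fall through to mainMBB, which sets EAX to -1 (XBEGIN_STARTED).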
   16906 
   16907   MachineBasicBlock *thisMBB = MBB;
   16908   MachineFunction *MF = MBB->getParent();
   16909   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   16910   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   16911   MF->insert(I, mainMBB);
   16912   MF->insert(I, sinkMBB);
   16913 
   16914   // Transfer the remainder of BB and its successor edges to sinkMBB.
   16915   sinkMBB->splice(sinkMBB->begin(), MBB,
   16916                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   16917   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   16918 
   16919   // thisMBB:
   16920   //  xbegin sinkMBB
   16921   //  # fallthrough to mainMBB
   16922   //  # on abort, control transfers to sinkMBB
   16923   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
   16924   thisMBB->addSuccessor(mainMBB);
   16925   thisMBB->addSuccessor(sinkMBB);
   16926 
   16927   // mainMBB:
   16928   //  EAX = -1
   16929   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
   16930   mainMBB->addSuccessor(sinkMBB);
   16931 
   16932   // sinkMBB:
   16933   // EAX is live into the sinkMBB
   16934   sinkMBB->addLiveIn(X86::EAX);
   16935   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   16936           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   16937     .addReg(X86::EAX);
   16938 
   16939   MI->eraseFromParent();
   16940   return sinkMBB;
   16941 }
   16942 
   16943 // FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
   16944 // or XMM0_V32I8 in AVX, all of this code can be replaced with patterns
   16945 // in the .td file.
   16946 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
   16947                                        const TargetInstrInfo *TII) {
   16948   unsigned Opc;
   16949   switch (MI->getOpcode()) {
   16950   default: llvm_unreachable("illegal opcode!");
   16951   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
   16952   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
   16953   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
   16954   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
   16955   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
   16956   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
   16957   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
   16958   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
   16959   }
   16960 
   16961   DebugLoc dl = MI->getDebugLoc();
   16962   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   16963 
   16964   unsigned NumArgs = MI->getNumOperands();
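           // Copy every explicit operand except operand 0 (the result, which the
           // instruction defines in XMM0 and which is copied out below).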
   16965   for (unsigned i = 1; i < NumArgs; ++i) {
   16966     MachineOperand &Op = MI->getOperand(i);
   16967     if (!(Op.isReg() && Op.isImplicit()))
   16968       MIB.addOperand(Op);
   16969   }
   16970   if (MI->hasOneMemOperand())
   16971     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
   16972 
   16973   BuildMI(*BB, MI, dl,
   16974     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   16975     .addReg(X86::XMM0);
   16976 
   16977   MI->eraseFromParent();
   16978   return BB;
   16979 }
   16980 
   16981 // FIXME: Custom handling because TableGen doesn't support multiple implicit
   16982 // defs in an instruction pattern
   16983 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
   16984                                        const TargetInstrInfo *TII) {
   16985   unsigned Opc;
   16986   switch (MI->getOpcode()) {
   16987   default: llvm_unreachable("illegal opcode!");
   16988   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
   16989   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
   16990   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
   16991   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
   16992   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
   16993   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
   16994   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
   16995   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
   16996   }
   16997 
   16998   DebugLoc dl = MI->getDebugLoc();
   16999   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   17000 
   17001   unsigned NumArgs = MI->getNumOperands(); // operand 0 is the result; skipped below
   17002   for (unsigned i = 1; i < NumArgs; ++i) {
   17003     MachineOperand &Op = MI->getOperand(i);
   17004     if (!(Op.isReg() && Op.isImplicit()))
   17005       MIB.addOperand(Op);
   17006   }
   17007   if (MI->hasOneMemOperand())
   17008     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
   17009 
   17010   BuildMI(*BB, MI, dl,
   17011     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   17012     .addReg(X86::ECX);
   17013 
   17014   MI->eraseFromParent();
   17015   return BB;
   17016 }
   17017 
   17018 static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
   17019                                        const TargetInstrInfo *TII,
   17020                                        const X86Subtarget* Subtarget) {
   17021   DebugLoc dl = MI->getDebugLoc();
   17022 
   17023   // Address into RAX/EAX, other two args into ECX, EDX.
   17024   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
   17025   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   17026   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
   17027   for (int i = 0; i < X86::AddrNumOperands; ++i)
   17028     MIB.addOperand(MI->getOperand(i));
   17029 
   17030   unsigned ValOps = X86::AddrNumOperands;
   17031   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
   17032     .addReg(MI->getOperand(ValOps).getReg());
   17033   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
   17034     .addReg(MI->getOperand(ValOps+1).getReg());
   17035 
   17036   // MONITOR takes no explicit operands; EAX/RAX, ECX and EDX are implicit.
   17037   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
   17038 
   17039   MI->eraseFromParent(); // The pseudo is gone now.
   17040   return BB;
   17041 }
   17042 
   17043 MachineBasicBlock *
   17044 X86TargetLowering::EmitVAARG64WithCustomInserter(
   17045                    MachineInstr *MI,
   17046                    MachineBasicBlock *MBB) const {
   17047   // Emit va_arg instruction on X86-64.
   17048 
   17049   // Operands to this pseudo-instruction:
   17050   // 0  ) Output        : destination address (reg)
   17051   // 1-5) Input         : va_list address (addr, i64mem)
   17052   // 6  ) ArgSize       : Size (in bytes) of vararg type
   17053   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
   17054   // 8  ) Align         : Alignment of type
   17055   // 9  ) EFLAGS (implicit-def)
   17056 
   17057   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
   17058   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
   17059 
   17060   unsigned DestReg = MI->getOperand(0).getReg();
   17061   MachineOperand &Base = MI->getOperand(1);
   17062   MachineOperand &Scale = MI->getOperand(2);
   17063   MachineOperand &Index = MI->getOperand(3);
   17064   MachineOperand &Disp = MI->getOperand(4);
   17065   MachineOperand &Segment = MI->getOperand(5);
   17066   unsigned ArgSize = MI->getOperand(6).getImm();
   17067   unsigned ArgMode = MI->getOperand(7).getImm();
   17068   unsigned Align = MI->getOperand(8).getImm();
   17069 
   17070   // Memory Reference
   17071   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
   17072   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   17073   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   17074 
   17075   // Machine Information
   17076   const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
   17077   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   17078   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
   17079   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
   17080   DebugLoc DL = MI->getDebugLoc();
   17081 
   17082   // struct va_list {
   17083   //   i32   gp_offset
   17084   //   i32   fp_offset
   17085   //   i64   overflow_area (address)
   17086   //   i64   reg_save_area (address)
   17087   // }
   17088   // sizeof(va_list) = 24
   17089   // alignment(va_list) = 8
   17090 
   17091   unsigned TotalNumIntRegs = 6;
   17092   unsigned TotalNumXMMRegs = 8;
   17093   bool UseGPOffset = (ArgMode == 1);
   17094   bool UseFPOffset = (ArgMode == 2);
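           // In the SysV AMD64 reg_save_area the six GP registers come first
           // (6 * 8 = 48 bytes), followed by the eight XMM registers (8 * 16 bytes
           // each), so fp_offset values start at 48.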
   17095   unsigned MaxOffset = TotalNumIntRegs * 8 +
   17096                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
   17097 
   17098   // Align ArgSize to a multiple of 8.
   17099   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
   17100   bool NeedsAlign = (Align > 8);
   17101 
   17102   MachineBasicBlock *thisMBB = MBB;
   17103   MachineBasicBlock *overflowMBB;
   17104   MachineBasicBlock *offsetMBB;
   17105   MachineBasicBlock *endMBB;
   17106 
   17107   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
   17108   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
   17109   unsigned OffsetReg = 0;
   17110 
   17111   if (!UseGPOffset && !UseFPOffset) {
   17112     // If we only pull from the overflow region, we don't need to alter
   17113     // control flow, so no branch is created.
   17114     OffsetDestReg = 0; // unused
   17115     OverflowDestReg = DestReg;
   17116 
   17117     offsetMBB = nullptr;
   17118     overflowMBB = thisMBB;
   17119     endMBB = thisMBB;
   17120   } else {
   17121     // First emit code to check if gp_offset (or fp_offset) is below the bound.
   17122     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
   17123     // If not, pull from overflow_area. (branch to overflowMBB)
   17124     //
   17125     //       thisMBB
   17126     //         |     .
   17127     //         |        .
   17128     //     offsetMBB   overflowMBB
   17129     //         |        .
   17130     //         |     .
   17131     //        endMBB
   17132 
   17133     // Registers for the PHI in endMBB
   17134     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
   17135     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
   17136 
   17137     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   17138     MachineFunction *MF = MBB->getParent();
   17139     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   17140     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   17141     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   17142 
   17143     MachineFunction::iterator MBBIter = MBB;
   17144     ++MBBIter;
   17145 
   17146     // Insert the new basic blocks
   17147     MF->insert(MBBIter, offsetMBB);
   17148     MF->insert(MBBIter, overflowMBB);
   17149     MF->insert(MBBIter, endMBB);
   17150 
   17151     // Transfer the remainder of MBB and its successor edges to endMBB.
   17152     endMBB->splice(endMBB->begin(), thisMBB,
   17153                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
   17154     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   17155 
   17156     // Make offsetMBB and overflowMBB successors of thisMBB
   17157     thisMBB->addSuccessor(offsetMBB);
   17158     thisMBB->addSuccessor(overflowMBB);
   17159 
   17160     // endMBB is a successor of both offsetMBB and overflowMBB
   17161     offsetMBB->addSuccessor(endMBB);
   17162     overflowMBB->addSuccessor(endMBB);
   17163 
   17164     // Load the offset value into a register
   17165     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   17166     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
   17167       .addOperand(Base)
   17168       .addOperand(Scale)
   17169       .addOperand(Index)
   17170       .addDisp(Disp, UseFPOffset ? 4 : 0)
   17171       .addOperand(Segment)
   17172       .setMemRefs(MMOBegin, MMOEnd);
   17173 
   17174     // Check if there is enough room left to pull this argument.
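             // The branch below is taken when OffsetReg >= MaxOffset + 8 - ArgSizeA8;
             // since all quantities are multiples of 8, this is exactly
             // OffsetReg + ArgSizeA8 > MaxOffset.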
   17175     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
   17176       .addReg(OffsetReg)
   17177       .addImm(MaxOffset + 8 - ArgSizeA8);
   17178 
   17179     // Branch to "overflowMBB" if offset >= max
   17180     // Fall through to "offsetMBB" otherwise
   17181     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
   17182       .addMBB(overflowMBB);
   17183   }
   17184 
   17185   // In offsetMBB, emit code to use the reg_save_area.
   17186   if (offsetMBB) {
   17187     assert(OffsetReg != 0);
   17188 
   17189     // Read the reg_save_area address.
   17190     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
   17191     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
   17192       .addOperand(Base)
   17193       .addOperand(Scale)
   17194       .addOperand(Index)
   17195       .addDisp(Disp, 16)
   17196       .addOperand(Segment)
   17197       .setMemRefs(MMOBegin, MMOEnd);
   17198 
   17199     // Zero-extend the offset
   17200     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
   17201     BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
   17202       .addImm(0)
   17203       .addReg(OffsetReg)
   17204       .addImm(X86::sub_32bit);
   17205 
   17206     // Add the offset to the reg_save_area to get the final address.
   17207     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
   17208       .addReg(OffsetReg64)
   17209       .addReg(RegSaveReg);
   17210 
   17211     // Compute the offset for the next argument
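             // (each XMM slot in the save area is 16 bytes, each GP slot 8).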
   17212     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   17213     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
   17214       .addReg(OffsetReg)
   17215       .addImm(UseFPOffset ? 16 : 8);
   17216 
   17217     // Store it back into the va_list.
   17218     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
   17219       .addOperand(Base)
   17220       .addOperand(Scale)
   17221       .addOperand(Index)
   17222       .addDisp(Disp, UseFPOffset ? 4 : 0)
   17223       .addOperand(Segment)
   17224       .addReg(NextOffsetReg)
   17225       .setMemRefs(MMOBegin, MMOEnd);
   17226 
   17227     // Jump to endMBB
   17228     BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
   17229       .addMBB(endMBB);
   17230   }
   17231 
   17232   //
   17233   // Emit code to use overflow area
   17234   //
   17235 
   17236   // Load the overflow_area address into a register.
   17237   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
   17238   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
   17239     .addOperand(Base)
   17240     .addOperand(Scale)
   17241     .addOperand(Index)
   17242     .addDisp(Disp, 8)
   17243     .addOperand(Segment)
   17244     .setMemRefs(MMOBegin, MMOEnd);
   17245 
   17246   // If we need to align it, do so. Otherwise, just copy the address
   17247   // to OverflowDestReg.
   17248   if (NeedsAlign) {
   17249     // Align the overflow address
   17250     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
   17251     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
   17252 
   17253     // aligned_addr = (addr + (align-1)) & ~(align-1)
   17254     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
   17255       .addReg(OverflowAddrReg)
   17256       .addImm(Align-1);
   17257 
   17258     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
   17259       .addReg(TmpReg)
   17260       .addImm(~(uint64_t)(Align-1));
   17261   } else {
   17262     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
   17263       .addReg(OverflowAddrReg);
   17264   }
   17265 
   17266   // Compute the next overflow address after this argument.
   17267   // (the overflow address should be kept 8-byte aligned)
   17268   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
   17269   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
   17270     .addReg(OverflowDestReg)
   17271     .addImm(ArgSizeA8);
   17272 
   17273   // Store the new overflow address.
   17274   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
   17275     .addOperand(Base)
   17276     .addOperand(Scale)
   17277     .addOperand(Index)
   17278     .addDisp(Disp, 8)
   17279     .addOperand(Segment)
   17280     .addReg(NextAddrReg)
   17281     .setMemRefs(MMOBegin, MMOEnd);
   17282 
   17283   // If we branched, emit the PHI to the front of endMBB.
   17284   if (offsetMBB) {
   17285     BuildMI(*endMBB, endMBB->begin(), DL,
   17286             TII->get(X86::PHI), DestReg)
   17287       .addReg(OffsetDestReg).addMBB(offsetMBB)
   17288       .addReg(OverflowDestReg).addMBB(overflowMBB);
   17289   }
   17290 
   17291   // Erase the pseudo instruction
   17292   MI->eraseFromParent();
   17293 
   17294   return endMBB;
   17295 }
   17296 
   17297 MachineBasicBlock *
   17298 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   17299                                                  MachineInstr *MI,
   17300                                                  MachineBasicBlock *MBB) const {
   17301   // Emit code to save XMM registers to the stack. The ABI says that the
   17302   // number of registers to save is given in %al, so it's theoretically
   17303   // possible to do an indirect jump trick to avoid saving all of them;
   17304   // however, this code takes a simpler approach and just executes all
   17305   // of the stores if %al is non-zero. It's less code, and it's probably
   17306   // easier on the hardware branch predictor, and stores aren't all that
   17307   // expensive anyway.
   17308 
   17309   // Create the new basic blocks. One block contains all the XMM stores,
   17310   // and one block is the final destination regardless of whether any
   17311   // stores were performed.
   17312   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   17313   MachineFunction *F = MBB->getParent();
   17314   MachineFunction::iterator MBBIter = MBB;
   17315   ++MBBIter;
   17316   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
   17317   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
   17318   F->insert(MBBIter, XMMSaveMBB);
   17319   F->insert(MBBIter, EndMBB);
   17320 
   17321   // Transfer the remainder of MBB and its successor edges to EndMBB.
   17322   EndMBB->splice(EndMBB->begin(), MBB,
   17323                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   17324   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
   17325 
   17326   // The original block will now fall through to the XMM save block.
   17327   MBB->addSuccessor(XMMSaveMBB);
   17328   // The XMMSaveMBB will fall through to the end block.
   17329   XMMSaveMBB->addSuccessor(EndMBB);
   17330 
   17331   // Now add the instructions.
   17332   const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
   17333   DebugLoc DL = MI->getDebugLoc();
   17334 
   17335   unsigned CountReg = MI->getOperand(0).getReg();
   17336   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
   17337   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
   17338 
   17339   if (!Subtarget->isTargetWin64()) {
   17340     // If %al is 0, branch around the XMM save block.
   17341     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
   17342     BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
   17343     MBB->addSuccessor(EndMBB);
   17344   }
   17345 
   17346   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
   17347   // that was just emitted, but clearly shouldn't be "saved".
   17348   assert((MI->getNumOperands() <= 3 ||
   17349           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
   17350           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
   17351          && "Expected last argument to be EFLAGS");
   17352   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
   17353   // In the XMM save block, save all the XMM argument registers.
   17354   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
   17355     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
   17356     MachineMemOperand *MMO =
   17357       F->getMachineMemOperand(
   17358           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
   17359         MachineMemOperand::MOStore,
   17360         /*Size=*/16, /*Align=*/16);
   17361     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
   17362       .addFrameIndex(RegSaveFrameIndex)
   17363       .addImm(/*Scale=*/1)
   17364       .addReg(/*IndexReg=*/0)
   17365       .addImm(/*Disp=*/Offset)
   17366       .addReg(/*Segment=*/0)
   17367       .addReg(MI->getOperand(i).getReg())
   17368       .addMemOperand(MMO);
   17369   }
   17370 
   17371   MI->eraseFromParent();   // The pseudo instruction is gone now.
   17372 
   17373   return EndMBB;
   17374 }
   17375 
   17376 // The EFLAGS operand of SelectItr might be missing a kill marker
   17377 // because there were multiple uses of EFLAGS, and ISel didn't know
   17378 // which to mark. Figure out whether SelectItr should have had a
   17379 // kill marker, and set it if it should. Returns the correct kill
   17380 // marker value.
   17381 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
   17382                                      MachineBasicBlock* BB,
   17383                                      const TargetRegisterInfo* TRI) {
   17384   // Scan forward through BB for a use/def of EFLAGS.
   17385   MachineBasicBlock::iterator miI(std::next(SelectItr));
   17386   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
   17387     const MachineInstr& mi = *miI;
   17388     if (mi.readsRegister(X86::EFLAGS))
   17389       return false;
   17390     if (mi.definesRegister(X86::EFLAGS))
   17391       break; // Should have kill-flag - update below.
   17392   }
   17393 
   17394   // If we hit the end of the block, check whether EFLAGS is live into a
   17395   // successor.
   17396   if (miI == BB->end()) {
   17397     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
   17398                                           sEnd = BB->succ_end();
   17399          sItr != sEnd; ++sItr) {
   17400       MachineBasicBlock* succ = *sItr;
   17401       if (succ->isLiveIn(X86::EFLAGS))
   17402         return false;
   17403     }
   17404   }
   17405 
   17406   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
   17407   // out. SelectMI should have a kill flag on EFLAGS.
   17408   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
   17409   return true;
   17410 }
   17411 
   17412 MachineBasicBlock *
   17413 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   17414                                      MachineBasicBlock *BB) const {
   17415   const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
   17416   DebugLoc DL = MI->getDebugLoc();
   17417 
   17418   // To "insert" a SELECT_CC instruction, we actually have to insert the
   17419   // diamond control-flow pattern.  The incoming instruction knows the
   17420   // destination vreg to set, the condition code register to branch on, the
   17421   // true/false values to select between, and a branch opcode to use.
   17422   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   17423   MachineFunction::iterator It = BB;
   17424   ++It;
   17425 
   17426   //  thisMBB:
   17427   //  ...
   17428   //   TrueVal = ...
   17429   //   cmpTY ccX, r1, r2
   17430   //   bCC copy1MBB
   17431   //   fallthrough --> copy0MBB
   17432   MachineBasicBlock *thisMBB = BB;
   17433   MachineFunction *F = BB->getParent();
   17434   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
   17435   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
   17436   F->insert(It, copy0MBB);
   17437   F->insert(It, sinkMBB);
   17438 
   17439   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   17440   // live into the sink and copy blocks.
   17441   const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo();
   17442   if (!MI->killsRegister(X86::EFLAGS) &&
   17443       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
   17444     copy0MBB->addLiveIn(X86::EFLAGS);
   17445     sinkMBB->addLiveIn(X86::EFLAGS);
   17446   }
   17447 
   17448   // Transfer the remainder of BB and its successor edges to sinkMBB.
   17449   sinkMBB->splice(sinkMBB->begin(), BB,
   17450                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
   17451   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
   17452 
   17453   // Add the true and fallthrough blocks as its successors.
   17454   BB->addSuccessor(copy0MBB);
   17455   BB->addSuccessor(sinkMBB);
   17456 
   17457   // Create the conditional branch instruction.
   17458   unsigned Opc =
   17459     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
   17460   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
   17461 
   17462   //  copy0MBB:
   17463   //   %FalseValue = ...
   17464   //   # fallthrough to sinkMBB
   17465   copy0MBB->addSuccessor(sinkMBB);
   17466 
   17467   //  sinkMBB:
   17468   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
   17469   //  ...
   17470   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   17471           TII->get(X86::PHI), MI->getOperand(0).getReg())
   17472     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
   17473     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
   17474 
   17475   MI->eraseFromParent();   // The pseudo instruction is gone now.
   17476   return sinkMBB;
   17477 }
   17478 
   17479 MachineBasicBlock *
   17480 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
   17481                                         bool Is64Bit) const {
   17482   MachineFunction *MF = BB->getParent();
   17483   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
   17484   DebugLoc DL = MI->getDebugLoc();
   17485   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   17486 
   17487   assert(MF->shouldSplitStack());
   17488 
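           // Segmented stacks keep the current stacklet's limit in thread-local
           // storage, at %fs:0x70 on x86-64 and %gs:0x30 on x86-32.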
   17489   unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
   17490   unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
   17491 
   17492   // BB:
   17493   //  ... [Till the alloca]
   17494   //  If the stacklet is not large enough, jump to mallocMBB
   17495   //
   17496   // bumpMBB:
   17497   //  Allocate by subtracting from RSP
   17498   //  Jump to continueMBB
   17499   //
   17500   // mallocMBB:
   17501   //  Allocate by call to runtime
   17502   //
   17503   // continueMBB:
   17504   //  ...
   17505   //  [rest of original BB]
   17506   //
   17507 
   17508   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   17509   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   17510   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   17511 
   17512   MachineRegisterInfo &MRI = MF->getRegInfo();
   17513   const TargetRegisterClass *AddrRegClass =
   17514     getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
   17515 
   17516   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   17517     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   17518     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
   17519     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
   17520     sizeVReg = MI->getOperand(1).getReg(),
   17521     physSPReg = Is64Bit ? X86::RSP : X86::ESP;
   17522 
   17523   MachineFunction::iterator MBBIter = BB;
   17524   ++MBBIter;
   17525 
   17526   MF->insert(MBBIter, bumpMBB);
   17527   MF->insert(MBBIter, mallocMBB);
   17528   MF->insert(MBBIter, continueMBB);
   17529 
   17530   continueMBB->splice(continueMBB->begin(), BB,
   17531                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
   17532   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
   17533 
   17534   // Add code to the main basic block to check if the stack limit has been hit,
   17535   // and if so, jump to mallocMBB, otherwise to bumpMBB.
   17536   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
   17537   BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
   17538     .addReg(tmpSPVReg).addReg(sizeVReg);
   17539   BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
   17540     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
   17541     .addReg(SPLimitVReg);
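           // CMPmr sets flags from limit - newSP, so JG is taken when the TLS stack
           // limit is above the prospective stack pointer, i.e. the allocation does
           // not fit in the current stacklet.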
   17542   BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
   17543 
   17544   // bumpMBB simply decreases the stack pointer, since we know the current
   17545   // stacklet has enough space.
   17546   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
   17547     .addReg(SPLimitVReg);
   17548   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
   17549     .addReg(SPLimitVReg);
   17550   BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
   17551 
   17552   // Calls into a routine in libgcc to allocate more space from the heap.
   17553   const uint32_t *RegMask =
   17554     MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
   17555   if (Is64Bit) {
   17556     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
   17557       .addReg(sizeVReg);
   17558     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   17559       .addExternalSymbol("__morestack_allocate_stack_space")
   17560       .addRegMask(RegMask)
   17561       .addReg(X86::RDI, RegState::Implicit)
   17562       .addReg(X86::RAX, RegState::ImplicitDefine);
   17563   } else {
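             // 32-bit: keep the stack 16-byte aligned across the call by reserving
             // 12 bytes, pushing the 4-byte size argument, and popping all 16 bytes
             // afterwards (the ADD32ri below).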
   17564     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
   17565       .addImm(12);
   17566     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
   17567     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
   17568       .addExternalSymbol("__morestack_allocate_stack_space")
   17569       .addRegMask(RegMask)
   17570       .addReg(X86::EAX, RegState::ImplicitDefine);
   17571   }
   17572 
   17573   if (!Is64Bit)
   17574     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
   17575       .addImm(16);
   17576 
   17577   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
   17578     .addReg(Is64Bit ? X86::RAX : X86::EAX);
   17579   BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
   17580 
   17581   // Set up the CFG correctly.
   17582   BB->addSuccessor(bumpMBB);
   17583   BB->addSuccessor(mallocMBB);
   17584   mallocMBB->addSuccessor(continueMBB);
   17585   bumpMBB->addSuccessor(continueMBB);
   17586 
   17587   // Take care of the PHI nodes.
   17588   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
   17589           MI->getOperand(0).getReg())
   17590     .addReg(mallocPtrVReg).addMBB(mallocMBB)
   17591     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
   17592 
   17593   // Delete the original pseudo instruction.
   17594   MI->eraseFromParent();
   17595 
   17596   // And we're done.
   17597   return continueMBB;
   17598 }
   17599 
   17600 MachineBasicBlock *
   17601 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
   17602                                         MachineBasicBlock *BB) const {
   17603   const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
   17604   DebugLoc DL = MI->getDebugLoc();
   17605 
   17606   assert(!Subtarget->isTargetMacho());
   17607 
   17608   // The lowering is pretty easy: we're just emitting the call to _alloca.  The
   17609   // non-trivial part is the implicit def of ESP.
   17610 
   17611   if (Subtarget->isTargetWin64()) {
   17612     if (Subtarget->isTargetCygMing()) {
   17613       // ___chkstk(Mingw64):
   17614       // Clobbers R10, R11, RAX and EFLAGS.
   17615       // Updates RSP.
   17616       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
   17617         .addExternalSymbol("___chkstk")
   17618         .addReg(X86::RAX, RegState::Implicit)
   17619         .addReg(X86::RSP, RegState::Implicit)
   17620         .addReg(X86::RAX, RegState::Define | RegState::Implicit)
   17621         .addReg(X86::RSP, RegState::Define | RegState::Implicit)
   17622         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
   17623     } else {
   17624       // __chkstk(MSVCRT): does not update stack pointer.
   17625       // Clobbers R10, R11 and EFLAGS.
   17626       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
   17627         .addExternalSymbol("__chkstk")
   17628         .addReg(X86::RAX, RegState::Implicit)
   17629         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
   17630       // RAX has the offset to be subtracted from RSP.
   17631       BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
   17632         .addReg(X86::RSP)
   17633         .addReg(X86::RAX);
   17634     }
   17635   } else {
   17636     const char *StackProbeSymbol =
   17637       Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
   17638 
   17639     BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
   17640       .addExternalSymbol(StackProbeSymbol)
   17641       .addReg(X86::EAX, RegState::Implicit)
   17642       .addReg(X86::ESP, RegState::Implicit)
   17643       .addReg(X86::EAX, RegState::Define | RegState::Implicit)
   17644       .addReg(X86::ESP, RegState::Define | RegState::Implicit)
   17645       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
   17646   }
   17647 
   17648   MI->eraseFromParent();   // The pseudo instruction is gone now.
   17649   return BB;
   17650 }
   17651 
   17652 MachineBasicBlock *
   17653 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
   17654                                       MachineBasicBlock *BB) const {
   17655   // This is pretty easy.  We take the value that we received from our load
   17656   // of the relocation, stick it in either RDI (x86-64) or EAX (x86-32), and
   17657   // do an indirect call.  The return value will then be in the normal return
   17658   // register.
   17659   MachineFunction *F = BB->getParent();
   17660   const X86InstrInfo *TII
   17661     = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo());
   17662   DebugLoc DL = MI->getDebugLoc();
   17663 
   17664   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
   17665   assert(MI->getOperand(3).isGlobal() && "This should be a global");
   17666 
   17667   // Get a register mask for the lowered call.
   17668   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   17669   // proper register mask.
   17670   const uint32_t *RegMask =
   17671     F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
   17672   if (Subtarget->is64Bit()) {
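             // 64-bit: load the function pointer RIP-relative and call through RDI.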
   17673     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   17674                                       TII->get(X86::MOV64rm), X86::RDI)
   17675     .addReg(X86::RIP)
   17676     .addImm(0).addReg(0)
   17677     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   17678                       MI->getOperand(3).getTargetFlags())
   17679     .addReg(0);
   17680     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
   17681     addDirectMem(MIB, X86::RDI);
   17682     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
   17683   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
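             // 32-bit, non-PIC: load the function pointer from an absolute address.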
   17684     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   17685                                       TII->get(X86::MOV32rm), X86::EAX)
   17686     .addReg(0)
   17687     .addImm(0).addReg(0)
   17688     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   17689                       MI->getOperand(3).getTargetFlags())
   17690     .addReg(0);
   17691     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   17692     addDirectMem(MIB, X86::EAX);
   17693     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   17694   } else {
   17695     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   17696                                       TII->get(X86::MOV32rm), X86::EAX)
   17697     .addReg(TII->getGlobalBaseReg(F))
   17698     .addImm(0).addReg(0)
   17699     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   17700                       MI->getOperand(3).getTargetFlags())
   17701     .addReg(0);
   17702     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   17703     addDirectMem(MIB, X86::EAX);
   17704     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   17705   }
   17706 
   17707   MI->eraseFromParent(); // The pseudo instruction is gone now.
   17708   return BB;
   17709 }
   17710 
   17711 MachineBasicBlock *
   17712 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   17713                                     MachineBasicBlock *MBB) const {
   17714   DebugLoc DL = MI->getDebugLoc();
   17715   MachineFunction *MF = MBB->getParent();
   17716   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
   17717   MachineRegisterInfo &MRI = MF->getRegInfo();
   17718 
   17719   const BasicBlock *BB = MBB->getBasicBlock();
   17720   MachineFunction::iterator I = MBB;
   17721   ++I;
   17722 
   17723   // Memory Reference
   17724   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   17725   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   17726 
   17727   unsigned DstReg;
   17728   unsigned MemOpndSlot = 0;
   17729 
   17730   unsigned CurOp = 0;
   17731 
   17732   DstReg = MI->getOperand(CurOp++).getReg();
   17733   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   17734   assert(RC->hasType(MVT::i32) && "Invalid destination!");
   17735   unsigned mainDstReg = MRI.createVirtualRegister(RC);
   17736   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
   17737 
   17738   MemOpndSlot = CurOp;
   17739 
   17740   MVT PVT = getPointerTy();
   17741   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   17742          "Invalid Pointer Size!");
   17743 
   17744   // For v = setjmp(buf), we generate
   17745   //
   17746   // thisMBB:
   17747   //  buf[LabelOffset] = restoreMBB
   17748   //  SjLjSetup restoreMBB
   17749   //
   17750   // mainMBB:
   17751   //  v_main = 0
   17752   //
   17753   // sinkMBB:
   17754   //  v = phi(main, restore)
   17755   //
   17756   // restoreMBB:
   17757   //  v_restore = 1
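  //
  // The jump-buffer layout assumed here matches emitEHSjLjLongJmp below:
  // slot 0 holds the frame pointer, slot 1 (LabelOffset) the resume IP
  // stored below, and slot 2 (SPOffset) the stack pointer.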
   17758 
   17759   MachineBasicBlock *thisMBB = MBB;
   17760   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   17761   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   17762   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
   17763   MF->insert(I, mainMBB);
   17764   MF->insert(I, sinkMBB);
   17765   MF->push_back(restoreMBB);
   17766 
   17767   MachineInstrBuilder MIB;
   17768 
   17769   // Transfer the remainder of BB and its successor edges to sinkMBB.
   17770   sinkMBB->splice(sinkMBB->begin(), MBB,
   17771                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   17772   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   17773 
   17774   // thisMBB:
   17775   unsigned PtrStoreOpc = 0;
   17776   unsigned LabelReg = 0;
   17777   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   17778   Reloc::Model RM = MF->getTarget().getRelocationModel();
   17779   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
   17780                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
   17781 
   17782   // Prepare IP either in reg or imm.
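  // With a small code model and no run-time relocation, the address of
  // restoreMBB fits in a sign-extended 32-bit immediate and can be stored
  // directly (MOV64mi32/MOV32mi); otherwise it is first materialized into a
  // register with an LEA.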
   17783   if (!UseImmLabel) {
   17784     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
   17785     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
   17786     LabelReg = MRI.createVirtualRegister(PtrRC);
   17787     if (Subtarget->is64Bit()) {
   17788       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
   17789               .addReg(X86::RIP)
   17790               .addImm(0)
   17791               .addReg(0)
   17792               .addMBB(restoreMBB)
   17793               .addReg(0);
   17794     } else {
   17795       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
   17796       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
   17797               .addReg(XII->getGlobalBaseReg(MF))
   17798               .addImm(0)
   17799               .addReg(0)
   17800               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
   17801               .addReg(0);
   17802     }
   17803   } else
   17804     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
   17805   // Store IP
   17806   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
   17807   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   17808     if (i == X86::AddrDisp)
   17809       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
   17810     else
   17811       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
   17812   }
   17813   if (!UseImmLabel)
   17814     MIB.addReg(LabelReg);
   17815   else
   17816     MIB.addMBB(restoreMBB);
   17817   MIB.setMemRefs(MMOBegin, MMOEnd);
   17818   // Setup
   17819   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
   17820           .addMBB(restoreMBB);
   17821 
   17822   const X86RegisterInfo *RegInfo =
   17823     static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
   17824   MIB.addRegMask(RegInfo->getNoPreservedMask());
   17825   thisMBB->addSuccessor(mainMBB);
   17826   thisMBB->addSuccessor(restoreMBB);
   17827 
   17828   // mainMBB:
   17829   //  v_main = 0
   17830   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
   17831   mainMBB->addSuccessor(sinkMBB);
   17832 
   17833   // sinkMBB:
   17834   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   17835           TII->get(X86::PHI), DstReg)
   17836     .addReg(mainDstReg).addMBB(mainMBB)
   17837     .addReg(restoreDstReg).addMBB(restoreMBB);
   17838 
   17839   // restoreMBB:
   17840   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
   17841   BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
   17842   restoreMBB->addSuccessor(sinkMBB);
   17843 
   17844   MI->eraseFromParent();
   17845   return sinkMBB;
   17846 }
   17847 
   17848 MachineBasicBlock *
   17849 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   17850                                      MachineBasicBlock *MBB) const {
   17851   DebugLoc DL = MI->getDebugLoc();
   17852   MachineFunction *MF = MBB->getParent();
   17853   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
   17854   MachineRegisterInfo &MRI = MF->getRegInfo();
   17855 
   17856   // Memory Reference
   17857   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   17858   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   17859 
   17860   MVT PVT = getPointerTy();
   17861   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   17862          "Invalid Pointer Size!");
   17863 
   17864   const TargetRegisterClass *RC =
   17865     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   17866   unsigned Tmp = MRI.createVirtualRegister(RC);
   17867   // Since FP is only updated here and never referenced, it is treated as a GPR.
   17868   const X86RegisterInfo *RegInfo =
   17869     static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
   17870   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
   17871   unsigned SP = RegInfo->getStackRegister();
   17872 
   17873   MachineInstrBuilder MIB;
   17874 
   17875   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   17876   const int64_t SPOffset = 2 * PVT.getStoreSize();
   17877 
   17878   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
   17879   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
   17880 
   17881   // Reload FP
   17882   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
   17883   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
   17884     MIB.addOperand(MI->getOperand(i));
   17885   MIB.setMemRefs(MMOBegin, MMOEnd);
   17886   // Reload IP
   17887   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
   17888   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   17889     if (i == X86::AddrDisp)
   17890       MIB.addDisp(MI->getOperand(i), LabelOffset);
   17891     else
   17892       MIB.addOperand(MI->getOperand(i));
   17893   }
   17894   MIB.setMemRefs(MMOBegin, MMOEnd);
   17895   // Reload SP
   17896   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
   17897   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   17898     if (i == X86::AddrDisp)
   17899       MIB.addDisp(MI->getOperand(i), SPOffset);
   17900     else
   17901       MIB.addOperand(MI->getOperand(i));
   17902   }
   17903   MIB.setMemRefs(MMOBegin, MMOEnd);
   17904   // Jump
   17905   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
   17906 
   17907   MI->eraseFromParent();
   17908   return MBB;
   17909 }
   17910 
   17911 // Replace 213-type (isel default) FMA3 instructions with 231-type for
   17912 // accumulator loops. Writing back to the accumulator allows the coalescer
   17913 // to remove extra copies in the loop.
   17914 MachineBasicBlock *
   17915 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
   17916                                  MachineBasicBlock *MBB) const {
   17917   MachineOperand &AddendOp = MI->getOperand(3);
   17918 
   17919   // Bail out early if the addend isn't a register - we can't switch these.
   17920   if (!AddendOp.isReg())
   17921     return MBB;
   17922 
   17923   MachineFunction &MF = *MBB->getParent();
   17924   MachineRegisterInfo &MRI = MF.getRegInfo();
   17925 
   17926   // Check whether the addend is defined by a PHI:
   17927   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
   17928   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
   17929   if (!AddendDef.isPHI())
   17930     return MBB;
   17931 
   17932   // Look for the following pattern:
   17933   // loop:
   17934   //   %addend = phi [%entry, 0], [%loop, %result]
   17935   //   ...
   17936   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
   17937 
   17938   // Replace with:
   17939   //   loop:
   17940   //   %addend = phi [%entry, 0], [%loop, %result]
   17941   //   ...
   17942   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
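  //
  // The FMA3 suffix digits name the operand roles: 213 computes
  //   dst = src2 * dst + src3
  // while 231 computes
  //   dst = src2 * src3 + dst
  // so after the swap the loop-carried addend is the tied destination and
  // stays in one register across iterations.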
   17943 
   17944   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
   17945     assert(AddendDef.getOperand(i).isReg());
   17946     MachineOperand PHISrcOp = AddendDef.getOperand(i);
   17947     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
   17948     if (&PHISrcInst == MI) {
   17949       // Found a matching instruction.
   17950       unsigned NewFMAOpc = 0;
   17951       switch (MI->getOpcode()) {
   17952         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
   17953         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
   17954         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
   17955         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
   17956         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
   17957         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
   17958         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
   17959         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
   17960         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
   17961         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
   17962         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
   17963         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
   17964         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
   17965         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
   17966         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
   17967         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
   17968         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
   17969         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
   17970         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
   17971         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
   17972         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
   17973         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
   17974         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
   17975         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
   17976         default: llvm_unreachable("Unrecognized FMA variant.");
   17977       }
   17978 
   17979       const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   17980       MachineInstrBuilder MIB =
   17981         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
   17982         .addOperand(MI->getOperand(0))
   17983         .addOperand(MI->getOperand(3))
   17984         .addOperand(MI->getOperand(2))
   17985         .addOperand(MI->getOperand(1));
   17986       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
   17987       MI->eraseFromParent();
   17988     }
   17989   }
   17990 
   17991   return MBB;
   17992 }
   17993 
   17994 MachineBasicBlock *
   17995 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   17996                                                MachineBasicBlock *BB) const {
   17997   switch (MI->getOpcode()) {
   17998   default: llvm_unreachable("Unexpected instr type to insert");
   17999   case X86::TAILJMPd64:
   18000   case X86::TAILJMPr64:
   18001   case X86::TAILJMPm64:
   18002     llvm_unreachable("TAILJMP64 would not be touched here.");
   18003   case X86::TCRETURNdi64:
   18004   case X86::TCRETURNri64:
   18005   case X86::TCRETURNmi64:
   18006     return BB;
   18007   case X86::WIN_ALLOCA:
   18008     return EmitLoweredWinAlloca(MI, BB);
   18009   case X86::SEG_ALLOCA_32:
   18010     return EmitLoweredSegAlloca(MI, BB, false);
   18011   case X86::SEG_ALLOCA_64:
   18012     return EmitLoweredSegAlloca(MI, BB, true);
   18013   case X86::TLSCall_32:
   18014   case X86::TLSCall_64:
   18015     return EmitLoweredTLSCall(MI, BB);
   18016   case X86::CMOV_GR8:
   18017   case X86::CMOV_FR32:
   18018   case X86::CMOV_FR64:
   18019   case X86::CMOV_V4F32:
   18020   case X86::CMOV_V2F64:
   18021   case X86::CMOV_V2I64:
   18022   case X86::CMOV_V8F32:
   18023   case X86::CMOV_V4F64:
   18024   case X86::CMOV_V4I64:
   18025   case X86::CMOV_V16F32:
   18026   case X86::CMOV_V8F64:
   18027   case X86::CMOV_V8I64:
   18028   case X86::CMOV_GR16:
   18029   case X86::CMOV_GR32:
   18030   case X86::CMOV_RFP32:
   18031   case X86::CMOV_RFP64:
   18032   case X86::CMOV_RFP80:
   18033     return EmitLoweredSelect(MI, BB);
   18034 
   18035   case X86::FP32_TO_INT16_IN_MEM:
   18036   case X86::FP32_TO_INT32_IN_MEM:
   18037   case X86::FP32_TO_INT64_IN_MEM:
   18038   case X86::FP64_TO_INT16_IN_MEM:
   18039   case X86::FP64_TO_INT32_IN_MEM:
   18040   case X86::FP64_TO_INT64_IN_MEM:
   18041   case X86::FP80_TO_INT16_IN_MEM:
   18042   case X86::FP80_TO_INT32_IN_MEM:
   18043   case X86::FP80_TO_INT64_IN_MEM: {
   18044     MachineFunction *F = BB->getParent();
   18045     const TargetInstrInfo *TII = F->getTarget().getInstrInfo();
   18046     DebugLoc DL = MI->getDebugLoc();
   18047 
   18048     // Change the floating point control register to use "round towards zero"
   18049     // mode when truncating to an integer value.
   18050     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
   18051     addFrameReference(BuildMI(*BB, MI, DL,
   18052                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
   18053 
   18054     // Load the old value of the high byte of the control word...
   18055     unsigned OldCW =
   18056       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
   18057     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
   18058                       CWFrameIdx);
   18059 
   18060     // Set the high part to be round to zero...
   18061     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
   18062       .addImm(0xC7F);
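    // (0xC7F keeps all six x87 exception-mask bits set and forces the
    // rounding-control field, bits 11:10, to 11b = round toward zero, which is
    // what C-style truncating float-to-int conversion requires.)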
   18063 
   18064     // Reload the modified control word now...
   18065     addFrameReference(BuildMI(*BB, MI, DL,
   18066                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   18067 
   18068     // Restore the memory image of control word to original value
   18069     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
   18070       .addReg(OldCW);
   18071 
   18072     // Get the X86 opcode to use.
   18073     unsigned Opc;
   18074     switch (MI->getOpcode()) {
   18075     default: llvm_unreachable("illegal opcode!");
   18076     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
   18077     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
   18078     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
   18079     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
   18080     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
   18081     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
   18082     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
   18083     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
   18084     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
   18085     }
   18086 
   18087     X86AddressMode AM;
   18088     MachineOperand &Op = MI->getOperand(0);
   18089     if (Op.isReg()) {
   18090       AM.BaseType = X86AddressMode::RegBase;
   18091       AM.Base.Reg = Op.getReg();
   18092     } else {
   18093       AM.BaseType = X86AddressMode::FrameIndexBase;
   18094       AM.Base.FrameIndex = Op.getIndex();
   18095     }
   18096     Op = MI->getOperand(1);
   18097     if (Op.isImm())
   18098       AM.Scale = Op.getImm();
   18099     Op = MI->getOperand(2);
   18100     if (Op.isImm())
   18101       AM.IndexReg = Op.getImm();
   18102     Op = MI->getOperand(3);
   18103     if (Op.isGlobal()) {
   18104       AM.GV = Op.getGlobal();
   18105     } else {
   18106       AM.Disp = Op.getImm();
   18107     }
   18108     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
   18109                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
   18110 
   18111     // Reload the original control word now.
   18112     addFrameReference(BuildMI(*BB, MI, DL,
   18113                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   18114 
   18115     MI->eraseFromParent();   // The pseudo instruction is gone now.
   18116     return BB;
   18117   }
   18118     // String/text processing lowering.
   18119   case X86::PCMPISTRM128REG:
   18120   case X86::VPCMPISTRM128REG:
   18121   case X86::PCMPISTRM128MEM:
   18122   case X86::VPCMPISTRM128MEM:
   18123   case X86::PCMPESTRM128REG:
   18124   case X86::VPCMPESTRM128REG:
   18125   case X86::PCMPESTRM128MEM:
   18126   case X86::VPCMPESTRM128MEM:
   18127     assert(Subtarget->hasSSE42() &&
   18128            "Target must have SSE4.2 or AVX features enabled");
   18129     return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo());
   18130 
   18131   // String/text processing lowering.
   18132   case X86::PCMPISTRIREG:
   18133   case X86::VPCMPISTRIREG:
   18134   case X86::PCMPISTRIMEM:
   18135   case X86::VPCMPISTRIMEM:
   18136   case X86::PCMPESTRIREG:
   18137   case X86::VPCMPESTRIREG:
   18138   case X86::PCMPESTRIMEM:
   18139   case X86::VPCMPESTRIMEM:
   18140     assert(Subtarget->hasSSE42() &&
   18141            "Target must have SSE4.2 or AVX features enabled");
   18142     return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo());
   18143 
   18144   // Thread synchronization.
   18145   case X86::MONITOR:
   18146     return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget);
   18147 
   18148   // xbegin
   18149   case X86::XBEGIN:
   18150     return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo());
   18151 
   18152   case X86::VASTART_SAVE_XMM_REGS:
   18153     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
   18154 
   18155   case X86::VAARG_64:
   18156     return EmitVAARG64WithCustomInserter(MI, BB);
   18157 
   18158   case X86::EH_SjLj_SetJmp32:
   18159   case X86::EH_SjLj_SetJmp64:
   18160     return emitEHSjLjSetJmp(MI, BB);
   18161 
   18162   case X86::EH_SjLj_LongJmp32:
   18163   case X86::EH_SjLj_LongJmp64:
   18164     return emitEHSjLjLongJmp(MI, BB);
   18165 
   18166   case TargetOpcode::STACKMAP:
   18167   case TargetOpcode::PATCHPOINT:
   18168     return emitPatchPoint(MI, BB);
   18169 
   18170   case X86::VFMADDPDr213r:
   18171   case X86::VFMADDPSr213r:
   18172   case X86::VFMADDSDr213r:
   18173   case X86::VFMADDSSr213r:
   18174   case X86::VFMSUBPDr213r:
   18175   case X86::VFMSUBPSr213r:
   18176   case X86::VFMSUBSDr213r:
   18177   case X86::VFMSUBSSr213r:
   18178   case X86::VFNMADDPDr213r:
   18179   case X86::VFNMADDPSr213r:
   18180   case X86::VFNMADDSDr213r:
   18181   case X86::VFNMADDSSr213r:
   18182   case X86::VFNMSUBPDr213r:
   18183   case X86::VFNMSUBPSr213r:
   18184   case X86::VFNMSUBSDr213r:
   18185   case X86::VFNMSUBSSr213r:
   18186   case X86::VFMADDPDr213rY:
   18187   case X86::VFMADDPSr213rY:
   18188   case X86::VFMSUBPDr213rY:
   18189   case X86::VFMSUBPSr213rY:
   18190   case X86::VFNMADDPDr213rY:
   18191   case X86::VFNMADDPSr213rY:
   18192   case X86::VFNMSUBPDr213rY:
   18193   case X86::VFNMSUBPSr213rY:
   18194     return emitFMA3Instr(MI, BB);
   18195   }
   18196 }
   18197 
   18198 //===----------------------------------------------------------------------===//
   18199 //                           X86 Optimization Hooks
   18200 //===----------------------------------------------------------------------===//
   18201 
   18202 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   18203                                                       APInt &KnownZero,
   18204                                                       APInt &KnownOne,
   18205                                                       const SelectionDAG &DAG,
   18206                                                       unsigned Depth) const {
   18207   unsigned BitWidth = KnownZero.getBitWidth();
   18208   unsigned Opc = Op.getOpcode();
   18209   assert((Opc >= ISD::BUILTIN_OP_END ||
   18210           Opc == ISD::INTRINSIC_WO_CHAIN ||
   18211           Opc == ISD::INTRINSIC_W_CHAIN ||
   18212           Opc == ISD::INTRINSIC_VOID) &&
   18213          "Should use MaskedValueIsZero if you don't know whether Op"
   18214          " is a target node!");
   18215 
   18216   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
   18217   switch (Opc) {
   18218   default: break;
   18219   case X86ISD::ADD:
   18220   case X86ISD::SUB:
   18221   case X86ISD::ADC:
   18222   case X86ISD::SBB:
   18223   case X86ISD::SMUL:
   18224   case X86ISD::UMUL:
   18225   case X86ISD::INC:
   18226   case X86ISD::DEC:
   18227   case X86ISD::OR:
   18228   case X86ISD::XOR:
   18229   case X86ISD::AND:
   18230     // These nodes' second result is a boolean.
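    // That boolean is materialized as 0 or 1, so every bit above bit 0 of
    // the second result is known to be zero.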
   18231     if (Op.getResNo() == 0)
   18232       break;
   18233     // Fallthrough
   18234   case X86ISD::SETCC:
   18235     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
   18236     break;
   18237   case ISD::INTRINSIC_WO_CHAIN: {
   18238     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   18239     unsigned NumLoBits = 0;
   18240     switch (IntId) {
   18241     default: break;
   18242     case Intrinsic::x86_sse_movmsk_ps:
   18243     case Intrinsic::x86_avx_movmsk_ps_256:
   18244     case Intrinsic::x86_sse2_movmsk_pd:
   18245     case Intrinsic::x86_avx_movmsk_pd_256:
   18246     case Intrinsic::x86_mmx_pmovmskb:
   18247     case Intrinsic::x86_sse2_pmovmskb_128:
   18248     case Intrinsic::x86_avx2_pmovmskb: {
   18249       // High bits of movmskp{s|d}, pmovmskb are known zero.
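      // Each instruction produces one mask bit per input lane; e.g. movmskps
      // on a v4f32 yields a 4-bit mask, so bits 31:4 of the i32 result are
      // zero.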
   18250       switch (IntId) {
   18251         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   18252         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
   18253         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
   18254         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
   18255         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
   18256         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
   18257         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
   18258         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
   18259       }
   18260       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
   18261       break;
   18262     }
   18263     }
   18264     break;
   18265   }
   18266   }
   18267 }
   18268 
   18269 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   18270   SDValue Op,
   18271   const SelectionDAG &,
   18272   unsigned Depth) const {
   18273   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
   18274   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
   18275     return Op.getValueType().getScalarType().getSizeInBits();
   18276 
   18277   // Fallback case.
   18278   return 1;
   18279 }
   18280 
   18281 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
   18282 /// node is a GlobalAddress + offset.
   18283 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   18284                                        const GlobalValue* &GA,
   18285                                        int64_t &Offset) const {
   18286   if (N->getOpcode() == X86ISD::Wrapper) {
   18287     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
   18288       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
   18289       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
   18290       return true;
   18291     }
   18292   }
   18293   return TargetLowering::isGAPlusOffset(N, GA, Offset);
   18294 }
   18295 
   18296 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
   18297 /// same as extracting the high 128-bit part of a 256-bit vector and then
   18298 /// inserting the result into the low part of a new 256-bit vector.
   18299 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
   18300   EVT VT = SVOp->getValueType(0);
   18301   unsigned NumElems = VT.getVectorNumElements();
   18302 
   18303   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   18304   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
   18305     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
   18306         SVOp->getMaskElt(j) >= 0)
   18307       return false;
   18308 
   18309   return true;
   18310 }
   18311 
   18312 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
   18313 /// same as extracting the low 128-bit part of a 256-bit vector and then
   18314 /// inserting the result into the high part of a new 256-bit vector.
   18315 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
   18316   EVT VT = SVOp->getValueType(0);
   18317   unsigned NumElems = VT.getVectorNumElements();
   18318 
   18319   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   18320   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
   18321     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
   18322         SVOp->getMaskElt(j) >= 0)
   18323       return false;
   18324 
   18325   return true;
   18326 }
   18327 
   18328 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
   18329 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
   18330                                         TargetLowering::DAGCombinerInfo &DCI,
   18331                                         const X86Subtarget* Subtarget) {
   18332   SDLoc dl(N);
   18333   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   18334   SDValue V1 = SVOp->getOperand(0);
   18335   SDValue V2 = SVOp->getOperand(1);
   18336   EVT VT = SVOp->getValueType(0);
   18337   unsigned NumElems = VT.getVectorNumElements();
   18338 
   18339   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
   18340       V2.getOpcode() == ISD::CONCAT_VECTORS) {
   18341     //
   18342     //                   0,0,0,...
   18343     //                      |
   18344     //    V      UNDEF    BUILD_VECTOR    UNDEF
   18345     //     \      /           \           /
   18346     //  CONCAT_VECTOR         CONCAT_VECTOR
   18347     //         \                  /
   18348     //          \                /
   18349     //          RESULT: V + zero extended
   18350     //
   18351     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
   18352         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
   18353         V1.getOperand(1).getOpcode() != ISD::UNDEF)
   18354       return SDValue();
   18355 
   18356     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
   18357       return SDValue();
   18358 
   18359     // To match the shuffle mask, the first half of the mask must be the
   18360     // identity into the first vector, and the second half must splat the
   18361     // first element of the second one (undef elements are allowed).
   18362     for (unsigned i = 0; i != NumElems/2; ++i)
   18363       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
   18364           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
   18365         return SDValue();
   18366 
   18367     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
   18368     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
   18369       if (Ld->hasNUsesOfValue(1, 0)) {
   18370         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
   18371         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
   18372         SDValue ResNode =
   18373           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
   18374                                   Ld->getMemoryVT(),
   18375                                   Ld->getPointerInfo(),
   18376                                   Ld->getAlignment(),
   18377                                   false/*isVolatile*/, true/*ReadMem*/,
   18378                                   false/*WriteMem*/);
   18379 
   18380         // Make sure the newly-created LOAD is in the same position as Ld in
   18381         // terms of dependency. We create a TokenFactor for Ld and ResNode,
   18382         // and update uses of Ld's output chain to use the TokenFactor.
   18383         if (Ld->hasAnyUseOfValue(1)) {
   18384           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   18385                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
   18386           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
   18387           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
   18388                                  SDValue(ResNode.getNode(), 1));
   18389         }
   18390 
   18391         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
   18392       }
   18393     }
   18394 
   18395     // Emit a zeroed vector and insert the desired subvector into its
   18396     // low half.
   18397     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   18398     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
   18399     return DCI.CombineTo(N, InsV);
   18400   }
   18401 
   18402   //===--------------------------------------------------------------------===//
   18403   // Combine some shuffles into subvector extracts and inserts:
   18404   //
   18405 
   18406   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   18407   if (isShuffleHigh128VectorInsertLow(SVOp)) {
   18408     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
   18409     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
   18410     return DCI.CombineTo(N, InsV);
   18411   }
   18412 
   18413   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   18414   if (isShuffleLow128VectorInsertHigh(SVOp)) {
   18415     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
   18416     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
   18417     return DCI.CombineTo(N, InsV);
   18418   }
   18419 
   18420   return SDValue();
   18421 }
   18422 
   18423 /// \brief Get the PSHUF-style mask from PSHUF node.
   18424 ///
   18425 /// This is a very minor wrapper around getTargetShuffleMask to ease forming
   18426 /// v4 PSHUF-style masks that can be reused with such instructions.
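///
/// For example, for a PSHUFHW whose v8i16 mask is <0,1,2,3,5,5,6,7> the
/// returned v4 mask is <1,1,2,3>; for a PSHUFLW with mask <2,2,3,3,4,5,6,7>
/// it is <2,2,3,3>.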
   18427 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
   18428   SmallVector<int, 4> Mask;
   18429   bool IsUnary;
   18430   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
   18431   (void)HaveMask;
   18432   assert(HaveMask);
   18433 
   18434   switch (N.getOpcode()) {
   18435   case X86ISD::PSHUFD:
   18436     return Mask;
   18437   case X86ISD::PSHUFLW:
   18438     Mask.resize(4);
   18439     return Mask;
   18440   case X86ISD::PSHUFHW:
   18441     Mask.erase(Mask.begin(), Mask.begin() + 4);
   18442     for (int &M : Mask)
   18443       M -= 4;
   18444     return Mask;
   18445   default:
   18446     llvm_unreachable("No valid shuffle instruction found!");
   18447   }
   18448 }
   18449 
   18450 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
   18451 ///
   18452 /// We walk up the chain and look for a combinable shuffle, skipping over
   18453 /// shuffles that we could hoist this shuffle's transformation past without
   18454 /// altering anything.
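///
/// For example, (pshufd <0,1,2,2> (pshufd <1,0,3,2> x)) composes to a single
/// pshufd of x with mask <1,0,3,3>, since each element of the outer mask
/// indexes into the inner one.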
   18455 static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
   18456                                          SelectionDAG &DAG,
   18457                                          TargetLowering::DAGCombinerInfo &DCI) {
   18458   assert(N.getOpcode() == X86ISD::PSHUFD &&
   18459          "Called with something other than an x86 128-bit half shuffle!");
   18460   SDLoc DL(N);
   18461 
   18462   // Walk up a single-use chain looking for a combinable shuffle.
   18463   SDValue V = N.getOperand(0);
   18464   for (; V.hasOneUse(); V = V.getOperand(0)) {
   18465     switch (V.getOpcode()) {
   18466     default:
   18467       return false; // Nothing combined!
   18468 
   18469     case ISD::BITCAST:
   18470       // Skip bitcasts as we always know the type for the target specific
   18471       // instructions.
   18472       continue;
   18473 
   18474     case X86ISD::PSHUFD:
   18475       // Found another dword shuffle.
   18476       break;
   18477 
   18478     case X86ISD::PSHUFLW:
   18479       // Check that the low words (being shuffled) are the identity in the
   18480       // dword shuffle, and the high words are self-contained.
   18481       if (Mask[0] != 0 || Mask[1] != 1 ||
   18482           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
   18483         return false;
   18484 
   18485       continue;
   18486 
   18487     case X86ISD::PSHUFHW:
   18488       // Check that the high words (being shuffled) are the identity in the
   18489       // dword shuffle, and the low words are self-contained.
   18490       if (Mask[2] != 2 || Mask[3] != 3 ||
   18491           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
   18492         return false;
   18493 
   18494       continue;
   18495 
   18496     case X86ISD::UNPCKL:
   18497     case X86ISD::UNPCKH:
   18498       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
   18499       // shuffle into a preceding word shuffle.
   18500       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
   18501         return false;
   18502 
   18503       // Search for a half-shuffle which we can combine with.
   18504       unsigned CombineOp =
   18505           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
   18506       if (V.getOperand(0) != V.getOperand(1) ||
   18507           !V->isOnlyUserOf(V.getOperand(0).getNode()))
   18508         return false;
   18509       V = V.getOperand(0);
   18510       do {
   18511         switch (V.getOpcode()) {
   18512         default:
   18513           return false; // Nothing to combine.
   18514 
   18515         case X86ISD::PSHUFLW:
   18516         case X86ISD::PSHUFHW:
   18517           if (V.getOpcode() == CombineOp)
   18518             break;
   18519 
   18520           // Fallthrough!
   18521         case ISD::BITCAST:
   18522           V = V.getOperand(0);
   18523           continue;
   18524         }
   18525         break;
   18526       } while (V.hasOneUse());
   18527       break;
   18528     }
   18529     // Break out of the loop if we break out of the switch.
   18530     break;
   18531   }
   18532 
   18533   if (!V.hasOneUse())
   18534     // We fell out of the loop without finding a viable combining instruction.
   18535     return false;
   18536 
   18537   // Record the old value to use in RAUW-ing.
   18538   SDValue Old = V;
   18539 
   18540   // Merge this node's mask and our incoming mask.
   18541   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   18542   for (int &M : Mask)
   18543     M = VMask[M];
   18544   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
   18545                   getV4X86ShuffleImm8ForMask(Mask, DAG));
   18546 
   18547   // It is possible that one of the combinable shuffles was completely absorbed
   18548   // by the other; in that case, just replace it and revisit all users.
   18549   if (Old.getNode() == V.getNode()) {
   18550     DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true);
   18551     return true;
   18552   }
   18553 
   18554   // Replace N with its operand as we're going to combine that shuffle away.
   18555   DAG.ReplaceAllUsesWith(N, N.getOperand(0));
   18556 
   18557   // Replace the combinable shuffle with the combined one, updating all users
   18558   // so that we re-evaluate the chain here.
   18559   DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
   18560   return true;
   18561 }
   18562 
   18563 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
   18564 ///
   18565 /// We walk up the chain, skipping shuffles of the other half and looking
   18566 /// through shuffles which switch halves trying to find a shuffle of the same
   18567 /// pair of dwords.
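///
/// For example, a pshuflw can look straight through a pshufhw (it only
/// touches the other half), and through a pshufd that swaps the two dword
/// halves by continuing the search as a pshufhw on the pre-pshufd value.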
   18568 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
   18569                                         SelectionDAG &DAG,
   18570                                         TargetLowering::DAGCombinerInfo &DCI) {
   18571   assert(
   18572       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
   18573       "Called with something other than an x86 128-bit half shuffle!");
   18574   SDLoc DL(N);
   18575   unsigned CombineOpcode = N.getOpcode();
   18576 
   18577   // Walk up a single-use chain looking for a combinable shuffle.
   18578   SDValue V = N.getOperand(0);
   18579   for (; V.hasOneUse(); V = V.getOperand(0)) {
   18580     switch (V.getOpcode()) {
   18581     default:
   18582       return false; // Nothing combined!
   18583 
   18584     case ISD::BITCAST:
   18585       // Skip bitcasts as we always know the type for the target specific
   18586       // instructions.
   18587       continue;
   18588 
   18589     case X86ISD::PSHUFLW:
   18590     case X86ISD::PSHUFHW:
   18591       if (V.getOpcode() == CombineOpcode)
   18592         break;
   18593 
   18594       // Other-half shuffles are no-ops.
   18595       continue;
   18596 
   18597     case X86ISD::PSHUFD: {
   18598       // We can only handle pshufd if the half we are combining either stays
   18599       // in its half or switches entirely to the other half. Bail if neither
   18600       // holds.
   18601       SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   18602       int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
   18603       if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
   18604             (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
   18605         return false;
   18606 
   18607       // Map the mask through the pshufd and keep walking up the chain.
   18608       for (int i = 0; i < 4; ++i)
   18609         Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
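      // (For instance, with a pshuflw (DOffset == 0) and a pshufd mask of
      // <1,0,2,3>: word 2 lives in dword 1, which the pshufd sourced from
      // dword 0, so word 2 is remapped to word 0.)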
   18610 
   18611       // Switch halves if the pshufd does.
   18612       CombineOpcode =
   18613           VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
   18614       continue;
   18615     }
   18616     }
   18617     // Break out of the loop if we break out of the switch.
   18618     break;
   18619   }
   18620 
   18621   if (!V.hasOneUse())
   18622     // We fell out of the loop without finding a viable combining instruction.
   18623     return false;
   18624 
   18625   // Record the old value to use in RAUW-ing.
   18626   SDValue Old = V;
   18627 
   18628   // Merge this node's mask and our incoming mask (adjusted to account for all
   18629   // the pshufd instructions encountered).
   18630   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   18631   for (int &M : Mask)
   18632     M = VMask[M];
   18633   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
   18634                   getV4X86ShuffleImm8ForMask(Mask, DAG));
   18635 
   18636   // Replace N with its operand as we're going to combine that shuffle away.
   18637   DAG.ReplaceAllUsesWith(N, N.getOperand(0));
   18638 
   18639   // Replace the combinable shuffle with the combined one, updating all users
   18640   // so that we re-evaluate the chain here.
   18641   DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
   18642   return true;
   18643 }
   18644 
   18645 /// \brief Try to combine x86 target specific shuffles.
   18646 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
   18647                                            TargetLowering::DAGCombinerInfo &DCI,
   18648                                            const X86Subtarget *Subtarget) {
   18649   SDLoc DL(N);
   18650   MVT VT = N.getSimpleValueType();
   18651   SmallVector<int, 4> Mask;
   18652 
   18653   switch (N.getOpcode()) {
   18654   case X86ISD::PSHUFD:
   18655   case X86ISD::PSHUFLW:
   18656   case X86ISD::PSHUFHW:
   18657     Mask = getPSHUFShuffleMask(N);
   18658     assert(Mask.size() == 4);
   18659     break;
   18660   default:
   18661     return SDValue();
   18662   }
   18663 
   18664   // Nuke no-op shuffles that show up after combining.
   18665   if (isNoopShuffleMask(Mask))
   18666     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
   18667 
   18668   // Look for simplifications involving one or two shuffle instructions.
   18669   SDValue V = N.getOperand(0);
   18670   switch (N.getOpcode()) {
   18671   default:
   18672     break;
   18673   case X86ISD::PSHUFLW:
   18674   case X86ISD::PSHUFHW:
   18675     assert(VT == MVT::v8i16);
   18676     (void)VT;
   18677 
   18678     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
   18679       return SDValue(); // We combined away this shuffle, so we're done.
   18680 
   18681     // See if this reduces to a PSHUFD which is no more expensive and can
   18682     // combine with more operations.
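    // For instance, pshuflw with mask <2,3,0,1> permutes whole dwords, so it
    // can be expressed as a pshufd (here with mask <1,0,u,u>, the high dwords
    // left unconstrained).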
   18683     if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
   18684         areAdjacentMasksSequential(Mask)) {
   18685       int DMask[] = {-1, -1, -1, -1};
   18686       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
   18687       DMask[DOffset + 0] = DOffset + Mask[0] / 2;
   18688       DMask[DOffset + 1] = DOffset + Mask[2] / 2;
   18689       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
   18690       DCI.AddToWorklist(V.getNode());
   18691       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
   18692                       getV4X86ShuffleImm8ForMask(DMask, DAG));
   18693       DCI.AddToWorklist(V.getNode());
   18694       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
   18695     }
   18696 
   18697     // Look for shuffle patterns which can be implemented as a single unpack.
   18698     // FIXME: This doesn't handle the location of the PSHUFD generically, and
   18699     // only works when we have a PSHUFD followed by two half-shuffles.
   18700     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
   18701         (V.getOpcode() == X86ISD::PSHUFLW ||
   18702          V.getOpcode() == X86ISD::PSHUFHW) &&
   18703         V.getOpcode() != N.getOpcode() &&
   18704         V.hasOneUse()) {
   18705       SDValue D = V.getOperand(0);
   18706       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
   18707         D = D.getOperand(0);
   18708       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
   18709         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   18710         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
   18711         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   18712         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   18713         int WordMask[8];
   18714         for (int i = 0; i < 4; ++i) {
   18715           WordMask[i + NOffset] = Mask[i] + NOffset;
   18716           WordMask[i + VOffset] = VMask[i] + VOffset;
   18717         }
   18718         // Map the word mask through the DWord mask.
   18719         int MappedMask[8];
   18720         for (int i = 0; i < 8; ++i)
   18721           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
   18722         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
   18723         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
   18724         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
   18725                        std::begin(UnpackLoMask)) ||
   18726             std::equal(std::begin(MappedMask), std::end(MappedMask),
   18727                        std::begin(UnpackHiMask))) {
   18728           // We can replace all three shuffles with an unpack.
   18729           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
   18730           DCI.AddToWorklist(V.getNode());
   18731           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
   18732                                                 : X86ISD::UNPCKH,
   18733                              DL, MVT::v8i16, V, V);
   18734         }
   18735       }
   18736     }
   18737 
   18738     break;
   18739 
   18740   case X86ISD::PSHUFD:
   18741     if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
   18742       return SDValue(); // We combined away this shuffle.
   18743 
   18744     break;
   18745   }
   18746 
   18747   return SDValue();
   18748 }
   18749 
   18750 /// PerformShuffleCombine - Performs several different shuffle combines.
   18751 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   18752                                      TargetLowering::DAGCombinerInfo &DCI,
   18753                                      const X86Subtarget *Subtarget) {
   18754   SDLoc dl(N);
   18755   SDValue N0 = N->getOperand(0);
   18756   SDValue N1 = N->getOperand(1);
   18757   EVT VT = N->getValueType(0);
   18758 
   18759   // Canonicalize shuffles that perform 'addsub' on packed float vectors
   18760   // according to the rule:
   18761   //  (shuffle (FADD A, B), (FSUB A, B), Mask) ->
   18762   //  (shuffle (FSUB A, -B), (FADD A, -B), Mask)
   18763   //
   18764   // Where 'Mask' is:
   18765   //  <0,5,2,7>             -- for v4f32 and v4f64 shuffles;
   18766   //  <0,3>                 -- for v2f64 shuffles;
   18767   //  <0,9,2,11,4,13,6,15>  -- for v8f32 shuffles.
   18768   //
   18769   // This helps match more SSE3/AVX ADDSUB instructions during the ISel
   18770   // stage.
   18771   if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
   18772       ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   18773        (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   18774       N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
   18775       // Operands to the FADD and FSUB must be the same.
   18776       ((N0->getOperand(0) == N1->getOperand(0) &&
   18777         N0->getOperand(1) == N1->getOperand(1)) ||
   18778        // FADD is commutable. See if by commuting the operands of the FADD
   18779        // we would still be able to match the operands of the FSUB dag node.
   18780        (N0->getOperand(1) == N1->getOperand(0) &&
   18781         N0->getOperand(0) == N1->getOperand(1))) &&
   18782       N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
   18783       N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
   18784 
   18785     ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
   18786     unsigned NumElts = VT.getVectorNumElements();
   18787     ArrayRef<int> Mask = SV->getMask();
   18788     bool CanFold = true;
   18789 
   18790     for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
   18791       CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
   18792 
   18793     if (CanFold) {
   18794       SDValue Op0 = N1->getOperand(0);
   18795       SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
   18796       SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
   18797       SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
   18798       return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
   18799     }
   18800   }
   18801 
   18802   // Don't create instructions with illegal types after legalize types has run.
   18803   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   18804   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
   18805     return SDValue();
   18806 
   18807   // Combine 256-bit vector shuffles. This is only profitable in AVX mode.
   18808   if (Subtarget->hasFp256() && VT.is256BitVector() &&
   18809       N->getOpcode() == ISD::VECTOR_SHUFFLE)
   18810     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
   18811 
   18812   // During Type Legalization, when promoting illegal vector types,
   18813   // the backend might introduce new shuffle dag nodes and bitcasts.
   18814   //
   18815   // This code performs the following transformation:
   18816   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
   18817   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
   18818   //
   18819   // We do this only if both the bitcast and the BINOP dag nodes have
   18820   // one use. Also, perform this transformation only if the new binary
   18821   // operation is legal. This is to avoid introducing dag nodes that
   18822   // potentially need to be further expanded (or custom lowered) into a
   18823   // less optimal sequence of dag nodes.
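  //
  // For example (an illustrative case), type promotion can produce
  //   (shuffle (bitcast (add (v4i64 A), (v4i64 B))), undef, <0,2,4,6,u,u,u,u>)
  // on a v8i32 view; rebuilding it as
  //   (shuffle (add (bitcast A), (bitcast B)), undef, <0,2,4,6,u,u,u,u>)
  // lets the v8i32 add be selected directly when it is legal.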
   18824   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
   18825       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
   18826       N0.getOpcode() == ISD::BITCAST) {
   18827     SDValue BC0 = N0.getOperand(0);
   18828     EVT SVT = BC0.getValueType();
   18829     unsigned Opcode = BC0.getOpcode();
   18830     unsigned NumElts = VT.getVectorNumElements();
   18831 
   18832     if (BC0.hasOneUse() && SVT.isVector() &&
   18833         SVT.getVectorNumElements() * 2 == NumElts &&
   18834         TLI.isOperationLegal(Opcode, VT)) {
   18835       bool CanFold = false;
   18836       switch (Opcode) {
   18837       default : break;
   18838       case ISD::ADD :
   18839       case ISD::FADD :
   18840       case ISD::SUB :
   18841       case ISD::FSUB :
   18842       case ISD::MUL :
   18843       case ISD::FMUL :
   18844         CanFold = true;
   18845       }
   18846 
   18847       unsigned SVTNumElts = SVT.getVectorNumElements();
   18848       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   18849       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
   18850         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
   18851       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
   18852         CanFold = SVOp->getMaskElt(i) < 0;
   18853 
   18854       if (CanFold) {
   18855         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
   18856         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
   18857         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
   18858         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
   18859       }
   18860     }
   18861   }
   18862 
   18863   // Only handle 128-bit wide vectors from here on.
   18864   if (!VT.is128BitVector())
   18865     return SDValue();
   18866 
   18867   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
   18868   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
   18869   // consecutive, non-overlapping, and in the right order.
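  // For example, four adjacent scalar i32 loads feeding a <0,1,2,3> shuffle
  // can become a single 16-byte vector load.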
   18870   SmallVector<SDValue, 16> Elts;
   18871   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
   18872     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
   18873 
   18874   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
   18875   if (LD.getNode())
   18876     return LD;
   18877 
   18878   if (isTargetShuffle(N->getOpcode())) {
   18879     SDValue Shuffle =
   18880         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
   18881     if (Shuffle.getNode())
   18882       return Shuffle;
   18883   }
   18884 
   18885   return SDValue();
   18886 }
   18887 
   18888 /// PerformTruncateCombine - Converts a truncate operation to
   18889 /// a sequence of vector shuffle operations.
   18890 /// This is possible when we truncate a 256-bit vector to a 128-bit vector.
   18891 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
   18892                                       TargetLowering::DAGCombinerInfo &DCI,
   18893                                       const X86Subtarget *Subtarget)  {
   18894   return SDValue();
   18895 }
   18896 
   18897 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a
   18898 /// target-specific shuffle of a load can be folded into a single element load.
   18899 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
   18900 /// target shuffles have been custom lowered, so we need to handle those here.
   18901 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   18902                                          TargetLowering::DAGCombinerInfo &DCI) {
   18903   if (DCI.isBeforeLegalizeOps())
   18904     return SDValue();
   18905 
   18906   SDValue InVec = N->getOperand(0);
   18907   SDValue EltNo = N->getOperand(1);
   18908 
   18909   if (!isa<ConstantSDNode>(EltNo))
   18910     return SDValue();
   18911 
   18912   EVT VT = InVec.getValueType();
   18913 
   18914   bool HasShuffleIntoBitcast = false;
   18915   if (InVec.getOpcode() == ISD::BITCAST) {
   18916     // Don't duplicate a load with other uses.
   18917     if (!InVec.hasOneUse())
   18918       return SDValue();
   18919     EVT BCVT = InVec.getOperand(0).getValueType();
   18920     if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
   18921       return SDValue();
   18922     InVec = InVec.getOperand(0);
   18923     HasShuffleIntoBitcast = true;
   18924   }
   18925 
   18926   if (!isTargetShuffle(InVec.getOpcode()))
   18927     return SDValue();
   18928 
   18929   // Don't duplicate a load with other uses.
   18930   if (!InVec.hasOneUse())
   18931     return SDValue();
   18932 
   18933   SmallVector<int, 16> ShuffleMask;
   18934   bool UnaryShuffle;
   18935   if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
   18936                             UnaryShuffle))
   18937     return SDValue();
   18938 
   18939   // Select the input vector, guarding against an out-of-range extract index.
   18940   unsigned NumElems = VT.getVectorNumElements();
   18941   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   18942   int Idx = (Elt >= (int)NumElems) ? -1 : ShuffleMask[Elt];
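           // Shuffle mask values below NumElems select from the first operand;
           // values of NumElems or more select from the second.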
   18943   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
   18944                                          : InVec.getOperand(1);
   18945 
   18946   // If both inputs of the shuffle are the same, allow the load two uses.
   18947   unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
   18948 
   18949   if (LdNode.getOpcode() == ISD::BITCAST) {
   18950     // Don't duplicate a load with other uses.
   18951     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
   18952       return SDValue();
   18953 
   18954     AllowedUses = 1; // only allow 1 load use if we have a bitcast
   18955     LdNode = LdNode.getOperand(0);
   18956   }
   18957 
   18958   if (!ISD::isNormalLoad(LdNode.getNode()))
   18959     return SDValue();
   18960 
   18961   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
   18962 
   18963   if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
   18964     return SDValue();
   18965 
   18966   if (HasShuffleIntoBitcast) {
   18967     // If there's a bitcast before the shuffle, check if the load type and
   18968     // alignment are valid.
   18969     unsigned Align = LN0->getAlignment();
   18970     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   18971     unsigned NewAlign = TLI.getDataLayout()->
   18972       getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
   18973 
   18974     if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
   18975       return SDValue();
   18976   }
   18977 
   18978   // All checks matched, so transform back to a vector_shuffle so that the
   18979   // DAG combiner can finish the job.
   18980   SDLoc dl(N);
   18981 
   18982   // Create a shuffle node, accounting for the case that it's a unary shuffle.
   18983   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
   18984   Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
   18985                                  InVec.getOperand(0), Shuffle,
   18986                                  &ShuffleMask[0]);
   18987   Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
   18988   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
   18989                      EltNo);
   18990 }
   18991 
   18992 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
   18993 /// generation and convert it from being a bunch of shuffles and extracts
   18994 /// to a simple store and scalar loads to extract the elements.
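         /// For example, four sign/zero-extended extracts of a v4i32 value become a
         /// single 16-byte store to a stack slot followed by four scalar i32 loads.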
   18995 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   18996                                          TargetLowering::DAGCombinerInfo &DCI) {
   18997   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
   18998   if (NewOp.getNode())
   18999     return NewOp;
   19000 
   19001   SDValue InputVector = N->getOperand(0);
   19002 
   19003   // Detect whether we are trying to convert from mmx to i32 and the bitcast
   19004   // from mmx to v2i32 has a single use.
   19005   if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
   19006       InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
   19007       InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
   19008     return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
   19009                        N->getValueType(0),
   19010                        InputVector.getNode()->getOperand(0));
   19011 
   19012   // Only operate on vectors of 4 elements, where the alternative shuffling
   19013   // gets to be more expensive.
   19014   if (InputVector.getValueType() != MVT::v4i32)
   19015     return SDValue();
   19016 
   19017   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
   19018   // single use which is a sign-extend or zero-extend, and all elements are
   19019   // used.
   19020   SmallVector<SDNode *, 4> Uses;
   19021   unsigned ExtractedElements = 0;
   19022   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
   19023        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
   19024     if (UI.getUse().getResNo() != InputVector.getResNo())
   19025       return SDValue();
   19026 
   19027     SDNode *Extract = *UI;
   19028     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   19029       return SDValue();
   19030 
   19031     if (Extract->getValueType(0) != MVT::i32)
   19032       return SDValue();
   19033     if (!Extract->hasOneUse())
   19034       return SDValue();
   19035     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
   19036         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
   19037       return SDValue();
   19038     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
   19039       return SDValue();
   19040 
   19041     // Record which element was extracted.
   19042     ExtractedElements |=
   19043       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
   19044 
   19045     Uses.push_back(Extract);
   19046   }
   19047 
   19048   // If not all the elements were used, this may not be worthwhile.
   19049   if (ExtractedElements != 15)
   19050     return SDValue();
   19051 
   19052   // Ok, we've now decided to do the transformation.
   19053   SDLoc dl(InputVector);
   19054 
   19055   // Store the value to a temporary stack slot.
   19056   SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
   19057   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
   19058                             MachinePointerInfo(), false, false, 0);
   19059 
   19060   // Replace each use (extract) with a load of the appropriate element.
   19061   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
   19062        UE = Uses.end(); UI != UE; ++UI) {
   19063     SDNode *Extract = *UI;
   19064 
   19065     // Compute the element's address.
   19066     SDValue Idx = Extract->getOperand(1);
   19067     unsigned EltSize =
   19068         InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
   19069     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
   19070     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   19071     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
   19072 
   19073     SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
   19074                                      StackPtr, OffsetVal);
   19075 
   19076     // Load the scalar.
   19077     SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
   19078                                      ScalarAddr, MachinePointerInfo(),
   19079                                      false, false, false, 0);
   19080 
   19081     // Replace the extract with the load.
   19082     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
   19083   }
   19084 
   19085   // The replacement was made in place; don't return anything.
   19086   return SDValue();
   19087 }
   19088 
   19089 /// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
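         /// For example, (vselect (setult X, Y), X, Y) maps to X86ISD::UMIN, as does
         /// the reversed-arm form (vselect (setugt X, Y), Y, X).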
   19090 static std::pair<unsigned, bool>
   19091 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
   19092                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
   19093   if (!VT.isVector())
   19094     return std::make_pair(0, false);
   19095 
   19096   bool NeedSplit = false;
   19097   switch (VT.getSimpleVT().SimpleTy) {
   19098   default: return std::make_pair(0, false);
   19099   case MVT::v32i8:
   19100   case MVT::v16i16:
   19101   case MVT::v8i32:
   19102     if (!Subtarget->hasAVX2())
   19103       NeedSplit = true;
   19104     if (!Subtarget->hasAVX())
   19105       return std::make_pair(0, false);
   19106     break;
   19107   case MVT::v16i8:
   19108   case MVT::v8i16:
   19109   case MVT::v4i32:
   19110     if (!Subtarget->hasSSE2())
   19111       return std::make_pair(0, false);
   19112   }
   19113 
   19114   // SSE2 has only a small subset of the operations.
   19115   bool hasUnsigned = Subtarget->hasSSE41() ||
   19116                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
   19117   bool hasSigned = Subtarget->hasSSE41() ||
   19118                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
   19119 
   19120   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   19121 
   19122   unsigned Opc = 0;
   19123   // Check for x CC y ? x : y.
   19124   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   19125       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   19126     switch (CC) {
   19127     default: break;
   19128     case ISD::SETULT:
   19129     case ISD::SETULE:
   19130       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
   19131     case ISD::SETUGT:
   19132     case ISD::SETUGE:
   19133       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
   19134     case ISD::SETLT:
   19135     case ISD::SETLE:
   19136       Opc = hasSigned ? X86ISD::SMIN : 0; break;
   19137     case ISD::SETGT:
   19138     case ISD::SETGE:
   19139       Opc = hasSigned ? X86ISD::SMAX : 0; break;
   19140     }
   19141   // Check for x CC y ? y : x -- a min/max with reversed arms.
   19142   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
   19143              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
   19144     switch (CC) {
   19145     default: break;
   19146     case ISD::SETULT:
   19147     case ISD::SETULE:
   19148       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
   19149     case ISD::SETUGT:
   19150     case ISD::SETUGE:
   19151       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
   19152     case ISD::SETLT:
   19153     case ISD::SETLE:
   19154       Opc = hasSigned ? X86ISD::SMAX : 0; break;
   19155     case ISD::SETGT:
   19156     case ISD::SETGE:
   19157       Opc = hasSigned ? X86ISD::SMIN : 0; break;
   19158     }
   19159   }
   19160 
   19161   return std::make_pair(Opc, NeedSplit);
   19162 }
   19163 
   19164 static SDValue
   19165 TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
   19166                                       const X86Subtarget *Subtarget) {
   19167   SDLoc dl(N);
   19168   SDValue Cond = N->getOperand(0);
   19169   SDValue LHS = N->getOperand(1);
   19170   SDValue RHS = N->getOperand(2);
   19171 
   19172   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
   19173     SDValue CondSrc = Cond->getOperand(0);
   19174     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
   19175       Cond = CondSrc->getOperand(0);
   19176   }
   19177 
   19178   MVT VT = N->getSimpleValueType(0);
   19179   MVT EltVT = VT.getVectorElementType();
   19180   unsigned NumElems = VT.getVectorNumElements();
   19181   // There is no blend with immediate in AVX-512.
   19182   if (VT.is512BitVector())
   19183     return SDValue();
   19184 
   19185   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
   19186     return SDValue();
   19187   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
   19188     return SDValue();
   19189 
   19190   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   19191     return SDValue();
   19192 
   19193   unsigned MaskValue = 0;
   19194   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
   19195     return SDValue();
   19196 
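           // Bit i of MaskValue selects LHS (0) or RHS (1) for element i; shuffle
           // indices of NumElems or more pick the corresponding element from RHS.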
   19197   SmallVector<int, 8> ShuffleMask(NumElems, -1);
   19198   for (unsigned i = 0; i < NumElems; ++i) {
   19199     // Be sure we emit undef where we can.
   19200     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
   19201       ShuffleMask[i] = -1;
   19202     else
   19203       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
   19204   }
   19205 
   19206   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
   19207 }
   19208 
   19209 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
   19210 /// nodes.
   19211 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   19212                                     TargetLowering::DAGCombinerInfo &DCI,
   19213                                     const X86Subtarget *Subtarget) {
   19214   SDLoc DL(N);
   19215   SDValue Cond = N->getOperand(0);
   19216   // Get the LHS/RHS of the select.
   19217   SDValue LHS = N->getOperand(1);
   19218   SDValue RHS = N->getOperand(2);
   19219   EVT VT = LHS.getValueType();
   19220   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   19221 
   19222   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   19223   // instructions match the semantics of the common C idiom x<y?x:y but not
   19224   // x<=y?x:y, because of how they handle negative zero (which can be
   19225   // ignored in unsafe-math mode).
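           // For example, (select (setolt X, Y), X, Y) becomes (X86ISD::FMIN X, Y).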
   19226   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
   19227       VT != MVT::f80 && TLI.isTypeLegal(VT) &&
   19228       (Subtarget->hasSSE2() ||
   19229        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
   19230     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   19231 
   19232     unsigned Opcode = 0;
   19233     // Check for x CC y ? x : y.
   19234     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   19235         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   19236       switch (CC) {
   19237       default: break;
   19238       case ISD::SETULT:
   19239         // Converting this to a min would handle NaNs incorrectly, and swapping
   19240         // the operands would cause it to handle comparisons between positive
   19241         // and negative zero incorrectly.
   19242         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   19243           if (!DAG.getTarget().Options.UnsafeFPMath &&
   19244               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   19245             break;
   19246           std::swap(LHS, RHS);
   19247         }
   19248         Opcode = X86ISD::FMIN;
   19249         break;
   19250       case ISD::SETOLE:
   19251         // Converting this to a min would handle comparisons between positive
   19252         // and negative zero incorrectly.
   19253         if (!DAG.getTarget().Options.UnsafeFPMath &&
   19254             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   19255           break;
   19256         Opcode = X86ISD::FMIN;
   19257         break;
   19258       case ISD::SETULE:
   19259         // Converting this to a min would handle both negative zeros and NaNs
   19260         // incorrectly, but we can swap the operands to fix both.
   19261         std::swap(LHS, RHS);
   19262       case ISD::SETOLT:
   19263       case ISD::SETLT:
   19264       case ISD::SETLE:
   19265         Opcode = X86ISD::FMIN;
   19266         break;
   19267 
   19268       case ISD::SETOGE:
   19269         // Converting this to a max would handle comparisons between positive
   19270         // and negative zero incorrectly.
   19271         if (!DAG.getTarget().Options.UnsafeFPMath &&
   19272             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   19273           break;
   19274         Opcode = X86ISD::FMAX;
   19275         break;
   19276       case ISD::SETUGT:
   19277         // Converting this to a max would handle NaNs incorrectly, and swapping
   19278         // the operands would cause it to handle comparisons between positive
   19279         // and negative zero incorrectly.
   19280         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   19281           if (!DAG.getTarget().Options.UnsafeFPMath &&
   19282               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   19283             break;
   19284           std::swap(LHS, RHS);
   19285         }
   19286         Opcode = X86ISD::FMAX;
   19287         break;
   19288       case ISD::SETUGE:
   19289         // Converting this to a max would handle both negative zeros and NaNs
   19290         // incorrectly, but we can swap the operands to fix both.
   19291         std::swap(LHS, RHS);
   19292       case ISD::SETOGT:
   19293       case ISD::SETGT:
   19294       case ISD::SETGE:
   19295         Opcode = X86ISD::FMAX;
   19296         break;
   19297       }
   19298     // Check for x CC y ? y : x -- a min/max with reversed arms.
   19299     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
   19300                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
   19301       switch (CC) {
   19302       default: break;
   19303       case ISD::SETOGE:
   19304         // Converting this to a min would handle comparisons between positive
   19305         // and negative zero incorrectly, and swapping the operands would
   19306         // cause it to handle NaNs incorrectly.
   19307         if (!DAG.getTarget().Options.UnsafeFPMath &&
   19308             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
   19309           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   19310             break;
   19311           std::swap(LHS, RHS);
   19312         }
   19313         Opcode = X86ISD::FMIN;
   19314         break;
   19315       case ISD::SETUGT:
   19316         // Converting this to a min would handle NaNs incorrectly.
   19317         if (!DAG.getTarget().Options.UnsafeFPMath &&
   19318             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
   19319           break;
   19320         Opcode = X86ISD::FMIN;
   19321         break;
   19322       case ISD::SETUGE:
   19323         // Converting this to a min would handle both negative zeros and NaNs
   19324         // incorrectly, but we can swap the operands to fix both.
   19325         std::swap(LHS, RHS);
   19326       case ISD::SETOGT:
   19327       case ISD::SETGT:
   19328       case ISD::SETGE:
   19329         Opcode = X86ISD::FMIN;
   19330         break;
   19331 
   19332       case ISD::SETULT:
   19333         // Converting this to a max would handle NaNs incorrectly.
   19334         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   19335           break;
   19336         Opcode = X86ISD::FMAX;
   19337         break;
   19338       case ISD::SETOLE:
   19339         // Converting this to a max would handle comparisons between positive
   19340         // and negative zero incorrectly, and swapping the operands would
   19341         // cause it to handle NaNs incorrectly.
   19342         if (!DAG.getTarget().Options.UnsafeFPMath &&
   19343             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
   19344           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   19345             break;
   19346           std::swap(LHS, RHS);
   19347         }
   19348         Opcode = X86ISD::FMAX;
   19349         break;
   19350       case ISD::SETULE:
   19351         // Converting this to a max would handle both negative zeros and NaNs
   19352         // incorrectly, but we can swap the operands to fix both.
   19353         std::swap(LHS, RHS);
   19354       case ISD::SETOLT:
   19355       case ISD::SETLT:
   19356       case ISD::SETLE:
   19357         Opcode = X86ISD::FMAX;
   19358         break;
   19359       }
   19360     }
   19361 
   19362     if (Opcode)
   19363       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   19364   }
   19365 
   19366   EVT CondVT = Cond.getValueType();
   19367   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
   19368       CondVT.getVectorElementType() == MVT::i1) {
   19369     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
   19370     // lowering on AVX-512. In this case we convert it to
   19371     // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
   19372     // The same applies to all 128-bit and 256-bit vectors of i8 and i16.
   19373     EVT OpVT = LHS.getValueType();
   19374     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
   19375         (OpVT.getVectorElementType() == MVT::i8 ||
   19376          OpVT.getVectorElementType() == MVT::i16)) {
   19377       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
   19378       DCI.AddToWorklist(Cond.getNode());
   19379       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
   19380     }
   19381   }
   19382   // If this is a select between two integer constants, try to do some
   19383   // optimizations.
   19384   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
   19385     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
   19386       // Don't do this for crazy integer types.
   19387       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
   19388         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
   19389         // so that TrueC (the true value) is larger than FalseC.
   19390         bool NeedsCondInvert = false;
   19391 
   19392         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
   19393             // Efficiently invertible.
   19394             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
   19395              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
   19396               isa<ConstantSDNode>(Cond.getOperand(1))))) {
   19397           NeedsCondInvert = true;
   19398           std::swap(TrueC, FalseC);
   19399         }
   19400 
   19401         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
   19402         if (FalseC->getAPIntValue() == 0 &&
   19403             TrueC->getAPIntValue().isPowerOf2()) {
   19404           if (NeedsCondInvert) // Invert the condition if needed.
   19405             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   19406                                DAG.getConstant(1, Cond.getValueType()));
   19407 
   19408           // Zero extend the condition if needed.
   19409           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
   19410 
   19411           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   19412           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
   19413                              DAG.getConstant(ShAmt, MVT::i8));
   19414         }
   19415 
   19416         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
   19417         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   19418           if (NeedsCondInvert) // Invert the condition if needed.
   19419             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   19420                                DAG.getConstant(1, Cond.getValueType()));
   19421 
   19422           // Zero extend the condition if needed.
   19423           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   19424                              FalseC->getValueType(0), Cond);
   19425           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   19426                              SDValue(FalseC, 0));
   19427         }
   19428 
   19429         // Optimize cases that will turn into an LEA instruction.  This requires
   19430         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   19431         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   19432           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   19433           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   19434 
   19435           bool isFastMultiplier = false;
   19436           if (Diff < 10) {
   19437             switch ((unsigned char)Diff) {
   19438               default: break;
   19439               case 1:  // result = add base, cond
   19440               case 2:  // result = lea base(    , cond*2)
   19441               case 3:  // result = lea base(cond, cond*2)
   19442               case 4:  // result = lea base(    , cond*4)
   19443               case 5:  // result = lea base(cond, cond*4)
   19444               case 8:  // result = lea base(    , cond*8)
   19445               case 9:  // result = lea base(cond, cond*8)
   19446                 isFastMultiplier = true;
   19447                 break;
   19448             }
   19449           }
   19450 
   19451           if (isFastMultiplier) {
   19452             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   19453             if (NeedsCondInvert) // Invert the condition if needed.
   19454               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   19455                                  DAG.getConstant(1, Cond.getValueType()));
   19456 
   19457             // Zero extend the condition if needed.
   19458             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   19459                                Cond);
   19460             // Scale the condition by the difference.
   19461             if (Diff != 1)
   19462               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   19463                                  DAG.getConstant(Diff, Cond.getValueType()));
   19464 
   19465             // Add the base if non-zero.
   19466             if (FalseC->getAPIntValue() != 0)
   19467               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   19468                                  SDValue(FalseC, 0));
   19469             return Cond;
   19470           }
   19471         }
   19472       }
   19473   }
   19474 
   19475   // Canonicalize max and min:
   19476   // (x > y) ? x : y -> (x >= y) ? x : y
   19477   // (x < y) ? x : y -> (x <= y) ? x : y
   19478   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
   19479   // the need for an extra compare
   19480   // against zero. e.g.
   19481   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
   19482   // subl   %esi, %edi
   19483   // testl  %edi, %edi
   19484   // movl   $0, %eax
   19485   // cmovgl %edi, %eax
   19486   // =>
   19487   // xorl   %eax, %eax
   19488   // subl   %esi, %edi
   19489   // cmovsl %eax, %edi
   19490   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
   19491       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   19492       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   19493     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   19494     switch (CC) {
   19495     default: break;
   19496     case ISD::SETLT:
   19497     case ISD::SETGT: {
   19498       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
   19499       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
   19500                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
   19501       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
   19502     }
   19503     }
   19504   }
   19505 
   19506   // Early exit check
   19507   if (!TLI.isTypeLegal(VT))
   19508     return SDValue();
   19509 
   19510   // Match VSELECTs into subs with unsigned saturation.
   19511   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
   19512       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
   19513       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
   19514        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
   19515     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   19516 
   19517     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
   19518     // left side, invert the predicate to simplify the logic below.
   19519     SDValue Other;
   19520     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
   19521       Other = RHS;
   19522       CC = ISD::getSetCCInverse(CC, true);
   19523     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
   19524       Other = LHS;
   19525     }
   19526 
   19527     if (Other.getNode() && Other->getNumOperands() == 2 &&
   19528         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
   19529       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
   19530       SDValue CondRHS = Cond->getOperand(1);
   19531 
   19532       // Look for a general sub with unsigned saturation first.
   19533       // x >= y ? x-y : 0 --> subus x, y
   19534       // x >  y ? x-y : 0 --> subus x, y
   19535       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
   19536           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
   19537         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
   19538 
   19539       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
   19540         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
   19541           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
   19542             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
   19543               // If the RHS is a constant we have to reverse the const
   19544               // canonicalization.
   19545               // x > C-1 ? x+(-C) : 0 --> subus x, C
   19546               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
   19547                   CondRHSConst->getAPIntValue() ==
   19548                       (-OpRHSConst->getAPIntValue() - 1))
   19549                 return DAG.getNode(
   19550                     X86ISD::SUBUS, DL, VT, OpLHS,
   19551                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
   19552 
   19553           // Another special case: If C was a sign bit, the sub has been
   19554           // canonicalized into a xor.
   19555           // FIXME: Would it be better to use computeKnownBits to determine
   19556           //        whether it's safe to decanonicalize the xor?
   19557           // x s< 0 ? x^C : 0 --> subus x, C
   19558           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
   19559               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
   19560               OpRHSConst->getAPIntValue().isSignBit())
   19561             // Note that we have to rebuild the RHS constant here to ensure we
   19562             // don't rely on particular values of undef lanes.
   19563             return DAG.getNode(
   19564                 X86ISD::SUBUS, DL, VT, OpLHS,
   19565                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
   19566         }
   19567     }
   19568   }
   19569 
   19570   // Try to match a min/max vector operation.
   19571   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
   19572     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
   19573     unsigned Opc = ret.first;
   19574     bool NeedSplit = ret.second;
   19575 
   19576     if (Opc && NeedSplit) {
   19577       unsigned NumElems = VT.getVectorNumElements();
   19578       // Extract the LHS vectors
   19579       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
   19580       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
   19581 
   19582       // Extract the RHS vectors
   19583       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
   19584       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
   19585 
   19586       // Create min/max for each subvector
   19587       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
   19588       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
   19589 
   19590       // Merge the result
   19591       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
   19592     } else if (Opc)
   19593       return DAG.getNode(Opc, DL, VT, LHS, RHS);
   19594   }
   19595 
   19596   // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
   19597   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
   19598       // Check if SETCC has already been promoted
   19599       TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
   19600       // Check that condition value type matches vselect operand type
   19601       CondVT == VT) {
   19602 
   19603     assert(Cond.getValueType().isVector() &&
   19604            "vector select expects a vector selector!");
   19605 
   19606     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
   19607     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
   19608 
   19609     if (!TValIsAllOnes && !FValIsAllZeros) {
   19610       // Try to invert the condition if the true value is not all 1s and the
   19611       // false value is not all 0s.
   19612       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
   19613       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
   19614 
   19615       if (TValIsAllZeros || FValIsAllOnes) {
   19616         SDValue CC = Cond.getOperand(2);
   19617         ISD::CondCode NewCC =
   19618           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
   19619                                Cond.getOperand(0).getValueType().isInteger());
   19620         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
   19621         std::swap(LHS, RHS);
   19622         TValIsAllOnes = FValIsAllOnes;
   19623         FValIsAllZeros = TValIsAllZeros;
   19624       }
   19625     }
   19626 
   19627     if (TValIsAllOnes || FValIsAllZeros) {
   19628       SDValue Ret;
   19629 
   19630       if (TValIsAllOnes && FValIsAllZeros)
   19631         Ret = Cond;
   19632       else if (TValIsAllOnes)
   19633         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
   19634                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
   19635       else if (FValIsAllZeros)
   19636         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
   19637                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
   19638 
   19639       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
   19640     }
   19641   }
   19642 
   19643   // Try to fold this VSELECT into a MOVSS/MOVSD
   19644   if (N->getOpcode() == ISD::VSELECT &&
   19645       Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
   19646     if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
   19647         (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
   19648       bool CanFold = false;
   19649       unsigned NumElems = Cond.getNumOperands();
   19650       SDValue A = LHS;
   19651       SDValue B = RHS;
   19652 
   19653       if (isZero(Cond.getOperand(0))) {
   19654         CanFold = true;
   19655 
   19656         // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
   19657         // fold (vselect <0,-1>, A, B) -> (movsd A, B)
   19658         for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
   19659           CanFold = isAllOnes(Cond.getOperand(i));
   19660       } else if (isAllOnes(Cond.getOperand(0))) {
   19661         CanFold = true;
   19662         std::swap(A, B);
   19663 
   19664         // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
   19665         // fold (vselect <-1,0>, A, B) -> (movsd B, A)
   19666         for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
   19667           CanFold = isZero(Cond.getOperand(i));
   19668       }
   19669 
   19670       if (CanFold) {
   19671         if (VT == MVT::v4i32 || VT == MVT::v4f32)
   19672           return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
   19673         return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
   19674       }
   19675 
   19676       if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
   19677         // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
   19678         //      (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
   19679         //                             (v2i64 (bitcast B)))))
   19680         //
   19681         // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
   19682         //      (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
   19683         //                             (v2f64 (bitcast B)))))
   19684         //
   19685         // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
   19686         //      (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
   19687         //                             (v2i64 (bitcast A)))))
   19688         //
   19689         // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
   19690         //      (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
   19691         //                             (v2f64 (bitcast A)))))
   19692 
   19693         CanFold = (isZero(Cond.getOperand(0)) &&
   19694                    isZero(Cond.getOperand(1)) &&
   19695                    isAllOnes(Cond.getOperand(2)) &&
   19696                    isAllOnes(Cond.getOperand(3)));
   19697 
   19698         if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
   19699             isAllOnes(Cond.getOperand(1)) &&
   19700             isZero(Cond.getOperand(2)) &&
   19701             isZero(Cond.getOperand(3))) {
   19702           CanFold = true;
   19703           std::swap(LHS, RHS);
   19704         }
   19705 
   19706         if (CanFold) {
   19707           EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
   19708           SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
   19709           SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
   19710           SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
   19711                                                 NewB, DAG);
   19712           return DAG.getNode(ISD::BITCAST, DL, VT, Select);
   19713         }
   19714       }
   19715     }
   19716   }
   19717 
   19718   // If we know that this node is legal then we know that it is going to be
   19719   // matched by one of the SSE/AVX BLEND instructions. These instructions only
   19720   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
   19721   // to simplify previous instructions.
   19722   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
   19723       !DCI.isBeforeLegalize() &&
   19724       // We explicitly check against v8i16 and v16i16 because, although
   19725       // they're marked as Custom, they might only be legal when Cond is a
   19726       // build_vector of constants. This will be taken care of in a later
   19727       // condition.
   19728       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
   19729        VT != MVT::v8i16)) {
   19730     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
   19731 
   19732     // Don't optimize vector selects that map to mask-registers.
   19733     if (BitWidth == 1)
   19734       return SDValue();
   19735 
   19736     // Check all uses of the condition operand to verify whether it will be
   19737     // consumed by non-BLEND instructions, which may depend on all bits being
   19738     // set properly.
   19739     for (SDNode::use_iterator I = Cond->use_begin(),
   19740                               E = Cond->use_end(); I != E; ++I)
   19741       if (I->getOpcode() != ISD::VSELECT)
   19742         // TODO: Add other opcodes eventually lowered into BLEND.
   19743         return SDValue();
   19744 
   19745     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
   19746     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
   19747 
   19748     APInt KnownZero, KnownOne;
   19749     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
   19750                                           DCI.isBeforeLegalizeOps());
   19751     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
   19752         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
   19753       DCI.CommitTargetLoweringOpt(TLO);
   19754   }
   19755 
   19756   // We should generate an X86ISD::BLENDI from a vselect if its argument
   19757   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
   19758   // constants. This specific pattern gets generated when we split a
   19759   // selector for a 512-bit vector on a machine without AVX512 (but with
   19760   // 256-bit vectors), during legalization:
   19761   //
   19762   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
   19763   //
   19764   // Iff we find this pattern and the build_vectors are built from
   19765   // constants, we translate the vselect into a shuffle_vector that we
   19766   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
   19767   if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
   19768     SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
   19769     if (Shuffle.getNode())
   19770       return Shuffle;
   19771   }
   19772 
   19773   return SDValue();
   19774 }
   19775 
   19776 // Check whether a boolean test is testing a boolean value generated by
   19777 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
   19778 // code.
   19779 //
   19780 // Simplify the following patterns:
   19781 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
   19782 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
   19783 // to (Op EFLAGS Cond)
   19784 //
   19785 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
   19786 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
   19787 // to (Op EFLAGS !Cond)
   19788 //
   19789 // where Op could be BRCOND or CMOV.
   19790 //
   19791 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   19792   // Quit unless this is a CMP, or a SUB whose value result is unused.
   19793   if (Cmp.getOpcode() != X86ISD::CMP &&
   19794       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
   19795     return SDValue();
   19796 
   19797   // Quit if not used as a boolean value.
   19798   if (CC != X86::COND_E && CC != X86::COND_NE)
   19799     return SDValue();
   19800 
   19801   // Check CMP operands. One of them should be 0 or 1 and the other should be
   19802   // a SetCC or extended from it.
   19803   SDValue Op1 = Cmp.getOperand(0);
   19804   SDValue Op2 = Cmp.getOperand(1);
   19805 
   19806   SDValue SetCC;
   19807   const ConstantSDNode* C = nullptr;
   19808   bool needOppositeCond = (CC == X86::COND_E);
   19809   bool checkAgainstTrue = false; // Is it a comparison against 1?
   19810 
   19811   if ((C = dyn_cast<ConstantSDNode>(Op1)))
   19812     SetCC = Op2;
   19813   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
   19814     SetCC = Op1;
   19815   else // Quit if neither operand is a constant.
   19816     return SDValue();
   19817 
   19818   if (C->getZExtValue() == 1) {
   19819     needOppositeCond = !needOppositeCond;
   19820     checkAgainstTrue = true;
   19821   } else if (C->getZExtValue() != 0)
   19822     // Quit if the constant is neither 0 nor 1.
   19823     return SDValue();
   19824 
   19825   bool truncatedToBoolWithAnd = false;
   19826   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
   19827   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
   19828          SetCC.getOpcode() == ISD::TRUNCATE ||
   19829          SetCC.getOpcode() == ISD::AND) {
   19830     if (SetCC.getOpcode() == ISD::AND) {
   19831       int OpIdx = -1;
   19832       ConstantSDNode *CS;
   19833       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
   19834           CS->getZExtValue() == 1)
   19835         OpIdx = 1;
   19836       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
   19837           CS->getZExtValue() == 1)
   19838         OpIdx = 0;
   19839       if (OpIdx == -1)
   19840         break;
   19841       SetCC = SetCC.getOperand(OpIdx);
   19842       truncatedToBoolWithAnd = true;
   19843     } else
   19844       SetCC = SetCC.getOperand(0);
   19845   }
   19846 
   19847   switch (SetCC.getOpcode()) {
   19848   case X86ISD::SETCC_CARRY:
   19849     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
   19850     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
   19851     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
   19852     // truncated to i1 using 'and'.
   19853     if (checkAgainstTrue && !truncatedToBoolWithAnd)
   19854       break;
   19855     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
   19856            "Invalid use of SETCC_CARRY!");
   19857     // FALL THROUGH
   19858   case X86ISD::SETCC:
   19859     // Set the condition code or opposite one if necessary.
   19860     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
   19861     if (needOppositeCond)
   19862       CC = X86::GetOppositeBranchCondition(CC);
   19863     return SetCC.getOperand(1);
   19864   case X86ISD::CMOV: {
   19865     // Check whether the false/true values are canonical, i.e. 0 or 1.
   19866     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
   19867     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
   19868     // Quit if true value is not a constant.
   19869     if (!TVal)
   19870       return SDValue();
   19871     // Quit if false value is not a constant.
   19872     if (!FVal) {
   19873       SDValue Op = SetCC.getOperand(0);
   19874       // Skip 'zext' or 'trunc' node.
   19875       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
   19876           Op.getOpcode() == ISD::TRUNCATE)
   19877         Op = Op.getOperand(0);
   19878       // A special case for rdrand/rdseed, where 0 is set if the false
   19879       // condition is found.
   19880       if ((Op.getOpcode() != X86ISD::RDRAND &&
   19881            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
   19882         return SDValue();
   19883     }
   19884     // Quit if false value is not the constant 0 or 1.
   19885     bool FValIsFalse = true;
   19886     if (FVal && FVal->getZExtValue() != 0) {
   19887       if (FVal->getZExtValue() != 1)
   19888         return SDValue();
   19889       // If FVal is 1, opposite cond is needed.
   19890       needOppositeCond = !needOppositeCond;
   19891       FValIsFalse = false;
   19892     }
   19893     // Quit if TVal is not the constant opposite of FVal.
   19894     if (FValIsFalse && TVal->getZExtValue() != 1)
   19895       return SDValue();
   19896     if (!FValIsFalse && TVal->getZExtValue() != 0)
   19897       return SDValue();
   19898     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
   19899     if (needOppositeCond)
   19900       CC = X86::GetOppositeBranchCondition(CC);
   19901     return SetCC.getOperand(3);
   19902   }
   19903   }
   19904 
   19905   return SDValue();
   19906 }
   19907 
   19908 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
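         /// Note that, unlike ISD::SELECT, the operands are ordered (false value,
         /// true value); the constant-select optimizations below rely on this.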
   19909 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   19910                                   TargetLowering::DAGCombinerInfo &DCI,
   19911                                   const X86Subtarget *Subtarget) {
   19912   SDLoc DL(N);
   19913 
   19914   // If the flag operand isn't dead, don't touch this CMOV.
   19915   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
   19916     return SDValue();
   19917 
   19918   SDValue FalseOp = N->getOperand(0);
   19919   SDValue TrueOp = N->getOperand(1);
   19920   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
   19921   SDValue Cond = N->getOperand(3);
   19922 
   19923   if (CC == X86::COND_E || CC == X86::COND_NE) {
   19924     switch (Cond.getOpcode()) {
   19925     default: break;
   19926     case X86ISD::BSR:
   19927     case X86ISD::BSF:
   19928       // If the BSR / BSF operand is proven never zero, then ZF cannot be set.
   19929       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
   19930         return (CC == X86::COND_E) ? FalseOp : TrueOp;
   19931     }
   19932   }
   19933 
   19934   SDValue Flags;
   19935 
   19936   Flags = checkBoolTestSetCCCombine(Cond, CC);
   19937   if (Flags.getNode() &&
   19938       // Extra check as FCMOV only supports a subset of X86 cond.
   19939       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
   19940     SDValue Ops[] = { FalseOp, TrueOp,
   19941                       DAG.getConstant(CC, MVT::i8), Flags };
   19942     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   19943   }
   19944 
   19945   // If this is a select between two integer constants, try to do some
   19946   // optimizations.  Note that the operands are ordered the opposite of SELECT
   19947   // operands.
   19948   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
   19949     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
   19950       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
   19951       // larger than FalseC (the false value).
   19952       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
   19953         CC = X86::GetOppositeBranchCondition(CC);
   19954         std::swap(TrueC, FalseC);
   19955         std::swap(TrueOp, FalseOp);
   19956       }
   19957 
   19958       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
   19959       // This is efficient for any integer data type (including i8/i16) and
   19960       // shift amount.
   19961       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
   19962         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   19963                            DAG.getConstant(CC, MVT::i8), Cond);
   19964 
   19965         // Zero extend the condition if needed.
   19966         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
   19967 
   19968         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   19969         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
   19970                            DAG.getConstant(ShAmt, MVT::i8));
   19971         if (N->getNumValues() == 2)  // Dead flag value?
   19972           return DCI.CombineTo(N, Cond, SDValue());
   19973         return Cond;
   19974       }
   19975 
   19976       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
   19977       // for any integer data type, including i8/i16.
   19978       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   19979         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   19980                            DAG.getConstant(CC, MVT::i8), Cond);
   19981 
   19982         // Zero extend the condition if needed.
   19983         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   19984                            FalseC->getValueType(0), Cond);
   19985         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   19986                            SDValue(FalseC, 0));
   19987 
   19988         if (N->getNumValues() == 2)  // Dead flag value?
   19989           return DCI.CombineTo(N, Cond, SDValue());
   19990         return Cond;
   19991       }
   19992 
   19993       // Optimize cases that will turn into an LEA instruction.  This requires
   19994       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   19995       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   19996         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   19997         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   19998 
   19999         bool isFastMultiplier = false;
   20000         if (Diff < 10) {
   20001           switch ((unsigned char)Diff) {
   20002           default: break;
   20003           case 1:  // result = add base, cond
   20004           case 2:  // result = lea base(    , cond*2)
   20005           case 3:  // result = lea base(cond, cond*2)
   20006           case 4:  // result = lea base(    , cond*4)
   20007           case 5:  // result = lea base(cond, cond*4)
   20008           case 8:  // result = lea base(    , cond*8)
   20009           case 9:  // result = lea base(cond, cond*8)
   20010             isFastMultiplier = true;
   20011             break;
   20012           }
   20013         }
   20014 
   20015         if (isFastMultiplier) {
   20016           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   20017           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   20018                              DAG.getConstant(CC, MVT::i8), Cond);
   20019           // Zero extend the condition if needed.
   20020           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   20021                              Cond);
   20022           // Scale the condition by the difference.
   20023           if (Diff != 1)
   20024             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   20025                                DAG.getConstant(Diff, Cond.getValueType()));
   20026 
   20027           // Add the base if non-zero.
   20028           if (FalseC->getAPIntValue() != 0)
   20029             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   20030                                SDValue(FalseC, 0));
   20031           if (N->getNumValues() == 2)  // Dead flag value?
   20032             return DCI.CombineTo(N, Cond, SDValue());
   20033           return Cond;
   20034         }
   20035       }
   20036     }
   20037   }
   20038 
   20039   // Handle these cases:
   20040   //   (select (x != c), e, c) -> (select (x != c), e, x),
   20041   //   (select (x == c), c, e) -> (select (x == c), x, e)
   20042   // where the c is an integer constant, and the "select" is the combination
   20043   // of CMOV and CMP.
   20044   //
   20045   // The rationale for this change is that the conditional-move from a constant
   20046   // needs two instructions, whereas a conditional-move from a register needs
   20047   // only one instruction.
   20048   //
   20049   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
   20050   //  some instruction-combining opportunities. This opt needs to be
   20051   //  postponed as late as possible.
   20052   //
   20053   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
   20054     // the DCI.xxxx conditions are provided to postpone the optimization as
   20055     // late as possible.
   20056 
   20057     ConstantSDNode *CmpAgainst = nullptr;
   20058     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
   20059         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
   20060         !isa<ConstantSDNode>(Cond.getOperand(0))) {
   20061 
   20062       if (CC == X86::COND_NE &&
   20063           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
   20064         CC = X86::GetOppositeBranchCondition(CC);
   20065         std::swap(TrueOp, FalseOp);
   20066       }
   20067 
   20068       if (CC == X86::COND_E &&
   20069           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
   20070         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
   20071                           DAG.getConstant(CC, MVT::i8), Cond };
   20072         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   20073       }
   20074     }
   20075   }
   20076 
   20077   return SDValue();
   20078 }
   20079 
   20080 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
   20081                                                 const X86Subtarget *Subtarget) {
   20082   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
   20083   switch (IntNo) {
   20084   default: return SDValue();
   20085   // SSE/AVX/AVX2 blend intrinsics.
   20086   case Intrinsic::x86_avx2_pblendvb:
   20087   case Intrinsic::x86_avx2_pblendw:
   20088   case Intrinsic::x86_avx2_pblendd_128:
   20089   case Intrinsic::x86_avx2_pblendd_256:
   20090     // Don't try to simplify this intrinsic if we don't have AVX2.
   20091     if (!Subtarget->hasAVX2())
   20092       return SDValue();
   20093     // FALL-THROUGH
   20094   case Intrinsic::x86_avx_blend_pd_256:
   20095   case Intrinsic::x86_avx_blend_ps_256:
   20096   case Intrinsic::x86_avx_blendv_pd_256:
   20097   case Intrinsic::x86_avx_blendv_ps_256:
   20098     // Don't try to simplify this intrinsic if we don't have AVX.
   20099     if (!Subtarget->hasAVX())
   20100       return SDValue();
   20101     // FALL-THROUGH
   20102   case Intrinsic::x86_sse41_pblendw:
   20103   case Intrinsic::x86_sse41_blendpd:
   20104   case Intrinsic::x86_sse41_blendps:
   20105   case Intrinsic::x86_sse41_blendvps:
   20106   case Intrinsic::x86_sse41_blendvpd:
   20107   case Intrinsic::x86_sse41_pblendvb: {
   20108     SDValue Op0 = N->getOperand(1);
   20109     SDValue Op1 = N->getOperand(2);
   20110     SDValue Mask = N->getOperand(3);
   20111 
   20112     // Don't try to simplify this intrinsic if we don't have SSE4.1.
   20113     if (!Subtarget->hasSSE41())
   20114       return SDValue();
   20115 
   20116     // fold (blend A, A, Mask) -> A
   20117     if (Op0 == Op1)
   20118       return Op0;
   20119     // fold (blend A, B, allZeros) -> A
   20120     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
   20121       return Op0;
   20122     // fold (blend A, B, allOnes) -> B
   20123     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
   20124       return Op1;
   20125 
   20126     // Simplify the case where the mask is a constant i32 value.
   20127     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
   20128       if (C->isNullValue())
   20129         return Op0;
   20130       if (C->isAllOnesValue())
   20131         return Op1;
   20132     }
   20133 
   20134     return SDValue();
   20135   }
   20136 
   20137   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
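           // For example (illustrative), (int_x86_sse2_psrai_w X, 3) becomes
           // (sra v8i16 X, splat(3)), exposing the shift to the generic combiner.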
   20138   case Intrinsic::x86_sse2_psrai_w:
   20139   case Intrinsic::x86_sse2_psrai_d:
   20140   case Intrinsic::x86_avx2_psrai_w:
   20141   case Intrinsic::x86_avx2_psrai_d:
   20142   case Intrinsic::x86_sse2_psra_w:
   20143   case Intrinsic::x86_sse2_psra_d:
   20144   case Intrinsic::x86_avx2_psra_w:
   20145   case Intrinsic::x86_avx2_psra_d: {
   20146     SDValue Op0 = N->getOperand(1);
   20147     SDValue Op1 = N->getOperand(2);
   20148     EVT VT = Op0.getValueType();
   20149     assert(VT.isVector() && "Expected a vector type!");
   20150 
   20151     if (isa<BuildVectorSDNode>(Op1))
   20152       Op1 = Op1.getOperand(0);
   20153 
   20154     if (!isa<ConstantSDNode>(Op1))
   20155       return SDValue();
   20156 
   20157     EVT SVT = VT.getVectorElementType();
   20158     unsigned SVTBits = SVT.getSizeInBits();
   20159 
   20160     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
   20161     APInt C(SVTBits, CND->getAPIntValue().getZExtValue());
   20162     uint64_t ShAmt = C.getZExtValue();
   20163 
   20164     // Don't try to convert this shift into an ISD::SRA if the shift
   20165     // count is bigger than or equal to the element size.
   20166     if (ShAmt >= SVTBits)
   20167       return SDValue();
   20168 
   20169     // Trivial case: if the shift count is zero, then fold this
   20170     // into the first operand.
   20171     if (ShAmt == 0)
   20172       return Op0;
   20173 
   20174     // Replace this packed shift intrinsic with a target-independent
   20175     // shift DAG node.
   20176     SDValue Splat = DAG.getConstant(C, VT);
   20177     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
   20178   }
   20179   }
   20180 }
   20181 
   20182 /// PerformMulCombine - Optimize a single multiply by a constant into two
   20183 /// in order to implement it with two cheaper instructions, e.g.
   20184 /// LEA + SHL, LEA + LEA.
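         /// For example (illustrative), x * 24 factors as 3 * 8 and lowers roughly to
         ///   t = x << 3;  result = t + t*2   // one SHL, then one LEA
         /// with the power-of-two factor issued first (the order is reversed when
         /// the product's lone use is an ADD).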
   20185 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
   20186                                  TargetLowering::DAGCombinerInfo &DCI) {
   20187   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
   20188     return SDValue();
   20189 
   20190   EVT VT = N->getValueType(0);
   20191   if (VT != MVT::i64)
   20192     return SDValue();
   20193 
   20194   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   20195   if (!C)
   20196     return SDValue();
   20197   uint64_t MulAmt = C->getZExtValue();
   20198   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
   20199     return SDValue();
   20200 
   20201   uint64_t MulAmt1 = 0;
   20202   uint64_t MulAmt2 = 0;
   20203   if ((MulAmt % 9) == 0) {
   20204     MulAmt1 = 9;
   20205     MulAmt2 = MulAmt / 9;
   20206   } else if ((MulAmt % 5) == 0) {
   20207     MulAmt1 = 5;
   20208     MulAmt2 = MulAmt / 5;
   20209   } else if ((MulAmt % 3) == 0) {
   20210     MulAmt1 = 3;
   20211     MulAmt2 = MulAmt / 3;
   20212   }
   20213   if (MulAmt2 &&
   20214       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
   20215     SDLoc DL(N);
   20216 
   20217     if (isPowerOf2_64(MulAmt2) &&
   20218         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
   20219       // If the second multiplier is a power of two, issue it first. We want
   20220       // the multiply by 3, 5, or 9 to be folded into the addressing mode
   20221       // unless the lone use is an add.
   20222       std::swap(MulAmt1, MulAmt2);
   20223 
   20224     SDValue NewMul;
   20225     if (isPowerOf2_64(MulAmt1))
   20226       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   20227                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
   20228     else
   20229       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   20230                            DAG.getConstant(MulAmt1, VT));
   20231 
   20232     if (isPowerOf2_64(MulAmt2))
   20233       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
   20234                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
   20235     else
   20236       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
   20237                            DAG.getConstant(MulAmt2, VT));
   20238 
   20239     // Do not add new nodes to DAG combiner worklist.
   20240     DCI.CombineTo(N, NewMul, false);
   20241   }
   20242   return SDValue();
   20243 }
   20244 
   20245 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
   20246   SDValue N0 = N->getOperand(0);
   20247   SDValue N1 = N->getOperand(1);
   20248   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   20249   EVT VT = N0.getValueType();
   20250 
   20251   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
   20252   // since the result of setcc_c is all zeros or all ones.
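           // This is safe because setcc_c is 0 or all ones: e.g. (shl (and X, 1), 3)
           // becomes (and X, 8), since shifting the mask instead of the value
           // changes nothing when every bit of X matches its sign bit.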
   20253   if (VT.isInteger() && !VT.isVector() &&
   20254       N1C && N0.getOpcode() == ISD::AND &&
   20255       N0.getOperand(1).getOpcode() == ISD::Constant) {
   20256     SDValue N00 = N0.getOperand(0);
   20257     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
   20258         ((N00.getOpcode() == ISD::ANY_EXTEND ||
   20259           N00.getOpcode() == ISD::ZERO_EXTEND) &&
   20260          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
   20261       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
   20262       APInt ShAmt = N1C->getAPIntValue();
   20263       Mask = Mask.shl(ShAmt);
   20264       if (Mask != 0)
   20265         return DAG.getNode(ISD::AND, SDLoc(N), VT,
   20266                            N00, DAG.getConstant(Mask, VT));
   20267     }
   20268   }
   20269 
   20270   // Hardware support for vector shifts is sparse, which forces us to scalarize
   20271   // the vector operations in many cases. Also, on Sandy Bridge ADD is faster
   20272   // than SHL.
   20273   // (shl V, 1) -> add V,V
   20274   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
   20275     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
   20276       assert(N0.getValueType().isVector() && "Invalid vector shift type");
   20277       // We shift all of the values by one. In many cases we do not have
   20278       // hardware support for this operation. This is better expressed as an ADD
   20279       // of two values.
   20280       if (N1SplatC->getZExtValue() == 1)
   20281         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
   20282     }
   20283 
   20284   return SDValue();
   20285 }
   20286 
   20287 /// \brief Returns a vector of 0s if the input node is a vector logical
   20288 /// shift by a constant amount which is known to be bigger than or equal
   20289 /// to the vector element size in bits.
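         /// For example (illustrative), (srl v4i32 X, splat(32)) folds to the zero
         /// vector: PSRLD with a count of 32 or more shifts every bit out of each
         /// element.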
   20290 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
   20291                                       const X86Subtarget *Subtarget) {
   20292   EVT VT = N->getValueType(0);
   20293 
   20294   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
   20295       (!Subtarget->hasInt256() ||
   20296        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
   20297     return SDValue();
   20298 
   20299   SDValue Amt = N->getOperand(1);
   20300   SDLoc DL(N);
   20301   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
   20302     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
   20303       APInt ShiftAmt = AmtSplat->getAPIntValue();
   20304       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
   20305 
   20306       // SSE2/AVX2 logical shifts always return a vector of 0s
   20307       // if the shift amount is bigger than or equal to
   20308       // the element size. The constant shift amount will be
   20309       // encoded as an 8-bit immediate.
   20310       if (ShiftAmt.trunc(8).uge(MaxAmount))
   20311         return getZeroVector(VT, Subtarget, DAG, DL);
   20312     }
   20313 
   20314   return SDValue();
   20315 }
   20316 
   20317 /// PerformShiftCombine - Combine shifts.
   20318 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   20319                                    TargetLowering::DAGCombinerInfo &DCI,
   20320                                    const X86Subtarget *Subtarget) {
   20321   if (N->getOpcode() == ISD::SHL) {
   20322     SDValue V = PerformSHLCombine(N, DAG);
   20323     if (V.getNode()) return V;
   20324   }
   20325 
   20326   if (N->getOpcode() != ISD::SRA) {
   20327     // Try to fold this logical shift into a zero vector.
   20328     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
   20329     if (V.getNode()) return V;
   20330   }
   20331 
   20332   return SDValue();
   20333 }
   20334 
   20335 // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ...))
   20336 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
   20337 // and friends.  Likewise for OR -> CMPNEQSS.
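         // For example (a sketch of the effect), the ordered-equal test
         //   (and (setcc E, (cmp a, b)), (setcc NP, (cmp a, b)))
         // becomes a single CMPEQSS producing an all-ones/all-zeros mask, which is
         // then bitcast to an integer and masked down to a single bit.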
   20338 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
   20339                             TargetLowering::DAGCombinerInfo &DCI,
   20340                             const X86Subtarget *Subtarget) {
   20341   unsigned opcode;
   20342 
   20343   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
   20344   // we're requiring SSE2 for both.
   20345   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
   20346     SDValue N0 = N->getOperand(0);
   20347     SDValue N1 = N->getOperand(1);
   20348     SDValue CMP0 = N0->getOperand(1);
   20349     SDValue CMP1 = N1->getOperand(1);
   20350     SDLoc DL(N);
   20351 
   20352     // The SETCCs should both refer to the same CMP.
   20353     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
   20354       return SDValue();
   20355 
   20356     SDValue CMP00 = CMP0->getOperand(0);
   20357     SDValue CMP01 = CMP0->getOperand(1);
   20358     EVT     VT    = CMP00.getValueType();
   20359 
   20360     if (VT == MVT::f32 || VT == MVT::f64) {
   20361       bool ExpectingFlags = false;
   20362       // Check for any users that want flags:
   20363       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
   20364            !ExpectingFlags && UI != UE; ++UI)
   20365         switch (UI->getOpcode()) {
   20366         default:
   20367         case ISD::BR_CC:
   20368         case ISD::BRCOND:
   20369         case ISD::SELECT:
   20370           ExpectingFlags = true;
   20371           break;
   20372         case ISD::CopyToReg:
   20373         case ISD::SIGN_EXTEND:
   20374         case ISD::ZERO_EXTEND:
   20375         case ISD::ANY_EXTEND:
   20376           break;
   20377         }
   20378 
   20379       if (!ExpectingFlags) {
   20380         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
   20381         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
   20382 
   20383         if (cc1 == X86::COND_E || cc1 == X86::COND_NE)
   20384           std::swap(cc0, cc1);
   20388 
   20389         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
   20390             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
   20391           // FIXME: need symbolic constants for these magic numbers.
   20392           // See X86ATTInstPrinter.cpp:printSSECC().
   20393           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
   20394           if (Subtarget->hasAVX512()) {
   20395             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
   20396                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
   20397             if (N->getValueType(0) != MVT::i1)
   20398               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
   20399                                  FSetCC);
   20400             return FSetCC;
   20401           }
   20402           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
   20403                                               CMP00.getValueType(), CMP00, CMP01,
   20404                                               DAG.getConstant(x86cc, MVT::i8));
   20405 
   20406           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
   20407           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
   20408 
   20409           if (is64BitFP && !Subtarget->is64Bit()) {
   20410             // On a 32-bit target, we cannot bitcast the 64-bit float to a
   20411             // 64-bit integer, since that's not a legal type. Since
   20412             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
   20413             // bits, but can do this little dance to extract the lowest 32 bits
   20414             // and work with those going forward.
   20415             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
   20416                                            OnesOrZeroesF);
   20417             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
   20418                                            Vector64);
   20419             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
   20420                                         Vector32, DAG.getIntPtrConstant(0));
   20421             IntVT = MVT::i32;
   20422           }
   20423 
   20424           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
   20425           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
   20426                                       DAG.getConstant(1, IntVT));
   20427           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
   20428           return OneBitOfTruth;
   20429         }
   20430       }
   20431     }
   20432   }
   20433   return SDValue();
   20434 }
   20435 
   20436 /// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
   20437 /// so it can be folded inside ANDNP.
   20438 static bool CanFoldXORWithAllOnes(const SDNode *N) {
   20439   EVT VT = N->getValueType(0);
   20440 
   20441   // Match direct AllOnes for 128- and 256-bit vectors.
   20442   if (ISD::isBuildVectorAllOnes(N))
   20443     return true;
   20444 
   20445   // Look through a bit convert.
   20446   if (N->getOpcode() == ISD::BITCAST)
   20447     N = N->getOperand(0).getNode();
   20448 
   20449   // Sometimes the operand may come from an insert_subvector building a 256-bit
   20450   // all-ones vector.
   20451   if (VT.is256BitVector() &&
   20452       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
   20453     SDValue V1 = N->getOperand(0);
   20454     SDValue V2 = N->getOperand(1);
   20455 
   20456     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
   20457         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
   20458         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
   20459         ISD::isBuildVectorAllOnes(V2.getNode()))
   20460       return true;
   20461   }
   20462 
   20463   return false;
   20464 }
   20465 
   20466 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
   20467 // register. In most cases we actually compare or select YMM-sized registers,
   20468 // and mixing the two types creates horrible code. This method optimizes
   20469 // some of the transition sequences.
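         // For example (illustrative), with A and B of type v8i32,
         //   (sext (and (trunc A), (trunc B)) : v8i16)
         // becomes (sext_inreg (and A, B)), keeping the logic in YMM registers
         // instead of bouncing through a narrower XMM value.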
   20470 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   20471                                  TargetLowering::DAGCombinerInfo &DCI,
   20472                                  const X86Subtarget *Subtarget) {
   20473   EVT VT = N->getValueType(0);
   20474   if (!VT.is256BitVector())
   20475     return SDValue();
   20476 
   20477   assert((N->getOpcode() == ISD::ANY_EXTEND ||
   20478           N->getOpcode() == ISD::ZERO_EXTEND ||
   20479           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
   20480 
   20481   SDValue Narrow = N->getOperand(0);
   20482   EVT NarrowVT = Narrow->getValueType(0);
   20483   if (!NarrowVT.is128BitVector())
   20484     return SDValue();
   20485 
   20486   if (Narrow->getOpcode() != ISD::XOR &&
   20487       Narrow->getOpcode() != ISD::AND &&
   20488       Narrow->getOpcode() != ISD::OR)
   20489     return SDValue();
   20490 
   20491   SDValue N0  = Narrow->getOperand(0);
   20492   SDValue N1  = Narrow->getOperand(1);
   20493   SDLoc DL(Narrow);
   20494 
   20495   // The left side has to be a trunc.
   20496   if (N0.getOpcode() != ISD::TRUNCATE)
   20497     return SDValue();
   20498 
   20499   // The type of the truncated inputs.
   20500   EVT WideVT = N0->getOperand(0)->getValueType(0);
   20501   if (WideVT != VT)
   20502     return SDValue();
   20503 
   20504   // The right side has to be a 'trunc' or a constant vector.
   20505   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
   20506   ConstantSDNode *RHSConstSplat = nullptr;
   20507   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
   20508     RHSConstSplat = RHSBV->getConstantSplatNode();
   20509   if (!RHSTrunc && !RHSConstSplat)
   20510     return SDValue();
   20511 
   20512   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20513 
   20514   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
   20515     return SDValue();
   20516 
   20517   // Set N0 and N1 to hold the inputs to the new wide operation.
   20518   N0 = N0->getOperand(0);
   20519   if (RHSConstSplat) {
   20520     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
   20521                      SDValue(RHSConstSplat, 0));
   20522     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
   20523     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
   20524   } else if (RHSTrunc) {
   20525     N1 = N1->getOperand(0);
   20526   }
   20527 
   20528   // Generate the wide operation.
   20529   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
   20530   unsigned Opcode = N->getOpcode();
   20531   switch (Opcode) {
   20532   case ISD::ANY_EXTEND:
   20533     return Op;
   20534   case ISD::ZERO_EXTEND: {
   20535     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
   20536     APInt Mask = APInt::getAllOnesValue(InBits);
   20537     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
   20538     return DAG.getNode(ISD::AND, DL, VT,
   20539                        Op, DAG.getConstant(Mask, VT));
   20540   }
   20541   case ISD::SIGN_EXTEND:
   20542     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
   20543                        Op, DAG.getValueType(NarrowVT));
   20544   default:
   20545     llvm_unreachable("Unexpected opcode");
   20546   }
   20547 }
   20548 
   20549 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   20550                                  TargetLowering::DAGCombinerInfo &DCI,
   20551                                  const X86Subtarget *Subtarget) {
   20552   EVT VT = N->getValueType(0);
   20553   if (DCI.isBeforeLegalizeOps())
   20554     return SDValue();
   20555 
   20556   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
   20557   if (R.getNode())
   20558     return R;
   20559 
   20560   // Create BEXTR instructions
   20561   // BEXTR is ((X >> imm) & (2**size-1))
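           // For example (illustrative), ((x >> 4) & 0xFFF) becomes a BEXTR with
           // control word (4 | (12 << 8)) == 0x0C04: start bit 4, length 12 bits.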
   20562   if (VT == MVT::i32 || VT == MVT::i64) {
   20563     SDValue N0 = N->getOperand(0);
   20564     SDValue N1 = N->getOperand(1);
   20565     SDLoc DL(N);
   20566 
   20567     // Check for BEXTR.
   20568     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
   20569         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
   20570       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
   20571       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   20572       if (MaskNode && ShiftNode) {
   20573         uint64_t Mask = MaskNode->getZExtValue();
   20574         uint64_t Shift = ShiftNode->getZExtValue();
   20575         if (isMask_64(Mask)) {
   20576           uint64_t MaskSize = CountPopulation_64(Mask);
   20577           if (Shift + MaskSize <= VT.getSizeInBits())
   20578             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
   20579                                DAG.getConstant(Shift | (MaskSize << 8), VT));
   20580         }
   20581       }
   20582     } // BEXTR
   20583 
   20584     return SDValue();
   20585   }
   20586 
   20587   // Want to form ANDNP nodes:
   20588   // 1) In the hopes of then easily combining them with OR and AND nodes
   20589   //    to form PBLEND/PSIGN.
   20590   // 2) To match ANDN packed intrinsics
   20591   if (VT != MVT::v2i64 && VT != MVT::v4i64)
   20592     return SDValue();
   20593 
   20594   SDValue N0 = N->getOperand(0);
   20595   SDValue N1 = N->getOperand(1);
   20596   SDLoc DL(N);
   20597 
   20598   // Check LHS for vnot
   20599   if (N0.getOpcode() == ISD::XOR &&
   20601       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
   20602     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
   20603 
   20604   // Check RHS for vnot
   20605   if (N1.getOpcode() == ISD::XOR &&
   20607       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
   20608     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
   20609 
   20610   return SDValue();
   20611 }
   20612 
   20613 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   20614                                 TargetLowering::DAGCombinerInfo &DCI,
   20615                                 const X86Subtarget *Subtarget) {
   20616   if (DCI.isBeforeLegalizeOps())
   20617     return SDValue();
   20618 
   20619   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
   20620   if (R.getNode())
   20621     return R;
   20622 
   20623   SDValue N0 = N->getOperand(0);
   20624   SDValue N1 = N->getOperand(1);
   20625   EVT VT = N->getValueType(0);
   20626 
   20627   // Look for psign/blend.
   20628   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
   20629     if (!Subtarget->hasSSSE3() ||
   20630         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
   20631       return SDValue();
   20632 
   20633     // Canonicalize pandn to RHS
   20634     if (N0.getOpcode() == X86ISD::ANDNP)
   20635       std::swap(N0, N1);
   20636     // or (and (m, y), (pandn m, x))
   20637     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
   20638       SDValue Mask = N1.getOperand(0);
   20639       SDValue X    = N1.getOperand(1);
   20640       SDValue Y;
   20641       if (N0.getOperand(0) == Mask)
   20642         Y = N0.getOperand(1);
   20643       if (N0.getOperand(1) == Mask)
   20644         Y = N0.getOperand(0);
   20645 
   20646       // Check to see if the mask appeared in both the AND and ANDNP.
   20647       if (!Y.getNode())
   20648         return SDValue();
   20649 
   20650       // X, Y, and Mask may be wrapped in bitcasts; look through them.
   20652       if (Mask.getOpcode() == ISD::BITCAST)
   20653         Mask = Mask.getOperand(0);
   20654       if (X.getOpcode() == ISD::BITCAST)
   20655         X = X.getOperand(0);
   20656       if (Y.getOpcode() == ISD::BITCAST)
   20657         Y = Y.getOperand(0);
   20658 
   20659       EVT MaskVT = Mask.getValueType();
   20660 
   20661       // Validate that the Mask operand is a vector sra node.
   20662       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
   20663       // there is no psrai.b
   20664       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
   20665       unsigned SraAmt = ~0;
   20666       if (Mask.getOpcode() == ISD::SRA) {
   20667         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
   20668           if (auto *AmtConst = AmtBV->getConstantSplatNode())
   20669             SraAmt = AmtConst->getZExtValue();
   20670       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
   20671         SDValue SraC = Mask.getOperand(1);
   20672         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
   20673       }
   20674       if ((SraAmt + 1) != EltBits)
   20675         return SDValue();
   20676 
   20677       SDLoc DL(N);
   20678 
   20679       // Now we know we at least have a pblendvb with the mask val.  See if
   20680       // we can form a psignb/w/d.
   20681       // psign = x.type == y.type == mask.type && y = sub(0, x);
   20682       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
   20683           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
   20684           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
   20685         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
   20686                "Unsupported VT for PSIGN");
   20687         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
   20688         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
   20689       }
   20690       // PBLENDVB is only available on SSE 4.1.
   20691       if (!Subtarget->hasSSE41())
   20692         return SDValue();
   20693 
   20694       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
   20695 
   20696       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
   20697       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
   20698       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
   20699       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
   20700       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
   20701     }
   20702   }
   20703 
   20704   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
   20705     return SDValue();
   20706 
   20707   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
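           // For example (illustrative, 32-bit): (or (shl x, 12), (srl y, 20))
           // becomes SHLD x, y, 12, which shifts x left and fills the vacated low
           // bits from the high bits of y.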
   20708   MachineFunction &MF = DAG.getMachineFunction();
   20709   bool OptForSize = MF.getFunction()->getAttributes().
   20710     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
   20711 
   20712   // SHLD/SHRD instructions have lower register pressure, but on some
   20713   // platforms they have higher latency than the equivalent
   20714   // shift/or sequence that would otherwise be generated.
   20715   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   20716   // have higher latencies and we are not optimizing for size.
   20717   if (!OptForSize && Subtarget->isSHLDSlow())
   20718     return SDValue();
   20719 
   20720   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
   20721     std::swap(N0, N1);
   20722   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
   20723     return SDValue();
   20724   if (!N0.hasOneUse() || !N1.hasOneUse())
   20725     return SDValue();
   20726 
   20727   SDValue ShAmt0 = N0.getOperand(1);
   20728   if (ShAmt0.getValueType() != MVT::i8)
   20729     return SDValue();
   20730   SDValue ShAmt1 = N1.getOperand(1);
   20731   if (ShAmt1.getValueType() != MVT::i8)
   20732     return SDValue();
   20733   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
   20734     ShAmt0 = ShAmt0.getOperand(0);
   20735   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
   20736     ShAmt1 = ShAmt1.getOperand(0);
   20737 
   20738   SDLoc DL(N);
   20739   unsigned Opc = X86ISD::SHLD;
   20740   SDValue Op0 = N0.getOperand(0);
   20741   SDValue Op1 = N1.getOperand(0);
   20742   if (ShAmt0.getOpcode() == ISD::SUB) {
   20743     Opc = X86ISD::SHRD;
   20744     std::swap(Op0, Op1);
   20745     std::swap(ShAmt0, ShAmt1);
   20746   }
   20747 
   20748   unsigned Bits = VT.getSizeInBits();
   20749   if (ShAmt1.getOpcode() == ISD::SUB) {
   20750     SDValue Sum = ShAmt1.getOperand(0);
   20751     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
   20752       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
   20753       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
   20754         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
   20755       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
   20756         return DAG.getNode(Opc, DL, VT,
   20757                            Op0, Op1,
   20758                            DAG.getNode(ISD::TRUNCATE, DL,
   20759                                        MVT::i8, ShAmt0));
   20760     }
   20761   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
   20762     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
   20763     if (ShAmt0C &&
   20764         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
   20765       return DAG.getNode(Opc, DL, VT,
   20766                          N0.getOperand(0), N1.getOperand(0),
   20767                          DAG.getNode(ISD::TRUNCATE, DL,
   20768                                        MVT::i8, ShAmt0));
   20769   }
   20770 
   20771   return SDValue();
   20772 }
   20773 
   20774 // Generate NEG and CMOV for integer abs.
   20775 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
   20776   EVT VT = N->getValueType(0);
   20777 
   20778   // Since X86 does not have CMOV for 8-bit integers, we don't convert
   20779   // 8-bit integer abs to NEG and CMOV.
   20780   if (VT.isInteger() && VT.getSizeInBits() == 8)
   20781     return SDValue();
   20782 
   20783   SDValue N0 = N->getOperand(0);
   20784   SDValue N1 = N->getOperand(1);
   20785   SDLoc DL(N);
   20786 
   20787   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
   20788   // and change it to SUB and CMOV.
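           // This matches the classic branchless abs, where Y = X >> (bits-1) is
           // the sign mask: (X + Y) ^ Y equals |X|. It is rebuilt here as a
           // subtraction from zero whose flags feed a CMOV selecting between X
           // and -X.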
   20789   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
   20790       N0.getOpcode() == ISD::ADD &&
   20791       N0.getOperand(1) == N1 &&
   20792       N1.getOpcode() == ISD::SRA &&
   20793       N1.getOperand(0) == N0.getOperand(0))
   20794     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
   20795       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
   20796         // Generate SUB & CMOV.
   20797         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
   20798                                   DAG.getConstant(0, VT), N0.getOperand(0));
   20799 
   20800         SDValue Ops[] = { N0.getOperand(0), Neg,
   20801                           DAG.getConstant(X86::COND_GE, MVT::i8),
   20802                           SDValue(Neg.getNode(), 1) };
   20803         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
   20804       }
   20805   return SDValue();
   20806 }
   20807 
   20808 // PerformXorCombine - Do target-specific dag combines on XOR nodes.
   20809 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
   20810                                  TargetLowering::DAGCombinerInfo &DCI,
   20811                                  const X86Subtarget *Subtarget) {
   20812   if (DCI.isBeforeLegalizeOps())
   20813     return SDValue();
   20814 
   20815   if (Subtarget->hasCMov()) {
   20816     SDValue RV = performIntegerAbsCombine(N, DAG);
   20817     if (RV.getNode())
   20818       return RV;
   20819   }
   20820 
   20821   return SDValue();
   20822 }
   20823 
   20824 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
   20825 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   20826                                   TargetLowering::DAGCombinerInfo &DCI,
   20827                                   const X86Subtarget *Subtarget) {
   20828   LoadSDNode *Ld = cast<LoadSDNode>(N);
   20829   EVT RegVT = Ld->getValueType(0);
   20830   EVT MemVT = Ld->getMemoryVT();
   20831   SDLoc dl(Ld);
   20832   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20833   unsigned RegSz = RegVT.getSizeInBits();
   20834 
   20835   // On Sandy Bridge, unaligned 256-bit loads are inefficient.
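           // For example (illustrative), an unaligned 32-byte (load v8f32) is split
           // into two 16-byte loads at Ptr and Ptr+16, recombined with
           // INSERT_SUBVECTOR, with the two chains joined by a TokenFactor.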
   20836   ISD::LoadExtType Ext = Ld->getExtensionType();
   20837   unsigned Alignment = Ld->getAlignment();
   20838   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
   20839   if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
   20840       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
   20841     unsigned NumElems = RegVT.getVectorNumElements();
   20842     if (NumElems < 2)
   20843       return SDValue();
   20844 
   20845     SDValue Ptr = Ld->getBasePtr();
   20846     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
   20847 
   20848     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   20849                                   NumElems/2);
   20850     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   20851                                 Ld->getPointerInfo(), Ld->isVolatile(),
   20852                                 Ld->isNonTemporal(), Ld->isInvariant(),
   20853                                 Alignment);
   20854     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   20855     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   20856                                 Ld->getPointerInfo(), Ld->isVolatile(),
   20857                                 Ld->isNonTemporal(), Ld->isInvariant(),
   20858                                 std::min(16U, Alignment));
   20859     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   20860                              Load1.getValue(1),
   20861                              Load2.getValue(1));
   20862 
   20863     SDValue NewVec = DAG.getUNDEF(RegVT);
   20864     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
   20865     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
   20866     return DCI.CombineTo(N, NewVec, TF, true);
   20867   }
   20868 
   20869   // If this is a vector EXT load then attempt to optimize it using a
   20870   // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
   20871   // expansion is still better than scalar code.
   20872   // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
   20873   // emit a shuffle and an arithmetic shift.
   20874   // TODO: It is possible to support ZExt by zeroing the undef values
   20875   // during the shuffle phase or after the shuffle.
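           // For instance (a sketch), a sextload of v4i8 to v4i32 can be done with
           // one scalar i32 load, a shuffle placing each byte in the top byte of
           // its 32-bit lane, and a final (sra v4i32 X, splat(24)).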
   20876   if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
   20877       (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
   20878     assert(MemVT != RegVT && "Cannot extend to the same type");
   20879     assert(MemVT.isVector() && "Must load a vector from memory");
   20880 
   20881     unsigned NumElems = RegVT.getVectorNumElements();
   20882     unsigned MemSz = MemVT.getSizeInBits();
   20883     assert(RegSz > MemSz && "Register size must be greater than the mem size");
   20884 
   20885     if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
   20886       return SDValue();
   20887 
   20888     // All sizes must be a power of two.
   20889     if (!isPowerOf2_32(RegSz * MemSz * NumElems))
   20890       return SDValue();
   20891 
   20892     // Attempt to load the original value using scalar loads.
   20893     // Find the largest scalar type that divides the total loaded size.
   20894     MVT SclrLoadTy = MVT::i8;
   20895     for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
   20896          tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
   20897       MVT Tp = (MVT::SimpleValueType)tp;
   20898       if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
   20899         SclrLoadTy = Tp;
   20900       }
   20901     }
   20902 
   20903     // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
   20904     if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
   20905         (64 <= MemSz))
   20906       SclrLoadTy = MVT::f64;
   20907 
   20908     // Calculate the number of scalar loads that we need to perform
   20909     // in order to load our vector from memory.
   20910     unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
   20911     if (Ext == ISD::SEXTLOAD && NumLoads > 1)
   20912       return SDValue();
   20913 
   20914     unsigned loadRegSize = RegSz;
   20915     if (Ext == ISD::SEXTLOAD && RegSz == 256)
   20916       loadRegSize /= 2;
   20917 
   20918     // Represent our vector as a sequence of elements of the largest
   20919     // scalar type that we can load.
   20920     EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
   20921       loadRegSize/SclrLoadTy.getSizeInBits());
   20922 
   20923     // Represent the data using the same element type that is stored in
   20924     // memory. In practice, we "widen" MemVT.
   20925     EVT WideVecVT =
   20926           EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   20927                        loadRegSize/MemVT.getScalarType().getSizeInBits());
   20928 
   20929     assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
   20930       "Invalid vector type");
   20931 
   20932     // We can't shuffle using an illegal type.
   20933     if (!TLI.isTypeLegal(WideVecVT))
   20934       return SDValue();
   20935 
   20936     SmallVector<SDValue, 8> Chains;
   20937     SDValue Ptr = Ld->getBasePtr();
   20938     SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
   20939                                         TLI.getPointerTy());
   20940     SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
   20941 
   20942     for (unsigned i = 0; i < NumLoads; ++i) {
   20943       // Perform a single load.
   20944       SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
   20945                                        Ptr, Ld->getPointerInfo(),
   20946                                        Ld->isVolatile(), Ld->isNonTemporal(),
   20947                                        Ld->isInvariant(), Ld->getAlignment());
   20948       Chains.push_back(ScalarLoad.getValue(1));
   20949       // Create the first element with SCALAR_TO_VECTOR in order to avoid
   20950       // another round of DAG combining.
   20951       if (i == 0)
   20952         Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
   20953       else
   20954         Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
   20955                           ScalarLoad, DAG.getIntPtrConstant(i));
   20956 
   20957       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   20958     }
   20959 
   20960     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   20961 
   20962     // Bitcast the loaded value to a vector of the original element type, in
   20963     // the size of the target vector type.
   20964     SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
   20965     unsigned SizeRatio = RegSz/MemSz;
   20966 
   20967     if (Ext == ISD::SEXTLOAD) {
   20968       // If we have SSE4.1 we can directly emit a VSEXT node.
   20969       if (Subtarget->hasSSE41()) {
   20970         SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
   20971         return DCI.CombineTo(N, Sext, TF, true);
   20972       }
   20973 
   20974       // Otherwise we'll shuffle the small elements into the high bits of the
   20975       // larger type and perform an arithmetic shift. If the shift is not legal
   20976       // it's better to scalarize.
   20977       if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
   20978         return SDValue();
   20979 
   20980       // Redistribute the loaded elements into the different locations.
   20981       SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   20982       for (unsigned i = 0; i != NumElems; ++i)
   20983         ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
   20984 
   20985       SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
   20986                                            DAG.getUNDEF(WideVecVT),
   20987                                            &ShuffleVec[0]);
   20988 
   20989       Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
   20990 
   20991       // Build the arithmetic shift.
   20992       unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
   20993                      MemVT.getVectorElementType().getSizeInBits();
   20994       Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
   20995                           DAG.getConstant(Amt, RegVT));
   20996 
   20997       return DCI.CombineTo(N, Shuff, TF, true);
   20998     }
   20999 
   21000     // Redistribute the loaded elements into the different locations.
   21001     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   21002     for (unsigned i = 0; i != NumElems; ++i)
   21003       ShuffleVec[i*SizeRatio] = i;
   21004 
   21005     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
   21006                                          DAG.getUNDEF(WideVecVT),
   21007                                          &ShuffleVec[0]);
   21008 
   21009     // Bitcast to the requested type.
   21010     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
   21011     // Replace the original load with the new sequence
   21012     // and return the new chain.
   21013     return DCI.CombineTo(N, Shuff, TF, true);
   21014   }
   21015 
   21016   return SDValue();
   21017 }
   21018 
   21019 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
   21020 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   21021                                    const X86Subtarget *Subtarget) {
   21022   StoreSDNode *St = cast<StoreSDNode>(N);
   21023   EVT VT = St->getValue().getValueType();
   21024   EVT StVT = St->getMemoryVT();
   21025   SDLoc dl(St);
   21026   SDValue StoredVal = St->getOperand(1);
   21027   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   21028 
   21029   // If we are saving a concatenation of two XMM registers, perform two stores.
   21030   // On Sandy Bridge, 256-bit memory operations are executed by two
   21031   // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
   21032   // memory operation.
   21033   unsigned Alignment = St->getAlignment();
   21034   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
   21035   if (VT.is256BitVector() && !Subtarget->hasInt256() &&
   21036       StVT == VT && !IsAligned) {
   21037     unsigned NumElems = VT.getVectorNumElements();
   21038     if (NumElems < 2)
   21039       return SDValue();
   21040 
   21041     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
   21042     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
   21043 
   21044     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
   21045     SDValue Ptr0 = St->getBasePtr();
   21046     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
   21047 
   21048     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
   21049                                 St->getPointerInfo(), St->isVolatile(),
   21050                                 St->isNonTemporal(), Alignment);
   21051     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
   21052                                 St->getPointerInfo(), St->isVolatile(),
   21053                                 St->isNonTemporal(),
   21054                                 std::min(16U, Alignment));
   21055     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   21056   }
   21057 
   21058   // Optimize trunc store (of multiple scalars) to shuffle and store.
   21059   // First, pack all of the elements in one place. Next, store to memory
   21060   // in fewer chunks.
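           // For example (illustrative), a truncating store of v8i32 to v8i8
           // shuffles the eight low bytes to the front of a v32i8 and then stores
           // the packed data in one wide chunk (a single 64-bit store where that
           // type is legal) instead of eight byte stores.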
   21061   if (St->isTruncatingStore() && VT.isVector()) {
   21062     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   21063     unsigned NumElems = VT.getVectorNumElements();
   21064     assert(StVT != VT && "Cannot truncate to the same type");
   21065     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
   21066     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
   21067 
   21068     // From/To sizes and ElemCount must be powers of two.
   21069     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
   21070     // We are going to use the original vector elt for storing.
   21071     // Accumulated smaller vector elements must be a multiple of the store size.
   21072     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
   21073 
   21074     unsigned SizeRatio  = FromSz / ToSz;
   21075 
   21076     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   21077 
   21078     // Create a type on which we perform the shuffle
   21079     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   21080             StVT.getScalarType(), NumElems*SizeRatio);
   21081 
   21082     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   21083 
   21084     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
   21085     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   21086     for (unsigned i = 0; i != NumElems; ++i)
   21087       ShuffleVec[i] = i * SizeRatio;
   21088 
   21089     // Can't shuffle using an illegal type.
   21090     if (!TLI.isTypeLegal(WideVecVT))
   21091       return SDValue();
   21092 
   21093     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   21094                                          DAG.getUNDEF(WideVecVT),
   21095                                          &ShuffleVec[0]);
   21096     // At this point all of the data is stored at the bottom of the
   21097     // register. We now need to save it to memory.
   21098 
   21099     // Find the largest store unit
   21100     MVT StoreType = MVT::i8;
   21101     for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
   21102          tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
   21103       MVT Tp = (MVT::SimpleValueType)tp;
   21104       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
   21105         StoreType = Tp;
   21106     }
   21107 
   21108     // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
   21109     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
   21110         (64 <= NumElems * ToSz))
   21111       StoreType = MVT::f64;
   21112 
   21113     // Bitcast the original vector into a vector of store-size units
   21114     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
   21115             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
   21116     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
   21117     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
   21118     SmallVector<SDValue, 8> Chains;
   21119     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
   21120                                         TLI.getPointerTy());
   21121     SDValue Ptr = St->getBasePtr();
   21122 
   21123     // Perform one or more big stores into memory.
   21124     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
   21125       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   21126                                    StoreType, ShuffWide,
   21127                                    DAG.getIntPtrConstant(i));
   21128       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
   21129                                 St->getPointerInfo(), St->isVolatile(),
   21130                                 St->isNonTemporal(), St->getAlignment());
   21131       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   21132       Chains.push_back(Ch);
   21133     }
   21134 
   21135     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   21136   }
   21137 
   21138   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   21139   // the FP state in cases where an emms may be missing.
   21140   // A preferable solution to the general problem is to figure out the right
   21141   // places to insert EMMS.  This qualifies as a quick hack.
   21142 
   21143   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
   21144   if (VT.getSizeInBits() != 64)
   21145     return SDValue();
   21146 
   21147   const Function *F = DAG.getMachineFunction().getFunction();
   21148   bool NoImplicitFloatOps = F->getAttributes().
   21149     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
   21150   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
   21151                      && Subtarget->hasSSE2();
   21152   if ((VT.isVector() ||
   21153        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
   21154       isa<LoadSDNode>(St->getValue()) &&
   21155       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
   21156       St->getChain().hasOneUse() && !St->isVolatile()) {
   21157     SDNode* LdVal = St->getValue().getNode();
   21158     LoadSDNode *Ld = nullptr;
   21159     int TokenFactorIndex = -1;
   21160     SmallVector<SDValue, 8> Ops;
   21161     SDNode* ChainVal = St->getChain().getNode();
   21162     // Must be a store of a load.  We currently handle two cases:  the load
   21163     // is a direct child, or it's under an intervening TokenFactor.  It is
   21164     // possible to dig deeper under nested TokenFactors.
   21165     if (ChainVal == LdVal)
   21166       Ld = cast<LoadSDNode>(St->getChain());
   21167     else if (St->getValue().hasOneUse() &&
   21168              ChainVal->getOpcode() == ISD::TokenFactor) {
   21169       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
   21170         if (ChainVal->getOperand(i).getNode() == LdVal) {
   21171           TokenFactorIndex = i;
   21172           Ld = cast<LoadSDNode>(St->getValue());
   21173         } else
   21174           Ops.push_back(ChainVal->getOperand(i));
   21175       }
   21176     }
   21177 
   21178     if (!Ld || !ISD::isNormalLoad(Ld))
   21179       return SDValue();
   21180 
   21181     // If this is not the MMX case, i.e. we are just turning i64 load/store
   21182     // into f64 load/store, avoid the transformation if there are multiple
   21183     // uses of the loaded value.
   21184     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
   21185       return SDValue();
   21186 
   21187     SDLoc LdDL(Ld);
   21188     SDLoc StDL(N);
   21189     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
   21190     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
   21191     // pair instead.
   21192     if (Subtarget->is64Bit() || F64IsLegal) {
   21193       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
   21194       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
   21195                                   Ld->getPointerInfo(), Ld->isVolatile(),
   21196                                   Ld->isNonTemporal(), Ld->isInvariant(),
   21197                                   Ld->getAlignment());
   21198       SDValue NewChain = NewLd.getValue(1);
   21199       if (TokenFactorIndex != -1) {
   21200         Ops.push_back(NewChain);
   21201         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
   21202       }
   21203       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
   21204                           St->getPointerInfo(),
   21205                           St->isVolatile(), St->isNonTemporal(),
   21206                           St->getAlignment());
   21207     }
   21208 
   21209     // Otherwise, lower to two pairs of 32-bit loads / stores.
   21210     SDValue LoAddr = Ld->getBasePtr();
   21211     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
   21212                                  DAG.getConstant(4, MVT::i32));
   21213 
   21214     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
   21215                                Ld->getPointerInfo(),
   21216                                Ld->isVolatile(), Ld->isNonTemporal(),
   21217                                Ld->isInvariant(), Ld->getAlignment());
   21218     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
   21219                                Ld->getPointerInfo().getWithOffset(4),
   21220                                Ld->isVolatile(), Ld->isNonTemporal(),
   21221                                Ld->isInvariant(),
   21222                                MinAlign(Ld->getAlignment(), 4));
   21223 
   21224     SDValue NewChain = LoLd.getValue(1);
   21225     if (TokenFactorIndex != -1) {
   21226       Ops.push_back(LoLd);
   21227       Ops.push_back(HiLd);
   21228       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
   21229     }
   21230 
   21231     LoAddr = St->getBasePtr();
   21232     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
   21233                          DAG.getConstant(4, MVT::i32));
   21234 
   21235     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
   21236                                 St->getPointerInfo(),
   21237                                 St->isVolatile(), St->isNonTemporal(),
   21238                                 St->getAlignment());
   21239     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
   21240                                 St->getPointerInfo().getWithOffset(4),
   21241                                 St->isVolatile(),
   21242                                 St->isNonTemporal(),
   21243                                 MinAlign(St->getAlignment(), 4));
   21244     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   21245   }
   21246   return SDValue();
   21247 }
   21248 
   21249 /// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
   21250 /// and return the operands for the horizontal operation in LHS and RHS.  A
   21251 /// horizontal operation performs the binary operation on successive elements
   21252 /// of its first operand, then on successive elements of its second operand,
   21253 /// returning the resulting values in a vector.  For example, if
   21254 ///   A = < float a0, float a1, float a2, float a3 >
   21255 /// and
   21256 ///   B = < float b0, float b1, float b2, float b3 >
   21257 /// then the result of doing a horizontal operation on A and B is
   21258 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
   21259 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
   21260 /// A horizontal-op B, for some already available A and B, and if so then LHS is
   21261 /// set to A, RHS to B, and the routine returns 'true'.
   21262 /// Note that the binary operation should have the property that if one of the
   21263 /// operands is UNDEF then the result is UNDEF.
   21264 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   21265   // Look for the following pattern: if
   21266   //   A = < float a0, float a1, float a2, float a3 >
   21267   //   B = < float b0, float b1, float b2, float b3 >
   21268   // and
   21269   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
   21270   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
   21271   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   21272   // which is A horizontal-op B.
   21273 
   21274   // At least one of the operands should be a vector shuffle.
   21275   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
   21276       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
   21277     return false;
   21278 
   21279   MVT VT = LHS.getSimpleValueType();
   21280 
   21281   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   21282          "Unsupported vector type for horizontal add/sub");
   21283 
   21284   // Handle 128- and 256-bit vector lengths. AVX defines horizontal add/sub to
   21285   // operate independently on 128-bit lanes.
   21286   unsigned NumElts = VT.getVectorNumElements();
   21287   unsigned NumLanes = VT.getSizeInBits()/128;
   21288   unsigned NumLaneElts = NumElts / NumLanes;
   21289   assert((NumLaneElts % 2 == 0) &&
   21290          "Vector type should have an even number of elements in each lane");
   21291   unsigned HalfLaneElts = NumLaneElts/2;
   21292 
   21293   // View LHS in the form
   21294   //   LHS = VECTOR_SHUFFLE A, B, LMask
   21295   // If LHS is not a shuffle then pretend it is the shuffle
   21296   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
   21297   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
   21298   // type VT.
   21299   SDValue A, B;
   21300   SmallVector<int, 16> LMask(NumElts);
   21301   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   21302     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
   21303       A = LHS.getOperand(0);
   21304     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
   21305       B = LHS.getOperand(1);
   21306     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
   21307     std::copy(Mask.begin(), Mask.end(), LMask.begin());
   21308   } else {
   21309     if (LHS.getOpcode() != ISD::UNDEF)
   21310       A = LHS;
   21311     for (unsigned i = 0; i != NumElts; ++i)
   21312       LMask[i] = i;
   21313   }
   21314 
   21315   // Likewise, view RHS in the form
   21316   //   RHS = VECTOR_SHUFFLE C, D, RMask
   21317   SDValue C, D;
   21318   SmallVector<int, 16> RMask(NumElts);
   21319   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   21320     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
   21321       C = RHS.getOperand(0);
   21322     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
   21323       D = RHS.getOperand(1);
   21324     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
   21325     std::copy(Mask.begin(), Mask.end(), RMask.begin());
   21326   } else {
   21327     if (RHS.getOpcode() != ISD::UNDEF)
   21328       C = RHS;
   21329     for (unsigned i = 0; i != NumElts; ++i)
   21330       RMask[i] = i;
   21331   }
   21332 
   21333   // Check that the shuffles are both shuffling the same vectors.
   21334   if (!(A == C && B == D) && !(A == D && B == C))
   21335     return false;
   21336 
   21337   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
   21338   if (!A.getNode() && !B.getNode())
   21339     return false;
   21340 
   21341   // If A and B occur in reverse order in RHS, then "swap" them (which means
   21342   // rewriting the mask).
   21343   if (A != C)
   21344     CommuteVectorShuffleMask(RMask, NumElts);
   21345 
   21346   // At this point LHS and RHS are equivalent to
   21347   //   LHS = VECTOR_SHUFFLE A, B, LMask
   21348   //   RHS = VECTOR_SHUFFLE A, B, RMask
   21349   // Check that the masks correspond to performing a horizontal operation.
   21350   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   21351     for (unsigned i = 0; i != NumLaneElts; ++i) {
   21352       int LIdx = LMask[i+l], RIdx = RMask[i+l];
   21353 
   21354       // Ignore any UNDEF components.
   21355       if (LIdx < 0 || RIdx < 0 ||
   21356           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
   21357           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
   21358         continue;
   21359 
   21360       // Check that successive elements are being operated on.  If not, this is
   21361       // not a horizontal operation.
   21362       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
   21363       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
   21364       if (!(LIdx == Index && RIdx == Index + 1) &&
   21365           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
   21366         return false;
   21367     }
   21368   }
   21369 
   21370   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   21371   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
   21372   return true;
   21373 }
   21374 
   21375 /// PerformFADDCombine - Do target-specific dag combines on floating point adds.
   21376 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
   21377                                   const X86Subtarget *Subtarget) {
   21378   EVT VT = N->getValueType(0);
   21379   SDValue LHS = N->getOperand(0);
   21380   SDValue RHS = N->getOperand(1);
   21381 
   21382   // Try to synthesize horizontal adds from adds of shuffles.
   21383   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   21384        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   21385       isHorizontalBinOp(LHS, RHS, true))
   21386     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
   21387   return SDValue();
   21388 }
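         // An illustrative sketch of the fold above (operands hypothetical):
         //   (fadd (vector_shuffle A, B, <0,2,4,6>),
         //         (vector_shuffle A, B, <1,3,5,7>))
         //     --> (X86ISD::FHADD A, B)
         // which selects to HADDPS for v4f32.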
   21389 
   21390 /// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
   21391 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
   21392                                   const X86Subtarget *Subtarget) {
   21393   EVT VT = N->getValueType(0);
   21394   SDValue LHS = N->getOperand(0);
   21395   SDValue RHS = N->getOperand(1);
   21396 
   21397   // Try to synthesize horizontal subs from subs of shuffles.
   21398   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   21399        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   21400       isHorizontalBinOp(LHS, RHS, false))
   21401     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
   21402   return SDValue();
   21403 }
   21404 
   21405 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
   21406 /// X86ISD::FXOR nodes.
   21407 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
   21408   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
   21409   // F[X]OR(0.0, x) -> x
   21410   // F[X]OR(x, 0.0) -> x
   21411   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   21412     if (C->getValueAPF().isPosZero())
   21413       return N->getOperand(1);
   21414   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   21415     if (C->getValueAPF().isPosZero())
   21416       return N->getOperand(0);
   21417   return SDValue();
   21418 }
   21419 
   21420 /// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
   21421 /// X86ISD::FMAX nodes.
   21422 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
   21423   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
   21424 
   21425   // Only perform optimizations if UnsafeMath is used.
   21426   if (!DAG.getTarget().Options.UnsafeFPMath)
   21427     return SDValue();
   21428 
   21429   // In unsafe-math mode, convert the FMIN and FMAX nodes into FMINC and
   21430   // FMAXC, which are commutative operations.
   21431   unsigned NewOp = 0;
   21432   switch (N->getOpcode()) {
   21433     default: llvm_unreachable("unknown opcode");
   21434     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
   21435     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
   21436   }
   21437 
   21438   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
   21439                      N->getOperand(0), N->getOperand(1));
   21440 }
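         // A sketch of the rewrite above:
         //   (X86ISD::FMIN x, y) --> (X86ISD::FMINC x, y)
         //   (X86ISD::FMAX x, y) --> (X86ISD::FMAXC x, y)
         // This is only sound under unsafe-math: MINPS/MAXPS favor their second
         // operand for NaN inputs and so are not commutative in general.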
   21441 
   21442 /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
   21443 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   21444   // FAND(0.0, x) -> 0.0
   21445   // FAND(x, 0.0) -> 0.0
   21446   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   21447     if (C->getValueAPF().isPosZero())
   21448       return N->getOperand(0);
   21449   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   21450     if (C->getValueAPF().isPosZero())
   21451       return N->getOperand(1);
   21452   return SDValue();
   21453 }
   21454 
   21455 /// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
   21456 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
   21457   // FANDN(0.0, x) -> x
   21458   // FANDN(x, 0.0) -> 0.0
   21459   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   21460     if (C->getValueAPF().isPosZero())
   21461       return N->getOperand(1);
   21462   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   21463     if (C->getValueAPF().isPosZero())
   21464       return N->getOperand(1);
   21465   return SDValue();
   21466 }
   21467 
   21468 static SDValue PerformBTCombine(SDNode *N,
   21469                                 SelectionDAG &DAG,
   21470                                 TargetLowering::DAGCombinerInfo &DCI) {
   21471   // BT ignores high bits in the bit index operand.
   21472   SDValue Op1 = N->getOperand(1);
   21473   if (Op1.hasOneUse()) {
   21474     unsigned BitWidth = Op1.getValueSizeInBits();
   21475     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
   21476     APInt KnownZero, KnownOne;
   21477     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   21478                                           !DCI.isBeforeLegalizeOps());
   21479     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   21480     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
   21481         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
   21482       DCI.CommitTargetLoweringOpt(TLO);
   21483   }
   21484   return SDValue();
   21485 }
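         // For example (a hypothetical instance of this combine): with an i32
         // bit-index only the low Log2_32(32) = 5 bits are demanded, so
         //   (X86ISD::BT x, (and y, 31)) --> (X86ISD::BT x, y)
         // once SimplifyDemandedBits drops the redundant mask.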
   21486 
   21487 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
   21488   SDValue Op = N->getOperand(0);
   21489   if (Op.getOpcode() == ISD::BITCAST)
   21490     Op = Op.getOperand(0);
   21491   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
   21492   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
   21493       VT.getVectorElementType().getSizeInBits() ==
   21494       OpVT.getVectorElementType().getSizeInBits()) {
   21495     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
   21496   }
   21497   return SDValue();
   21498 }
   21499 
   21500 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
   21501                                                const X86Subtarget *Subtarget) {
   21502   EVT VT = N->getValueType(0);
   21503   if (!VT.isVector())
   21504     return SDValue();
   21505 
   21506   SDValue N0 = N->getOperand(0);
   21507   SDValue N1 = N->getOperand(1);
   21508   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
   21509   SDLoc dl(N);
   21510 
   21511   // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE
   21512   // and AVX2, since there is no sign-extended shift right operation on a
   21513   // vector with 64-bit elements.
   21514   // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
   21515   //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
   21516   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
   21517       N0.getOpcode() == ISD::SIGN_EXTEND)) {
   21518     SDValue N00 = N0.getOperand(0);
   21519 
   21520     // EXTLOAD has a better solution on AVX2: it may be replaced with an
   21521     // X86ISD::VSEXT node.
   21522     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
   21523       if (!ISD::isNormalLoad(N00.getNode()))
   21524         return SDValue();
   21525 
   21526     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
   21527       SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
   21528                                 N00, N1);
   21529       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
   21530     }
   21531   }
   21532   return SDValue();
   21533 }
   21534 
   21535 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
   21536                                   TargetLowering::DAGCombinerInfo &DCI,
   21537                                   const X86Subtarget *Subtarget) {
   21538   if (!DCI.isBeforeLegalizeOps())
   21539     return SDValue();
   21540 
   21541   if (!Subtarget->hasFp256())
   21542     return SDValue();
   21543 
   21544   EVT VT = N->getValueType(0);
   21545   if (VT.isVector() && VT.getSizeInBits() == 256) {
   21546     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
   21547     if (R.getNode())
   21548       return R;
   21549   }
   21550 
   21551   return SDValue();
   21552 }
   21553 
   21554 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
   21555                                  const X86Subtarget* Subtarget) {
   21556   SDLoc dl(N);
   21557   EVT VT = N->getValueType(0);
   21558 
   21559   // Let legalize expand this if it isn't a legal type yet.
   21560   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   21561     return SDValue();
   21562 
   21563   EVT ScalarVT = VT.getScalarType();
   21564   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
   21565       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
   21566     return SDValue();
   21567 
   21568   SDValue A = N->getOperand(0);
   21569   SDValue B = N->getOperand(1);
   21570   SDValue C = N->getOperand(2);
   21571 
   21572   bool NegA = (A.getOpcode() == ISD::FNEG);
   21573   bool NegB = (B.getOpcode() == ISD::FNEG);
   21574   bool NegC = (C.getOpcode() == ISD::FNEG);
   21575 
   21576   // The multiplication is negated when exactly one of A, B is negated.
   21577   bool NegMul = (NegA != NegB);
   21578   if (NegA)
   21579     A = A.getOperand(0);
   21580   if (NegB)
   21581     B = B.getOperand(0);
   21582   if (NegC)
   21583     C = C.getOperand(0);
   21584 
   21585   unsigned Opcode;
   21586   if (!NegMul)
   21587     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
   21588   else
   21589     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
   21590 
   21591   return DAG.getNode(Opcode, dl, VT, A, B, C);
   21592 }
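         // A sketch of the FNEG foldings performed above:
         //   (fma (fneg A), B, C)        --> (X86ISD::FNMADD A, B, C)
         //   (fma A, B, (fneg C))        --> (X86ISD::FMSUB  A, B, C)
         //   (fma (fneg A), (fneg B), C) --> (X86ISD::FMADD  A, B, C)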
   21593 
   21594 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   21595                                   TargetLowering::DAGCombinerInfo &DCI,
   21596                                   const X86Subtarget *Subtarget) {
   21597   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
   21598   //           (and (i32 x86isd::setcc_carry), 1)
   21599   // This eliminates the zext. This transformation is necessary because
   21600   // ISD::SETCC is always legalized to i8.
   21601   SDLoc dl(N);
   21602   SDValue N0 = N->getOperand(0);
   21603   EVT VT = N->getValueType(0);
   21604 
   21605   if (N0.getOpcode() == ISD::AND &&
   21606       N0.hasOneUse() &&
   21607       N0.getOperand(0).hasOneUse()) {
   21608     SDValue N00 = N0.getOperand(0);
   21609     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   21610       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   21611       if (!C || C->getZExtValue() != 1)
   21612         return SDValue();
   21613       return DAG.getNode(ISD::AND, dl, VT,
   21614                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   21615                                      N00.getOperand(0), N00.getOperand(1)),
   21616                          DAG.getConstant(1, VT));
   21617     }
   21618   }
   21619 
   21620   if (N0.getOpcode() == ISD::TRUNCATE &&
   21621       N0.hasOneUse() &&
   21622       N0.getOperand(0).hasOneUse()) {
   21623     SDValue N00 = N0.getOperand(0);
   21624     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   21625       return DAG.getNode(ISD::AND, dl, VT,
   21626                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   21627                                      N00.getOperand(0), N00.getOperand(1)),
   21628                          DAG.getConstant(1, VT));
   21629     }
   21630   }
   21631   if (VT.is256BitVector()) {
   21632     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
   21633     if (R.getNode())
   21634       return R;
   21635   }
   21636 
   21637   return SDValue();
   21638 }
   21639 
   21640 // Optimize x == -y --> x+y == 0
   21641 //          x != -y --> x+y != 0
   21642 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   21643                                       const X86Subtarget* Subtarget) {
   21644   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
   21645   SDValue LHS = N->getOperand(0);
   21646   SDValue RHS = N->getOperand(1);
   21647   EVT VT = N->getValueType(0);
   21648   SDLoc DL(N);
   21649 
   21650   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
   21651     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
   21652       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
   21653         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
   21654                                    LHS.getValueType(), RHS, LHS.getOperand(1));
   21655         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
   21656                             addV, DAG.getConstant(0, addV.getValueType()), CC);
   21657       }
   21658   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
   21659     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
   21660       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
   21661         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
   21662                                    RHS.getValueType(), LHS, RHS.getOperand(1));
   21663         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
   21664                             addV, DAG.getConstant(0, addV.getValueType()), CC);
   21665       }
   21666 
   21667   if (VT.getScalarType() == MVT::i1) {
   21668     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
   21669       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
   21670     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
   21671     if (!IsSEXT0 && !IsVZero0)
   21672       return SDValue();
   21673     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
   21674       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
   21675     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
   21676 
   21677     if (!IsSEXT1 && !IsVZero1)
   21678       return SDValue();
   21679 
   21680     if (IsSEXT0 && IsVZero1) {
   21681       assert(VT == LHS.getOperand(0).getValueType() && "Unexpected operand type");
   21682       if (CC == ISD::SETEQ)
   21683         return DAG.getNOT(DL, LHS.getOperand(0), VT);
   21684       return LHS.getOperand(0);
   21685     }
   21686     if (IsSEXT1 && IsVZero0) {
   21687       assert(VT == RHS.getOperand(0).getValueType() && "Unexpected operand type");
   21688       if (CC == ISD::SETEQ)
   21689         return DAG.getNOT(DL, RHS.getOperand(0), VT);
   21690       return RHS.getOperand(0);
   21691     }
   21692   }
   21693 
   21694   return SDValue();
   21695 }
   21696 
   21697 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
   21698                                       const X86Subtarget *Subtarget) {
   21699   SDLoc dl(N);
   21700   MVT VT = N->getOperand(1)->getSimpleValueType(0);
   21701   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
   21702          "X86insertps is only defined for v4x32");
   21703 
   21704   SDValue Ld = N->getOperand(1);
   21705   if (MayFoldLoad(Ld)) {
   21706     // Extract the countS bits from the immediate so we can get the proper
   21707     // address when narrowing the vector load to a specific element.
   21708     // When the second source operand is a memory address, insertps doesn't
   21709     // use countS and just loads an f32 from that address.
   21710     unsigned DestIndex =
   21711         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
   21712     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
   21713   } else
   21714     return SDValue();
   21715 
   21716   // Create this as a scalar to vector to match the instruction pattern.
   21717   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
   21718   // countS bits are ignored when loading from memory on insertps, which
   21719   // means we don't need to explicitly set them to 0.
   21720   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
   21721                      LoadScalarToVector, N->getOperand(2));
   21722 }
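         // e.g. rather than loading an entire <4 x float> just to insert one
         // lane, the combine above loads only the f32 selected by countS (a
         // narrower, cheaper access) and lets insertps place it.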
   21723 
   21724 // Helper function of PerformSETCCCombine.  It materializes "setb reg" as
   21725 // "sbb reg,reg", since sbb can be extended without a zext and produces an
   21726 // all-ones bit which is more useful than 0/1 in some cases.
   21727 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
   21728                                MVT VT) {
   21729   if (VT == MVT::i8)
   21730     return DAG.getNode(ISD::AND, DL, VT,
   21731                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   21732                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
   21733                        DAG.getConstant(1, VT));
   21734   assert(VT == MVT::i1 && "Unexpected type for SETCC node");
   21735   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
   21736                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   21737                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
   21738 }
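         // For example, "sbb %eax, %eax" leaves %eax equal to 0xFFFFFFFF when the
         // carry is set and 0 otherwise (register chosen arbitrarily); the AND
         // with 1 above then recovers a plain 0/1 value when one is required.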
   21739 
   21740 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
   21741 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
   21742                                    TargetLowering::DAGCombinerInfo &DCI,
   21743                                    const X86Subtarget *Subtarget) {
   21744   SDLoc DL(N);
   21745   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
   21746   SDValue EFLAGS = N->getOperand(1);
   21747 
   21748   if (CC == X86::COND_A) {
   21749     // Try to convert COND_A into COND_B in an attempt to facilitate
   21750     // materializing "setb reg".
   21751     //
   21752     // Do not flip "e > c", where "c" is a constant, because the CMP
   21753     // instruction cannot take an immediate as its first operand.
   21754     //
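             // A sketch of the flip: (setcc COND_A, (sub x, y)) becomes a COND_B
             // setb materialized from (sub y, x), since unsigned "x > y" is
             // exactly "y < x".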
   21755     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
   21756         EFLAGS.getValueType().isInteger() &&
   21757         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
   21758       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
   21759                                    EFLAGS.getNode()->getVTList(),
   21760                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
   21761       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
   21762       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
   21763     }
   21764   }
   21765 
   21766   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
   21767   // a zext and produces an all-ones bit which is more useful than 0/1 in some
   21768   // cases.
   21769   if (CC == X86::COND_B)
   21770     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
   21771 
   21772   SDValue Flags;
   21773 
   21774   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
   21775   if (Flags.getNode()) {
   21776     SDValue Cond = DAG.getConstant(CC, MVT::i8);
   21777     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
   21778   }
   21779 
   21780   return SDValue();
   21781 }
   21782 
   21783 // Optimize branch condition evaluation.
   21784 //
   21785 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
   21786                                     TargetLowering::DAGCombinerInfo &DCI,
   21787                                     const X86Subtarget *Subtarget) {
   21788   SDLoc DL(N);
   21789   SDValue Chain = N->getOperand(0);
   21790   SDValue Dest = N->getOperand(1);
   21791   SDValue EFLAGS = N->getOperand(3);
   21792   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
   21793 
   21794   SDValue Flags;
   21795 
   21796   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
   21797   if (Flags.getNode()) {
   21798     SDValue Cond = DAG.getConstant(CC, MVT::i8);
   21799     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
   21800                        Flags);
   21801   }
   21802 
   21803   return SDValue();
   21804 }
   21805 
   21806 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   21807                                         const X86TargetLowering *XTLI) {
   21808   SDValue Op0 = N->getOperand(0);
   21809   EVT InVT = Op0->getValueType(0);
   21810 
   21811   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)), for v4i8 and v8i8.
   21812   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
   21813     SDLoc dl(N);
   21814     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
   21815     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
   21816     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
   21817   }
   21818 
   21819   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation on 32-bit
   21820   // targets, where SSE has no i64 -> FP conversion.
   21821   if (Op0.getOpcode() == ISD::LOAD) {
   21822     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
   21823     EVT VT = Ld->getValueType(0);
   21824     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
   21825         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
   21826         !XTLI->getSubtarget()->is64Bit() &&
   21827         VT == MVT::i64) {
   21828       SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
   21829                                           Ld->getChain(), Op0, DAG);
   21830       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
   21831       return FILDChain;
   21832     }
   21833   }
   21834   return SDValue();
   21835 }
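         // For example, on a 32-bit target "(double)some_i64" cannot use SSE's
         // CVTSI2SD (there is no i64 source form outside 64-bit mode), so the
         // load feeding the conversion is folded into an x87 FILD here.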
   21836 
   21837 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
   21838 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
   21839                                  X86TargetLowering::DAGCombinerInfo &DCI) {
   21840   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
   21841   // the result is either zero or one (depending on the input carry bit).
   21842   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
   21843   if (X86::isZeroNode(N->getOperand(0)) &&
   21844       X86::isZeroNode(N->getOperand(1)) &&
   21845       // We don't have a good way to replace an EFLAGS use, so only do this
   21846       // when the flag result is dead.
   21847       SDValue(N, 1).use_empty()) {
   21848     SDLoc DL(N);
   21849     EVT VT = N->getValueType(0);
   21850     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
   21851     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
   21852                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   21853                                            DAG.getConstant(X86::COND_B,MVT::i8),
   21854                                            N->getOperand(2)),
   21855                                DAG.getConstant(1, VT));
   21856     return DCI.CombineTo(N, Res1, CarryOut);
   21857   }
   21858 
   21859   return SDValue();
   21860 }
   21861 
   21862 // fold (add Y, (sete  X, 0)) -> adc  0, Y
   21863 //      (add Y, (setne X, 0)) -> sbb -1, Y
   21864 //      (sub (sete  X, 0), Y) -> sbb  0, Y
   21865 //      (sub (setne X, 0), Y) -> adc -1, Y
   21866 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
   21867   SDLoc DL(N);
   21868 
   21869   // Look through ZExts.
   21870   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
   21871   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
   21872     return SDValue();
   21873 
   21874   SDValue SetCC = Ext.getOperand(0);
   21875   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
   21876     return SDValue();
   21877 
   21878   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
   21879   if (CC != X86::COND_E && CC != X86::COND_NE)
   21880     return SDValue();
   21881 
   21882   SDValue Cmp = SetCC.getOperand(1);
   21883   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
   21884       !X86::isZeroNode(Cmp.getOperand(1)) ||
   21885       !Cmp.getOperand(0).getValueType().isInteger())
   21886     return SDValue();
   21887 
   21888   SDValue CmpOp0 = Cmp.getOperand(0);
   21889   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
   21890                                DAG.getConstant(1, CmpOp0.getValueType()));
   21891 
   21892   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
   21893   if (CC == X86::COND_NE)
   21894     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
   21895                        DL, OtherVal.getValueType(), OtherVal,
   21896                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
   21897   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
   21898                      DL, OtherVal.getValueType(), OtherVal,
   21899                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
   21900 }
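         // As a concrete sketch (registers hypothetical), "y + (x == 0)" can be
         // selected via the first fold above as:
         //   cmpl $1, %edi        # CF = 1 iff x == 0 (unsigned x < 1)
         //   adcl $0, %esi        # y += CF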
   21901 
   21902 /// PerformAddCombine - Do target-specific dag combines on integer adds.
   21903 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
   21904                                  const X86Subtarget *Subtarget) {
   21905   EVT VT = N->getValueType(0);
   21906   SDValue Op0 = N->getOperand(0);
   21907   SDValue Op1 = N->getOperand(1);
   21908 
   21909   // Try to synthesize horizontal adds from adds of shuffles.
   21910   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   21911        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   21912       isHorizontalBinOp(Op0, Op1, true))
   21913     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
   21914 
   21915   return OptimizeConditionalInDecrement(N, DAG);
   21916 }
   21917 
   21918 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
   21919                                  const X86Subtarget *Subtarget) {
   21920   SDValue Op0 = N->getOperand(0);
   21921   SDValue Op1 = N->getOperand(1);
   21922 
   21923   // X86 can't encode an immediate LHS of a sub. See if we can push the
   21924   // negation into a preceding instruction.
   21925   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
   21926     // If the RHS of the sub is a XOR with one use and a constant, invert the
   21927     // immediate. Then add one to the LHS of the sub so we can turn
   21928     // X-Y -> X+~Y+1, saving one register.
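             // A worked instance (constants hypothetical):
             //   (sub 5, (xor x, 2)) --> (add (xor x, ~2), 6)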
   21929     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
   21930         isa<ConstantSDNode>(Op1.getOperand(1))) {
   21931       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
   21932       EVT VT = Op0.getValueType();
   21933       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
   21934                                    Op1.getOperand(0),
   21935                                    DAG.getConstant(~XorC, VT));
   21936       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
   21937                          DAG.getConstant(C->getAPIntValue()+1, VT));
   21938     }
   21939   }
   21940 
   21941   // Try to synthesize horizontal subs from subs of shuffles.
   21942   EVT VT = N->getValueType(0);
   21943   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   21944        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   21945       isHorizontalBinOp(Op0, Op1, true))
   21946     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
   21947 
   21948   return OptimizeConditionalInDecrement(N, DAG);
   21949 }
   21950 
   21951 /// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT nodes.
   21952 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
   21953                                         TargetLowering::DAGCombinerInfo &DCI,
   21954                                         const X86Subtarget *Subtarget) {
   21955   // (vzext (bitcast (vzext x))) -> (vzext x)
   21956   SDValue In = N->getOperand(0);
   21957   while (In.getOpcode() == ISD::BITCAST)
   21958     In = In.getOperand(0);
   21959 
   21960   if (In.getOpcode() != X86ISD::VZEXT)
   21961     return SDValue();
   21962 
   21963   return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
   21964                      In.getOperand(0));
   21965 }
   21966 
   21967 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   21968                                              DAGCombinerInfo &DCI) const {
   21969   SelectionDAG &DAG = DCI.DAG;
   21970   switch (N->getOpcode()) {
   21971   default: break;
   21972   case ISD::EXTRACT_VECTOR_ELT:
   21973     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   21974   case ISD::VSELECT:
   21975   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
   21976   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   21977   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   21978   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
   21979   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
   21980   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
   21981   case ISD::SHL:
   21982   case ISD::SRA:
   21983   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
   21984   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   21985   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   21986   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
   21987   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
   21988   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   21989   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
   21990   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   21991   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
   21992   case X86ISD::FXOR:
   21993   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   21994   case X86ISD::FMIN:
   21995   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
   21996   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
   21997   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
   21998   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   21999   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   22000   case ISD::ANY_EXTEND:
   22001   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
   22002   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
   22003   case ISD::SIGN_EXTEND_INREG:
   22004     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
   22005   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
   22006   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
   22007   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
   22008   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
   22009   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   22010   case X86ISD::SHUFP:       // Handle all target specific shuffles
   22011   case X86ISD::PALIGNR:
   22012   case X86ISD::UNPCKH:
   22013   case X86ISD::UNPCKL:
   22014   case X86ISD::MOVHLPS:
   22015   case X86ISD::MOVLHPS:
   22016   case X86ISD::PSHUFD:
   22017   case X86ISD::PSHUFHW:
   22018   case X86ISD::PSHUFLW:
   22019   case X86ISD::MOVSS:
   22020   case X86ISD::MOVSD:
   22021   case X86ISD::VPERMILP:
   22022   case X86ISD::VPERM2X128:
   22023   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   22024   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
   22025   case ISD::INTRINSIC_WO_CHAIN:
   22026     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
   22027   case X86ISD::INSERTPS:
   22028     return PerformINSERTPSCombine(N, DAG, Subtarget);
   22029   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
   22030   }
   22031 
   22032   return SDValue();
   22033 }
   22034 
   22035 /// isTypeDesirableForOp - Return true if the target has native support for
   22036 /// the specified value type and it is 'desirable' to use the type for the
   22037 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
   22038 /// instruction encodings are longer and some i16 instructions are slow.
   22039 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   22040   if (!isTypeLegal(VT))
   22041     return false;
   22042   if (VT != MVT::i16)
   22043     return true;
   22044 
   22045   switch (Opc) {
   22046   default:
   22047     return true;
   22048   case ISD::LOAD:
   22049   case ISD::SIGN_EXTEND:
   22050   case ISD::ZERO_EXTEND:
   22051   case ISD::ANY_EXTEND:
   22052   case ISD::SHL:
   22053   case ISD::SRL:
   22054   case ISD::SUB:
   22055   case ISD::ADD:
   22056   case ISD::MUL:
   22057   case ISD::AND:
   22058   case ISD::OR:
   22059   case ISD::XOR:
   22060     return false;
   22061   }
   22062 }
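         // For instance, "addw $1, %ax" needs a 0x66 operand-size prefix that the
         // i32 form "addl $1, %eax" avoids, which is one reason the i16 forms of
         // the operations above are reported as undesirable.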
   22063 
   22064 /// IsDesirableToPromoteOp - This method queries the target whether it is
   22065 /// beneficial for the dag combiner to promote the specified node. If true, it
   22066 /// should return the desired promotion type by reference.
   22067 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   22068   EVT VT = Op.getValueType();
   22069   if (VT != MVT::i16)
   22070     return false;
   22071 
   22072   bool Promote = false;
   22073   bool Commute = false;
   22074   switch (Op.getOpcode()) {
   22075   default: break;
   22076   case ISD::LOAD: {
   22077     LoadSDNode *LD = cast<LoadSDNode>(Op);
   22078     // If the non-extending load has a single use and it's not live out, then it
   22079     // might be folded.
   22080     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
   22081                                                      Op.hasOneUse()*/) {
   22082       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   22083              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
   22084         // The only case where we'd want to promote LOAD (rather than it being
   22085         // promoted as an operand) is when its only use is a liveout.
   22086         if (UI->getOpcode() != ISD::CopyToReg)
   22087           return false;
   22088       }
   22089     }
   22090     Promote = true;
   22091     break;
   22092   }
   22093   case ISD::SIGN_EXTEND:
   22094   case ISD::ZERO_EXTEND:
   22095   case ISD::ANY_EXTEND:
   22096     Promote = true;
   22097     break;
   22098   case ISD::SHL:
   22099   case ISD::SRL: {
   22100     SDValue N0 = Op.getOperand(0);
   22101     // Look out for (store (shl (load), x)).
   22102     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
   22103       return false;
   22104     Promote = true;
   22105     break;
   22106   }
   22107   case ISD::ADD:
   22108   case ISD::MUL:
   22109   case ISD::AND:
   22110   case ISD::OR:
   22111   case ISD::XOR:
   22112     Commute = true;
   22113     // fallthrough
   22114   case ISD::SUB: {
   22115     SDValue N0 = Op.getOperand(0);
   22116     SDValue N1 = Op.getOperand(1);
   22117     if (!Commute && MayFoldLoad(N1))
   22118       return false;
   22119     // Avoid disabling potential load folding opportunities.
   22120     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
   22121       return false;
   22122     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
   22123       return false;
   22124     Promote = true;
   22125   }
   22126   }
   22127 
   22128   PVT = MVT::i32;
   22129   return Promote;
   22130 }
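         // e.g. an i16 add reported as promotable here is rebuilt by the DAG
         // combiner roughly as (trunc (i32 add (ext x), (ext y))), trading a
         // cheap truncate for the shorter, faster i32 encodings.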
   22131 
   22132 //===----------------------------------------------------------------------===//
   22133 //                           X86 Inline Assembly Support
   22134 //===----------------------------------------------------------------------===//
   22135 
   22136 namespace {
   22137   // Helper to match a string separated by whitespace.
   22138   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
   22139     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
   22140 
   22141     for (unsigned i = 0, e = args.size(); i != e; ++i) {
   22142       StringRef piece(*args[i]);
   22143       if (!s.startswith(piece)) // Check if the piece matches.
   22144         return false;
   22145 
   22146       s = s.substr(piece.size());
   22147       StringRef::size_type pos = s.find_first_not_of(" \t");
   22148       if (pos == 0) // The piece matched only a prefix of a longer token.
   22149         return false;
   22150 
   22151       s = s.substr(pos);
   22152     }
   22153 
   22154     return s.empty();
   22155   }
   22156   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
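           // A sketch of the matcher's behavior (inputs hypothetical):
           //   matchAsm("  bswap   $0", "bswap", "$0") --> true
           //   matchAsm("bswapper $0", "bswap", "$0")  --> false (prefix-only
           //   match of the first token is rejected).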
   22157 }
   22158 
   22159 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
   22160 
   22161   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
   22162     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
   22163         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
   22164         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
   22165 
   22166       if (AsmPieces.size() == 3)
   22167         return true;
   22168       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
   22169         return true;
   22170     }
   22171   }
   22172   return false;
   22173 }
   22174 
   22175 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   22176   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
   22177 
   22178   std::string AsmStr = IA->getAsmString();
   22179 
   22180   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
   22181   if (!Ty || Ty->getBitWidth() % 16 != 0)
   22182     return false;
   22183 
   22184   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
   22185   SmallVector<StringRef, 4> AsmPieces;
   22186   SplitString(AsmStr, AsmPieces, ";\n");
   22187 
   22188   switch (AsmPieces.size()) {
   22189   default: return false;
   22190   case 1:
   22191     // FIXME: this should verify that we are targeting a 486 or better.  If not,
   22192     // we will turn this bswap into something that will be lowered to logical
   22193     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
   22194     // lower so don't worry about this.
   22195     // bswap $0
   22196     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
   22197         matchAsm(AsmPieces[0], "bswapl", "$0") ||
   22198         matchAsm(AsmPieces[0], "bswapq", "$0") ||
   22199         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
   22200         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
   22201         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
   22202       // No need to check constraints, nothing other than the equivalent of
   22203       // "=r,0" would be valid here.
   22204       return IntrinsicLowering::LowerToByteSwap(CI);
   22205     }
   22206 
   22207     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
   22208     if (CI->getType()->isIntegerTy(16) &&
   22209         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   22210         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
   22211          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
   22212       AsmPieces.clear();
   22213       const std::string &ConstraintsStr = IA->getConstraintString();
   22214       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   22215       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   22216       if (clobbersFlagRegisters(AsmPieces))
   22217         return IntrinsicLowering::LowerToByteSwap(CI);
   22218     }
   22219     break;
   22220   case 3:
   22221     if (CI->getType()->isIntegerTy(32) &&
   22222         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   22223         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
   22224         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
   22225         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
   22226       AsmPieces.clear();
   22227       const std::string &ConstraintsStr = IA->getConstraintString();
   22228       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   22229       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   22230       if (clobbersFlagRegisters(AsmPieces))
   22231         return IntrinsicLowering::LowerToByteSwap(CI);
   22232     }
   22233 
   22234     if (CI->getType()->isIntegerTy(64)) {
   22235       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
   22236       if (Constraints.size() >= 2 &&
   22237           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
   22238           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
   22239         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
   22240         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
   22241             matchAsm(AsmPieces[1], "bswap", "%edx") &&
   22242             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
   22243           return IntrinsicLowering::LowerToByteSwap(CI);
   22244       }
   22245     }
   22246     break;
   22247   }
   22248   return false;
   22249 }
   22250 
   22251 /// getConstraintType - Given a constraint letter, return the type of
   22252 /// constraint it is for this target.
   22253 X86TargetLowering::ConstraintType
   22254 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
   22255   if (Constraint.size() == 1) {
   22256     switch (Constraint[0]) {
   22257     case 'R':
   22258     case 'q':
   22259     case 'Q':
   22260     case 'f':
   22261     case 't':
   22262     case 'u':
   22263     case 'y':
   22264     case 'x':
   22265     case 'Y':
   22266     case 'l':
   22267       return C_RegisterClass;
   22268     case 'a':
   22269     case 'b':
   22270     case 'c':
   22271     case 'd':
   22272     case 'S':
   22273     case 'D':
   22274     case 'A':
   22275       return C_Register;
   22276     case 'I':
   22277     case 'J':
   22278     case 'K':
   22279     case 'L':
   22280     case 'M':
   22281     case 'N':
   22282     case 'G':
   22283     case 'C':
   22284     case 'e':
   22285     case 'Z':
   22286       return C_Other;
   22287     default:
   22288       break;
   22289     }
   22290   }
   22291   return TargetLowering::getConstraintType(Constraint);
   22292 }
   22293 
   22294 /// Examine constraint type and operand type and determine a weight value.
   22295 /// This object must already have been set up with the operand type
   22296 /// and the current alternative constraint selected.
   22297 TargetLowering::ConstraintWeight
   22298   X86TargetLowering::getSingleConstraintMatchWeight(
   22299     AsmOperandInfo &info, const char *constraint) const {
   22300   ConstraintWeight weight = CW_Invalid;
   22301   Value *CallOperandVal = info.CallOperandVal;
   22302   // If we don't have a value, we can't do a match,
   22303   // but allow it at the lowest weight.
   22304   if (!CallOperandVal)
   22305     return CW_Default;
   22306   Type *type = CallOperandVal->getType();
   22307   // Look at the constraint type.
   22308   switch (*constraint) {
   22309   default:
   22310     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   22311   case 'R':
   22312   case 'q':
   22313   case 'Q':
   22314   case 'a':
   22315   case 'b':
   22316   case 'c':
   22317   case 'd':
   22318   case 'S':
   22319   case 'D':
   22320   case 'A':
   22321     if (CallOperandVal->getType()->isIntegerTy())
   22322       weight = CW_SpecificReg;
   22323     break;
   22324   case 'f':
   22325   case 't':
   22326   case 'u':
   22327     if (type->isFloatingPointTy())
   22328       weight = CW_SpecificReg;
   22329     break;
   22330   case 'y':
   22331     if (type->isX86_MMXTy() && Subtarget->hasMMX())
   22332       weight = CW_SpecificReg;
   22333     break;
   22334   case 'x':
   22335   case 'Y':
   22336     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
   22337         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
   22338       weight = CW_Register;
   22339     break;
   22340   case 'I':
   22341     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
   22342       if (C->getZExtValue() <= 31)
   22343         weight = CW_Constant;
   22344     }
   22345     break;
   22346   case 'J':
   22347     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   22348       if (C->getZExtValue() <= 63)
   22349         weight = CW_Constant;
   22350     }
   22351     break;
   22352   case 'K':
   22353     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   22354       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
   22355         weight = CW_Constant;
   22356     }
   22357     break;
   22358   case 'L':
   22359     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   22360       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
   22361         weight = CW_Constant;
   22362     }
   22363     break;
   22364   case 'M':
   22365     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   22366       if (C->getZExtValue() <= 3)
   22367         weight = CW_Constant;
   22368     }
   22369     break;
   22370   case 'N':
   22371     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   22372       if (C->getZExtValue() <= 0xff)
   22373         weight = CW_Constant;
   22374     }
   22375     break;
   22376   case 'G':
   22377   case 'C':
   22378     if (dyn_cast<ConstantFP>(CallOperandVal)) {
   22379       weight = CW_Constant;
   22380     }
   22381     break;
   22382   case 'e':
   22383     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   22384       if ((C->getSExtValue() >= -0x80000000LL) &&
   22385           (C->getSExtValue() <= 0x7fffffffLL))
   22386         weight = CW_Constant;
   22387     }
   22388     break;
   22389   case 'Z':
   22390     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   22391       if (C->getZExtValue() <= 0xffffffff)
   22392         weight = CW_Constant;
   22393     }
   22394     break;
   22395   }
   22396   return weight;
   22397 }
   22398 
   22399 /// LowerXConstraint - try to replace an X constraint, which matches anything,
   22400 /// with another that has more specific requirements based on the type of the
   22401 /// corresponding operand.
   22402 const char *X86TargetLowering::
   22403 LowerXConstraint(EVT ConstraintVT) const {
   22404   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   22405   // 'f' like normal targets.
   22406   if (ConstraintVT.isFloatingPoint()) {
   22407     if (Subtarget->hasSSE2())
   22408       return "Y";
   22409     if (Subtarget->hasSSE1())
   22410       return "x";
   22411   }
   22412 
   22413   return TargetLowering::LowerXConstraint(ConstraintVT);
   22414 }
   22415 
   22416 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
   22417 /// vector.  If it is invalid, don't add anything to Ops.
   22418 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   22419                                                      std::string &Constraint,
   22420                                                      std::vector<SDValue>&Ops,
   22421                                                      SelectionDAG &DAG) const {
   22422   SDValue Result;
   22423 
   22424   // Only support length 1 constraints for now.
   22425   if (Constraint.length() > 1) return;
   22426 
   22427   char ConstraintLetter = Constraint[0];
   22428   switch (ConstraintLetter) {
   22429   default: break;
   22430   case 'I':
   22431     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   22432       if (C->getZExtValue() <= 31) {
   22433         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   22434         break;
   22435       }
   22436     }
   22437     return;
   22438   case 'J':
   22439     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   22440       if (C->getZExtValue() <= 63) {
   22441         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   22442         break;
   22443       }
   22444     }
   22445     return;
   22446   case 'K':
   22447     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   22448       if (isInt<8>(C->getSExtValue())) {
   22449         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   22450         break;
   22451       }
   22452     }
   22453     return;
   22454   case 'N':
   22455     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   22456       if (C->getZExtValue() <= 255) {
   22457         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   22458         break;
   22459       }
   22460     }
   22461     return;
   22462   case 'e': {
   22463     // 32-bit signed value
   22464     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   22465       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   22466                                            C->getSExtValue())) {
   22467         // Widen to 64 bits here to get it sign extended.
   22468         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
   22469         break;
   22470       }
   22471     // FIXME gcc accepts some relocatable values here too, but only in certain
   22472     // memory models; it's complicated.
   22473     }
   22474     return;
   22475   }
   22476   case 'Z': {
   22477     // 32-bit unsigned value
   22478     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   22479       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   22480                                            C->getZExtValue())) {
   22481         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   22482         break;
   22483       }
   22484     }
   22485     // FIXME gcc accepts some relocatable values here too, but only in certain
   22486     // memory models; it's complicated.
   22487     return;
   22488   }
   22489   case 'i': {
   22490     // Literal immediates are always ok.
   22491     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
   22492       // Widen to 64 bits here to get it sign extended.
   22493       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
   22494       break;
   22495     }
   22496 
   22497     // In any sort of PIC mode addresses need to be computed at runtime by
   22498     // adding in a register or some sort of table lookup.  These can't
   22499     // be used as immediates.
   22500     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
   22501       return;
   22502 
   22503     // If we are in non-pic codegen mode, we allow the address of a global (with
   22504     // an optional displacement) to be used with 'i'.
   22505     GlobalAddressSDNode *GA = nullptr;
   22506     int64_t Offset = 0;
   22507 
   22508     // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle; reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(
            Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
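
// Illustrative usage sketch (not part of the original source): the immediate
// constraints handled above mirror GCC's x86 inline-asm constraint letters,
// so a hypothetical C caller could look like:
//
//   unsigned long x = 1;
//   asm ("shlq %1, %0" : "+r" (x) : "J" (3));            // 'J': 0..63, ok
//   asm ("outb %0, %1" : : "a" ((unsigned char)0xff),
//                          "N" (0x80));                  // 'N': 0..255 port
//
// If an operand fails its range check (say "J" (100)), the cases above
// return without producing a Result, and the generic inline-asm lowering
// later rejects it with an "invalid operand for inline asm constraint"
// style error.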

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
      // TODO: Slight differences here in allocation order and leaving
      // RIP in the class. Do they matter any more here than they do
      // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':  // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
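      // AVX-512 types.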
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map "st(0)" through "st(7)" to the corresponding ST0 through ST7
    // physical registers.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means the EAX:EDX register pair.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
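  // For example, "{ax}" paired with an i32 value is rewritten below to EAX
  // in GR32.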
  if (Res.second == &X86::GR16RegClass) {
    if (VT == MVT::i8 || VT == MVT::i1) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR8RegClass;
      }
    } else if (VT == MVT::i32 || VT == MVT::f32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR32RegClass;
      }
    } else if (VT == MVT::i64 || VT == MVT::f64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR64RegClass;
      }
    }
  } else if (Res.second == &X86::FR32RegClass ||
             Res.second == &X86::FR64RegClass ||
             Res.second == &X86::VR128RegClass ||
             Res.second == &X86::VR256RegClass ||
             Res.second == &X86::FR32XRegClass ||
             Res.second == &X86::FR64XRegClass ||
             Res.second == &X86::VR128XRegClass ||
             Res.second == &X86::VR256XRegClass ||
             Res.second == &X86::VR512RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target-independent register mapper will just pick the first match it
    // can find, ignoring the required type.

    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
  }

  return Res;
}
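
// Illustrative examples (not part of the original source) of how the
// mappings above behave, assuming a 64-bit subtarget:
//   ("r", MVT::i32)       -> (0, &X86::GR32RegClass)        any 32-bit GPR
//   ("Q", MVT::i8)        -> (0, &X86::GR8_ABCD_LRegClass)  AL/BL/CL/DL only
//   ("{st(3)}", MVT::f80) -> (X86::ST3, &X86::RFP80RegClass)
//   ("{ax}", MVT::i32)    -> (X86::EAX, &X86::GR32RegClass) via the remapping
// A register number of 0 means "no specific register; allocate any member of
// the class".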

int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
                                            Type *Ty) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // takes 2 allocations in the out-of-order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // requires just one, freeing an allocation for other operations and
  // leaving fewer micro-operations to execute.
  //
  // For some X86 architectures this is even worse because, for instance,
  // for stores the complex addressing mode forces the instruction to use
  // the "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(AM, Ty))
    // AM.Scale represents reg2 * scale; charge a cost of 1 as soon as a
    // second (index) register is used.
    return AM.Scale != 0;
  return -1;
}
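
// Illustrative note (not part of the original source): callers interpret the
// result of getScalingFactorCost as
//   < 0  the addressing mode is not legal for the type,
//     0  legal and free (no scaled index register),
//     1  legal but consumes an extra resource for the scaled index,
// so an AddrMode with HasBaseReg = true and Scale = 2 yields 1 here, while
// the same mode with Scale = 0 yields 0.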
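
// isTargetFTOL - Return true if the target uses the MSVC _ftol2 runtime
// routine for fptoui; per the check below, this only holds for 32-bit MSVC
// environments.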
bool X86TargetLowering::isTargetFTOL() const {
  return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
}
   22817