      1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #define DEBUG_TYPE "x86-isel"
     16 #include "X86ISelLowering.h"
     17 #include "X86.h"
     18 #include "X86InstrBuilder.h"
     19 #include "X86TargetMachine.h"
     20 #include "X86TargetObjectFile.h"
     21 #include "Utils/X86ShuffleDecode.h"
     22 #include "llvm/CallingConv.h"
     23 #include "llvm/Constants.h"
     24 #include "llvm/DerivedTypes.h"
     25 #include "llvm/GlobalAlias.h"
     26 #include "llvm/GlobalVariable.h"
     27 #include "llvm/Function.h"
     28 #include "llvm/Instructions.h"
     29 #include "llvm/Intrinsics.h"
     30 #include "llvm/LLVMContext.h"
     31 #include "llvm/CodeGen/IntrinsicLowering.h"
     32 #include "llvm/CodeGen/MachineFrameInfo.h"
     33 #include "llvm/CodeGen/MachineFunction.h"
     34 #include "llvm/CodeGen/MachineInstrBuilder.h"
     35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     36 #include "llvm/CodeGen/MachineModuleInfo.h"
     37 #include "llvm/CodeGen/MachineRegisterInfo.h"
     38 #include "llvm/MC/MCAsmInfo.h"
     39 #include "llvm/MC/MCContext.h"
     40 #include "llvm/MC/MCExpr.h"
     41 #include "llvm/MC/MCSymbol.h"
     42 #include "llvm/ADT/SmallSet.h"
     43 #include "llvm/ADT/Statistic.h"
     44 #include "llvm/ADT/StringExtras.h"
     45 #include "llvm/ADT/VariadicFunction.h"
     46 #include "llvm/Support/CallSite.h"
     47 #include "llvm/Support/Debug.h"
     48 #include "llvm/Support/ErrorHandling.h"
     49 #include "llvm/Support/MathExtras.h"
     50 #include "llvm/Target/TargetOptions.h"
     51 #include <bitset>
     52 #include <cctype>
     53 using namespace llvm;
     54 
     55 STATISTIC(NumTailCalls, "Number of tail calls");
     56 
     57 // Forward declarations.
     58 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
     59                        SDValue V2);
     60 
      61 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
     62 /// sets things up to match to an AVX VEXTRACTF128 instruction or a
     63 /// simple subregister reference.  Idx is an index in the 128 bits we
      64 /// want.  It need not be aligned to a 128-bit boundary.  That makes
     65 /// lowering EXTRACT_VECTOR_ELT operations easier.
     66 static SDValue Extract128BitVector(SDValue Vec,
     67                                    SDValue Idx,
     68                                    SelectionDAG &DAG,
     69                                    DebugLoc dl) {
     70   EVT VT = Vec.getValueType();
     71   assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
     72   EVT ElVT = VT.getVectorElementType();
     73   int Factor = VT.getSizeInBits()/128;
     74   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
     75                                   VT.getVectorNumElements()/Factor);
     76 
     77   // Extract from UNDEF is UNDEF.
     78   if (Vec.getOpcode() == ISD::UNDEF)
     79     return DAG.getNode(ISD::UNDEF, dl, ResultVT);
     80 
     81   if (isa<ConstantSDNode>(Idx)) {
     82     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
     83 
     84     // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
      85     // that we can match to VEXTRACTF128.
     86     unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
     87 
     88     // This is the index of the first element of the 128-bit chunk
     89     // we want.
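             // For example, for a v8i32 source with IdxVal == 5, ElemsPerChunk is 4,
             // so NormalizedIdxVal becomes 4: the chunk containing element 5 starts
             // at element 4 (the upper 128-bit half).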
     90     unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
     91                                  * ElemsPerChunk);
     92 
     93     SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
     94     SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
     95                                  VecIdx);
     96 
     97     return Result;
     98   }
     99 
    100   return SDValue();
    101 }
    102 
     103 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
    104 /// sets things up to match to an AVX VINSERTF128 instruction or a
    105 /// simple superregister reference.  Idx is an index in the 128 bits
     106 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
    107 /// lowering INSERT_VECTOR_ELT operations easier.
    108 static SDValue Insert128BitVector(SDValue Result,
    109                                   SDValue Vec,
    110                                   SDValue Idx,
    111                                   SelectionDAG &DAG,
    112                                   DebugLoc dl) {
    113   if (isa<ConstantSDNode>(Idx)) {
    114     EVT VT = Vec.getValueType();
    115     assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
    116 
    117     EVT ElVT = VT.getVectorElementType();
    118     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    119     EVT ResultVT = Result.getValueType();
    120 
    121     // Insert the relevant 128 bits.
    122     unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
    123 
    124     // This is the index of the first element of the 128-bit chunk
    125     // we want.
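             // For example, inserting a v4i32 into a v8i32 with IdxVal == 6
             // normalizes to element 4, the start of the containing 128-bit chunk.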
    126     unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
    127                                  * ElemsPerChunk);
    128 
    129     SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    130     Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
    131                          VecIdx);
    132     return Result;
    133   }
    134 
    135   return SDValue();
    136 }
    137 
    138 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
    139   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
    140   bool is64Bit = Subtarget->is64Bit();
    141 
    142   if (Subtarget->isTargetEnvMacho()) {
    143     if (is64Bit)
    144       return new X8664_MachoTargetObjectFile();
    145     return new TargetLoweringObjectFileMachO();
    146   }
    147 
    148   if (Subtarget->isTargetELF())
    149     return new TargetLoweringObjectFileELF();
    150   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    151     return new TargetLoweringObjectFileCOFF();
    152   llvm_unreachable("unknown subtarget type");
    153 }
    154 
    155 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    156   : TargetLowering(TM, createTLOF(TM)) {
    157   Subtarget = &TM.getSubtarget<X86Subtarget>();
    158   X86ScalarSSEf64 = Subtarget->hasSSE2();
    159   X86ScalarSSEf32 = Subtarget->hasSSE1();
    160   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
    161 
    162   RegInfo = TM.getRegisterInfo();
    163   TD = getTargetData();
    164 
    165   // Set up the TargetLowering object.
    166   static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
    167 
     168   // X86 is weird; it always uses i8 for shift amounts and setcc results.
    169   setBooleanContents(ZeroOrOneBooleanContent);
    170   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    171   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
    172 
     173   // For 64-bit, since we have so many registers, use the ILP scheduler; for
     174   // 32-bit code, use the register-pressure-specific scheduling.
     175   // For 32-bit Atom, use Hybrid (register pressure + latency) scheduling.
    176   if (Subtarget->is64Bit())
    177     setSchedulingPreference(Sched::ILP);
    178   else if (Subtarget->isAtom())
    179     setSchedulingPreference(Sched::Hybrid);
    180   else
    181     setSchedulingPreference(Sched::RegPressure);
    182   setStackPointerRegisterToSaveRestore(X86StackPtr);
    183 
    184   if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    185     // Setup Windows compiler runtime calls.
    186     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    187     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    188     setLibcallName(RTLIB::SREM_I64, "_allrem");
    189     setLibcallName(RTLIB::UREM_I64, "_aullrem");
    190     setLibcallName(RTLIB::MUL_I64, "_allmul");
    191     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    192     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    193     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    194     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    195     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    196 
    197     // The _ftol2 runtime function has an unusual calling conv, which
    198     // is modeled by a special pseudo-instruction.
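             // (_ftol2 takes its argument in ST(0) and returns the 64-bit result in
             // EDX:EAX rather than following the normal C calling convention.)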
    199     setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    200     setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    201     setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    202     setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
    203   }
    204 
    205   if (Subtarget->isTargetDarwin()) {
    206     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    207     setUseUnderscoreSetJmp(false);
    208     setUseUnderscoreLongJmp(false);
    209   } else if (Subtarget->isTargetMingw()) {
     210     // MS runtime is weird: it exports _setjmp, but plain longjmp!
    211     setUseUnderscoreSetJmp(true);
    212     setUseUnderscoreLongJmp(false);
    213   } else {
    214     setUseUnderscoreSetJmp(true);
    215     setUseUnderscoreLongJmp(true);
    216   }
    217 
    218   // Set up the register classes.
    219   addRegisterClass(MVT::i8, X86::GR8RegisterClass);
    220   addRegisterClass(MVT::i16, X86::GR16RegisterClass);
    221   addRegisterClass(MVT::i32, X86::GR32RegisterClass);
    222   if (Subtarget->is64Bit())
    223     addRegisterClass(MVT::i64, X86::GR64RegisterClass);
    224 
    225   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
    226 
    227   // We don't accept any truncstore of integer registers.
    228   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    229   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    230   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    231   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    232   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
    233   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
    234 
    235   // SETOEQ and SETUNE require checking two conditions.
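           // (ucomiss/ucomisd set ZF both for "equal" and for "unordered", so an
           // ordered-equal test must check that ZF is set and PF is clear.)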
    236   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
    237   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
    238   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
    239   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
    240   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
    241   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
    242 
    243   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    244   // operation.
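           // For example, a u8 -> f32 convert is emitted as a zero-extend to i32
           // followed by an ordinary signed i32 -> f32 convert.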
    245   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
    246   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
    247   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
    248 
    249   if (Subtarget->is64Bit()) {
    250     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    251     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    252   } else if (!TM.Options.UseSoftFloat) {
    253     // We have an algorithm for SSE2->double, and we turn this into a
    254     // 64-bit FILD followed by conditional FADD for other targets.
    255     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    256     // We have an algorithm for SSE2, and we turn this into a 64-bit
    257     // FILD for other targets.
    258     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    259   }
    260 
    261   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    262   // this operation.
    263   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    264   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
    265 
    266   if (!TM.Options.UseSoftFloat) {
    267     // SSE has no i16 to fp conversion, only i32
    268     if (X86ScalarSSEf32) {
    269       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    270       // f32 and f64 cases are Legal, f80 case is not
    271       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    272     } else {
    273       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    274       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    275     }
    276   } else {
    277     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    278     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
    279   }
    280 
    281   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    282   // are Legal, f80 is custom lowered.
    283   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    284   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    285 
     286   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
    287   // this operation.
    288   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
    289   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
    290 
    291   if (X86ScalarSSEf32) {
    292     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    293     // f32 and f64 cases are Legal, f80 case is not
    294     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    295   } else {
    296     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    297     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    298   }
    299 
    300   // Handle FP_TO_UINT by promoting the destination to a larger signed
    301   // conversion.
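           // For example, an f32 -> u16 convert is performed as f32 -> s32, and only
           // the low 16 bits of the result are used.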
    302   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
    303   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
    304   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
    305 
    306   if (Subtarget->is64Bit()) {
    307     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    308     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
    309   } else if (!TM.Options.UseSoftFloat) {
    310     // Since AVX is a superset of SSE3, only check for SSE here.
    311     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
    312       // Expand FP_TO_UINT into a select.
    313       // FIXME: We would like to use a Custom expander here eventually to do
    314       // the optimal thing for SSE vs. the default expansion in the legalizer.
    315       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    316     else
    317       // With SSE3 we can use fisttpll to convert to a signed i64; without
    318       // SSE, we're stuck with a fistpll.
    319       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    320   }
    321 
    322   if (isTargetFTOL()) {
    323     // Use the _ftol2 runtime function, which has a pseudo-instruction
    324     // to handle its weird calling convention.
    325     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
    326   }
    327 
    328   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
    329   if (!X86ScalarSSEf64) {
    330     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    331     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    332     if (Subtarget->is64Bit()) {
    333       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
    334       // Without SSE, i64->f64 goes through memory.
    335       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    336     }
    337   }
    338 
    339   // Scalar integer divide and remainder are lowered to use operations that
    340   // produce two results, to match the available instructions. This exposes
    341   // the two-result form to trivial CSE, which is able to combine x/y and x%y
    342   // into a single instruction.
    343   //
    344   // Scalar integer multiply-high is also lowered to use two-result
    345   // operations, to match the available instructions. However, plain multiply
    346   // (low) operations are left as Legal, as there are single-result
    347   // instructions for this in x86. Using the two-result multiply instructions
    348   // when both high and low results are needed must be arranged by dagcombine.
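           // For example, both results of "q = x / y; r = x % y;" come from a single
           // DIV/IDIV (the 32-bit form leaves the quotient in EAX and the remainder
           // in EDX), so CSE merges the two-result nodes.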
    349   for (unsigned i = 0, e = 4; i != e; ++i) {
    350     MVT VT = IntVTs[i];
    351     setOperationAction(ISD::MULHS, VT, Expand);
    352     setOperationAction(ISD::MULHU, VT, Expand);
    353     setOperationAction(ISD::SDIV, VT, Expand);
    354     setOperationAction(ISD::UDIV, VT, Expand);
    355     setOperationAction(ISD::SREM, VT, Expand);
    356     setOperationAction(ISD::UREM, VT, Expand);
    357 
     358     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    359     setOperationAction(ISD::ADDC, VT, Custom);
    360     setOperationAction(ISD::ADDE, VT, Custom);
    361     setOperationAction(ISD::SUBC, VT, Custom);
    362     setOperationAction(ISD::SUBE, VT, Custom);
    363   }
    364 
    365   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
    366   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
    367   setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
    368   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
    369   if (Subtarget->is64Bit())
    370     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    371   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
    372   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
    373   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
    374   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
    375   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
    376   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
    377   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    378   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
    379 
     380   // Promote the i8 variants and force them up to i32, which has a shorter
    381   // encoding.
    382   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
    383   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
    384   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
    385   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
    386   if (Subtarget->hasBMI()) {
    387     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    388     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    389     if (Subtarget->is64Bit())
    390       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
    391   } else {
    392     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    393     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    394     if (Subtarget->is64Bit())
    395       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    396   }
    397 
    398   if (Subtarget->hasLZCNT()) {
    399     // When promoting the i8 variants, force them to i32 for a shorter
    400     // encoding.
    401     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    402     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    403     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    404     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    405     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    406     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    407     if (Subtarget->is64Bit())
    408       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
    409   } else {
    410     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    411     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    412     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    413     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    414     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    415     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    416     if (Subtarget->is64Bit()) {
    417       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
    418       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    419     }
    420   }
    421 
    422   if (Subtarget->hasPOPCNT()) {
    423     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    424   } else {
    425     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    426     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    427     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    428     if (Subtarget->is64Bit())
    429       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    430   }
    431 
    432   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    433   setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
    434 
    435   // These should be promoted to a larger select which is supported.
    436   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
    437   // X86 wants to expand cmov itself.
    438   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
    439   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
    440   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
    441   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
    442   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
    443   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
    444   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
    445   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
    446   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
    447   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
    448   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
    449   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
    450   if (Subtarget->is64Bit()) {
    451     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    452     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
    453   }
    454   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
    455 
    456   // Darwin ABI issue.
    457   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
    458   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
    459   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
    460   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
    461   if (Subtarget->is64Bit())
    462     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
    463   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
    464   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
    465   if (Subtarget->is64Bit()) {
    466     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    467     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    468     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    469     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    470     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
    471   }
     472   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
    473   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
    474   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
    475   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
    476   if (Subtarget->is64Bit()) {
    477     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    478     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    479     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
    480   }
    481 
    482   if (Subtarget->hasSSE1())
    483     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
    484 
    485   setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
    486   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
    487 
    488   // On X86 and X86-64, atomic operations are lowered to locked instructions.
    489   // Locked instructions, in turn, have implicit fence semantics (all memory
    490   // operations are flushed before issuing the locked instruction, and they
    491   // are not buffered), so we can fold away the common pattern of
    492   // fence-atomic-fence.
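           // For example, a seq_cst atomicrmw add needs no extra MFENCE around the
           // LOCK-prefixed ADD/XADD it becomes; the locked instruction already
           // orders all memory operations.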
    493   setShouldFoldAtomicFences(true);
    494 
    495   // Expand certain atomics
    496   for (unsigned i = 0, e = 4; i != e; ++i) {
    497     MVT VT = IntVTs[i];
    498     setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    499     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    500     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    501   }
    502 
    503   if (!Subtarget->is64Bit()) {
    504     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    505     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    506     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    507     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    508     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    509     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    510     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    511     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    512   }
    513 
    514   if (Subtarget->hasCmpxchg16b()) {
    515     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
    516   }
    517 
    518   // FIXME - use subtarget debug flags
    519   if (!Subtarget->isTargetDarwin() &&
    520       !Subtarget->isTargetELF() &&
    521       !Subtarget->isTargetCygMing()) {
    522     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
    523   }
    524 
    525   setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
    526   setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
    527   setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
    528   setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
    529   if (Subtarget->is64Bit()) {
    530     setExceptionPointerRegister(X86::RAX);
    531     setExceptionSelectorRegister(X86::RDX);
    532   } else {
    533     setExceptionPointerRegister(X86::EAX);
    534     setExceptionSelectorRegister(X86::EDX);
    535   }
    536   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
    537   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
    538 
    539   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
    540   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
    541 
    542   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    543 
    544   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    545   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    546   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    547   if (Subtarget->is64Bit()) {
    548     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    549     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
    550   } else {
    551     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    552     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
    553   }
    554 
    555   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    556   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
    557 
    558   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    559     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    560                        MVT::i64 : MVT::i32, Custom);
    561   else if (TM.Options.EnableSegmentedStacks)
    562     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    563                        MVT::i64 : MVT::i32, Custom);
    564   else
    565     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    566                        MVT::i64 : MVT::i32, Expand);
    567 
    568   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    569     // f32 and f64 use SSE.
    570     // Set up the FP register classes.
    571     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    572     addRegisterClass(MVT::f64, X86::FR64RegisterClass);
    573 
    574     // Use ANDPD to simulate FABS.
    575     setOperationAction(ISD::FABS , MVT::f64, Custom);
    576     setOperationAction(ISD::FABS , MVT::f32, Custom);
    577 
    578     // Use XORP to simulate FNEG.
    579     setOperationAction(ISD::FNEG , MVT::f64, Custom);
    580     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    581 
    582     // Use ANDPD and ORPD to simulate FCOPYSIGN.
    583     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    584     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    585 
    586     // Lower this to FGETSIGNx86 plus an AND.
    587     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    588     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    589 
    590     // We don't support sin/cos/fmod
    591     setOperationAction(ISD::FSIN , MVT::f64, Expand);
    592     setOperationAction(ISD::FCOS , MVT::f64, Expand);
    593     setOperationAction(ISD::FSIN , MVT::f32, Expand);
    594     setOperationAction(ISD::FCOS , MVT::f32, Expand);
    595 
    596     // Expand FP immediates into loads from the stack, except for the special
    597     // cases we handle.
    598     addLegalFPImmediate(APFloat(+0.0)); // xorpd
    599     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    600   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    601     // Use SSE for f32, x87 for f64.
    602     // Set up the FP register classes.
    603     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    604     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    605 
    606     // Use ANDPS to simulate FABS.
    607     setOperationAction(ISD::FABS , MVT::f32, Custom);
    608 
    609     // Use XORP to simulate FNEG.
    610     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    611 
    612     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    613 
    614     // Use ANDPS and ORPS to simulate FCOPYSIGN.
    615     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    616     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    617 
    618     // We don't support sin/cos/fmod
    619     setOperationAction(ISD::FSIN , MVT::f32, Expand);
    620     setOperationAction(ISD::FCOS , MVT::f32, Expand);
    621 
    622     // Special cases we handle for FP constants.
    623     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    624     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    625     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    626     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    627     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    628 
    629     if (!TM.Options.UnsafeFPMath) {
    630       setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
    631       setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
    632     }
    633   } else if (!TM.Options.UseSoftFloat) {
    634     // f32 and f64 in x87.
    635     // Set up the FP register classes.
    636     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    637     addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
    638 
    639     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    640     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    641     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    642     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    643 
    644     if (!TM.Options.UnsafeFPMath) {
    645       setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
    646       setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
    647     }
    648     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    649     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    650     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    651     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    652     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    653     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    654     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    655     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    656   }
    657 
    658   // We don't support FMA.
    659   setOperationAction(ISD::FMA, MVT::f64, Expand);
    660   setOperationAction(ISD::FMA, MVT::f32, Expand);
    661 
    662   // Long double always uses X87.
    663   if (!TM.Options.UseSoftFloat) {
    664     addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    665     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    666     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    667     {
    668       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    669       addLegalFPImmediate(TmpFlt);  // FLD0
    670       TmpFlt.changeSign();
    671       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
    672 
    673       bool ignored;
    674       APFloat TmpFlt2(+1.0);
    675       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
    676                       &ignored);
    677       addLegalFPImmediate(TmpFlt2);  // FLD1
    678       TmpFlt2.changeSign();
    679       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    680     }
    681 
    682     if (!TM.Options.UnsafeFPMath) {
    683       setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
    684       setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
    685     }
    686 
    687     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    688     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    689     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    690     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    691     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    692     setOperationAction(ISD::FMA, MVT::f80, Expand);
    693   }
    694 
    695   // Always use a library call for pow.
    696   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
    697   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    698   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
    699 
    700   setOperationAction(ISD::FLOG, MVT::f80, Expand);
    701   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
    702   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    703   setOperationAction(ISD::FEXP, MVT::f80, Expand);
    704   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
    705 
    706   // First set operation action for all vector types to either promote
    707   // (for widening) or expand (for scalarization). Then we will selectively
    708   // turn on ones that can be effectively codegen'd.
    709   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
    710        VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    711     setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    712     setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    713     setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    714     setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    715     setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    716     setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    717     setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    718     setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    719     setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    720     setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    721     setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    722     setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    723     setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    724     setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    725     setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    726     setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    727     setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    728     setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    729     setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    730     setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    731     setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    732     setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    733     setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    734     setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    735     setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    736     setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    737     setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    738     setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    739     setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    740     setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    741     setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    742     setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    743     setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
    744     setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    745     setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
    746     setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    747     setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    748     setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    749     setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    750     setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    751     setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    752     setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
    753     setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    754     setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    755     setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    756     setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    757     setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    758     setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    759     setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    760     setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    761     setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    762     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
    763     setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
    764     setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    765     setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    766     setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    767     setOperationAction(ISD::VSELECT,  (MVT::SimpleValueType)VT, Expand);
    768     for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
    769          InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
    770       setTruncStoreAction((MVT::SimpleValueType)VT,
    771                           (MVT::SimpleValueType)InnerVT, Expand);
    772     setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    773     setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    774     setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
    775   }
    776 
    777   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
    778   // with -msoft-float, disable use of MMX as well.
    779   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    780     addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
     781     // No operations on x86mmx are supported; everything uses intrinsics.
    782   }
    783 
    784   // MMX-sized vectors (other than x86mmx) are expected to be expanded
    785   // into smaller operations.
    786   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
    787   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
    788   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
    789   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
    790   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
    791   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
    792   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
    793   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
    794   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
    795   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
    796   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
    797   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
    798   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
    799   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
    800   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
    801   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
    802   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
    803   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
    804   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
    805   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
    806   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
    807   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
    808   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
    809   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
    810   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
    811   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
    812   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
    813   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
    814   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
    815 
    816   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    817     addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
    818 
    819     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    820     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    821     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    822     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    823     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    824     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    825     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    826     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    827     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    828     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    829     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    830     setOperationAction(ISD::SETCC,              MVT::v4f32, Custom);
    831   }
    832 
    833   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    834     addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
    835 
     836     // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    837     // registers cannot be used even for integer operations.
    838     addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    839     addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    840     addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    841     addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
    842 
    843     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    844     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    845     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    846     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    847     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    848     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    849     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    850     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    851     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    852     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    853     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    854     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    855     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    856     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    857     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    858     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    859 
    860     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    861     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    862     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    863     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
    864 
    865     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    866     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    867     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    868     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    869     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    870 
    871     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
    872     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
    873     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
    874     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
    875     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
    876 
    877     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    878     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
    879       EVT VT = (MVT::SimpleValueType)i;
    880       // Do not attempt to custom lower non-power-of-2 vectors
    881       if (!isPowerOf2_32(VT.getVectorNumElements()))
    882         continue;
    883       // Do not attempt to custom lower non-128-bit vectors
    884       if (!VT.is128BitVector())
    885         continue;
    886       setOperationAction(ISD::BUILD_VECTOR,
    887                          VT.getSimpleVT().SimpleTy, Custom);
    888       setOperationAction(ISD::VECTOR_SHUFFLE,
    889                          VT.getSimpleVT().SimpleTy, Custom);
    890       setOperationAction(ISD::EXTRACT_VECTOR_ELT,
    891                          VT.getSimpleVT().SimpleTy, Custom);
    892     }
    893 
    894     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    895     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    896     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    897     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    898     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    899     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
    900 
    901     if (Subtarget->is64Bit()) {
    902       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
    903       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    904     }
    905 
    906     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
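             // (These operations only care about the raw bit pattern, so one set of
             // v2i64 patterns covers every 128-bit integer vector type.)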
    907     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
    908       MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
    909       EVT VT = SVT;
    910 
    911       // Do not attempt to promote non-128-bit vectors
    912       if (!VT.is128BitVector())
    913         continue;
    914 
    915       setOperationAction(ISD::AND,    SVT, Promote);
    916       AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
    917       setOperationAction(ISD::OR,     SVT, Promote);
    918       AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
    919       setOperationAction(ISD::XOR,    SVT, Promote);
    920       AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
    921       setOperationAction(ISD::LOAD,   SVT, Promote);
    922       AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
    923       setOperationAction(ISD::SELECT, SVT, Promote);
    924       AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    925     }
    926 
    927     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    928 
    929     // Custom lower v2i64 and v2f64 selects.
    930     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    931     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    932     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    933     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
    934 
    935     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    936     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    937   }
    938 
    939   if (Subtarget->hasSSE41()) {
    940     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
    941     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
    942     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
    943     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
    944     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
    945     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
    946     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
    947     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
    948     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
    949     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
    950 
    951     // FIXME: Do we need to handle scalar-to-vector here?
    952     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
    953 
    954     setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
    955     setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
    956     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
    957     setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
    958     setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
    959 
     960     // i8 and i16 vectors are custom, because the source register and source
     961     // memory operand types are not the same width.  f32 vectors are
    962     // custom since the immediate controlling the insert encodes additional
    963     // information.
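             // (For instance, PINSRW reads a full 32-bit GPR but only a 16-bit
             // memory operand.)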
    964     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    965     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    966     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    967     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    968 
    969     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    970     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    971     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    972     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    973 
     974     // FIXME: these should be Legal but that's only for the case where
    975     // the index is constant.  For now custom expand to deal with that.
    976     if (Subtarget->is64Bit()) {
    977       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
    978       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    979     }
    980   }
    981 
    982   if (Subtarget->hasSSE2()) {
    983     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
    984     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
    985 
    986     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
    987     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
    988 
    989     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
    990     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
    991 
    992     if (Subtarget->hasAVX2()) {
    993       setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
    994       setOperationAction(ISD::SRL,             MVT::v4i32, Legal);
    995 
    996       setOperationAction(ISD::SHL,             MVT::v2i64, Legal);
    997       setOperationAction(ISD::SHL,             MVT::v4i32, Legal);
    998 
    999       setOperationAction(ISD::SRA,             MVT::v4i32, Legal);
   1000     } else {
   1001       setOperationAction(ISD::SRL,             MVT::v2i64, Custom);
   1002       setOperationAction(ISD::SRL,             MVT::v4i32, Custom);
   1003 
   1004       setOperationAction(ISD::SHL,             MVT::v2i64, Custom);
   1005       setOperationAction(ISD::SHL,             MVT::v4i32, Custom);
   1006 
   1007       setOperationAction(ISD::SRA,             MVT::v4i32, Custom);
   1008     }
   1009   }
   1010 
   1011   if (Subtarget->hasSSE42())
   1012     setOperationAction(ISD::SETCC,             MVT::v2i64, Custom);
   1013 
   1014   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
   1015     addRegisterClass(MVT::v32i8,  X86::VR256RegisterClass);
   1016     addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
   1017     addRegisterClass(MVT::v8i32,  X86::VR256RegisterClass);
   1018     addRegisterClass(MVT::v8f32,  X86::VR256RegisterClass);
   1019     addRegisterClass(MVT::v4i64,  X86::VR256RegisterClass);
   1020     addRegisterClass(MVT::v4f64,  X86::VR256RegisterClass);
   1021 
   1022     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
   1023     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
   1024     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
   1025 
   1026     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
   1027     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
   1028     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
   1029     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
   1030     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
   1031     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
   1032 
   1033     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
   1034     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
   1035     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
   1036     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
   1037     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
   1038     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
   1039 
   1040     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
   1041     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1042     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
   1043 
   1044     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4f64,  Custom);
   1045     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i64,  Custom);
   1046     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f32,  Custom);
   1047     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i32,  Custom);
   1048     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i8,  Custom);
   1049     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i16, Custom);
   1050 
   1051     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
   1052     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
   1053 
   1054     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
   1055     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
   1056 
   1057     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
   1058     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
   1059 
   1060     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
   1061     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
   1062     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
   1063     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
   1064 
   1065     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1066     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1067     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1068 
   1069     setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
   1070     setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
   1071     setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
   1072     setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
   1073 
   1074     if (Subtarget->hasAVX2()) {
   1075       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
   1076       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
   1077       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
   1078       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
   1079 
   1080       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
   1081       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
   1082       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
   1083       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
   1084 
   1085       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1086       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
   1087       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
   1088       // Don't lower v32i8 because there is no 128-bit byte mul
   1089 
   1090       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1091 
   1092       setOperationAction(ISD::SRL,             MVT::v4i64, Legal);
   1093       setOperationAction(ISD::SRL,             MVT::v8i32, Legal);
   1094 
   1095       setOperationAction(ISD::SHL,             MVT::v4i64, Legal);
   1096       setOperationAction(ISD::SHL,             MVT::v8i32, Legal);
   1097 
   1098       setOperationAction(ISD::SRA,             MVT::v8i32, Legal);
   1099     } else {
   1100       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
   1101       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
   1102       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
   1103       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
   1104 
   1105       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
   1106       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
   1107       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
   1108       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
   1109 
   1110       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1111       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
   1112       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
   1113       // Don't lower v32i8 because there is no 128-bit byte mul
   1114 
   1115       setOperationAction(ISD::SRL,             MVT::v4i64, Custom);
   1116       setOperationAction(ISD::SRL,             MVT::v8i32, Custom);
   1117 
   1118       setOperationAction(ISD::SHL,             MVT::v4i64, Custom);
   1119       setOperationAction(ISD::SHL,             MVT::v8i32, Custom);
   1120 
   1121       setOperationAction(ISD::SRA,             MVT::v8i32, Custom);
   1122     }
   1123 
   1124     // Custom lower several nodes for 256-bit types.
   1125     for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
   1126                   i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1127       MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
   1128       EVT VT = SVT;
   1129 
   1130       // Extract subvector is special because the value type
   1131       // (result) is 128-bit but the source is 256-bit wide.
   1132       if (VT.is128BitVector())
   1133         setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
   1134 
   1135       // Do not attempt to custom lower other non-256-bit vectors
   1136       if (!VT.is256BitVector())
   1137         continue;
   1138 
   1139       setOperationAction(ISD::BUILD_VECTOR,       SVT, Custom);
   1140       setOperationAction(ISD::VECTOR_SHUFFLE,     SVT, Custom);
   1141       setOperationAction(ISD::INSERT_VECTOR_ELT,  SVT, Custom);
   1142       setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
   1143       setOperationAction(ISD::SCALAR_TO_VECTOR,   SVT, Custom);
   1144       setOperationAction(ISD::INSERT_SUBVECTOR,   SVT, Custom);
   1145     }
   1146 
   1147     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
   1148     for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
   1149       MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
   1150       EVT VT = SVT;
   1151 
   1152       // Do not attempt to promote non-256-bit vectors
   1153       if (!VT.is256BitVector())
   1154         continue;
   1155 
   1156       setOperationAction(ISD::AND,    SVT, Promote);
   1157       AddPromotedToType (ISD::AND,    SVT, MVT::v4i64);
   1158       setOperationAction(ISD::OR,     SVT, Promote);
   1159       AddPromotedToType (ISD::OR,     SVT, MVT::v4i64);
   1160       setOperationAction(ISD::XOR,    SVT, Promote);
   1161       AddPromotedToType (ISD::XOR,    SVT, MVT::v4i64);
   1162       setOperationAction(ISD::LOAD,   SVT, Promote);
   1163       AddPromotedToType (ISD::LOAD,   SVT, MVT::v4i64);
   1164       setOperationAction(ISD::SELECT, SVT, Promote);
   1165       AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
   1166     }
   1167   }
   1168 
   1169   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
   1170   // of this type with custom code.
   1171   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
   1172          VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
   1173     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
   1174                        Custom);
   1175   }
   1176 
   1177   // We want to custom lower some of our intrinsics.
   1178   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1179 
   1180 
   1181   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1182   // handle type legalization for these operations here.
   1183   //
   1184   // FIXME: We really should do custom legalization for addition and
   1185   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1186   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1187   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
   1188     // Add/Sub/Mul with overflow operations are custom lowered.
   1189     MVT VT = IntVTs[i];
   1190     setOperationAction(ISD::SADDO, VT, Custom);
   1191     setOperationAction(ISD::UADDO, VT, Custom);
   1192     setOperationAction(ISD::SSUBO, VT, Custom);
   1193     setOperationAction(ISD::USUBO, VT, Custom);
   1194     setOperationAction(ISD::SMULO, VT, Custom);
   1195     setOperationAction(ISD::UMULO, VT, Custom);
   1196   }
   1197 
   1198   // There are no 8-bit 3-address imul/mul instructions
   1199   setOperationAction(ISD::SMULO, MVT::i8, Expand);
   1200   setOperationAction(ISD::UMULO, MVT::i8, Expand);
   1201 
   1202   if (!Subtarget->is64Bit()) {
   1203     // These libcalls are not available in 32-bit.
   1204     setLibcallName(RTLIB::SHL_I128, 0);
   1205     setLibcallName(RTLIB::SRL_I128, 0);
   1206     setLibcallName(RTLIB::SRA_I128, 0);
   1207   }
   1208 
   1209   // We have target-specific dag combine patterns for the following nodes:
   1210   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1211   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1212   setTargetDAGCombine(ISD::VSELECT);
   1213   setTargetDAGCombine(ISD::SELECT);
   1214   setTargetDAGCombine(ISD::SHL);
   1215   setTargetDAGCombine(ISD::SRA);
   1216   setTargetDAGCombine(ISD::SRL);
   1217   setTargetDAGCombine(ISD::OR);
   1218   setTargetDAGCombine(ISD::AND);
   1219   setTargetDAGCombine(ISD::ADD);
   1220   setTargetDAGCombine(ISD::FADD);
   1221   setTargetDAGCombine(ISD::FSUB);
   1222   setTargetDAGCombine(ISD::SUB);
   1223   setTargetDAGCombine(ISD::LOAD);
   1224   setTargetDAGCombine(ISD::STORE);
   1225   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1226   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1227   setTargetDAGCombine(ISD::TRUNCATE);
   1228   setTargetDAGCombine(ISD::SINT_TO_FP);
   1229   if (Subtarget->is64Bit())
   1230     setTargetDAGCombine(ISD::MUL);
   1231   if (Subtarget->hasBMI())
   1232     setTargetDAGCombine(ISD::XOR);
   1233 
   1234   computeRegisterProperties();
   1235 
   1236   // On Darwin, -Os means optimize for size without hurting performance, so
   1237   // do not reduce the limit.
   1238   maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1239   maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
   1240   maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1241   maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1242   maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1243   maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1244   setPrefLoopAlignment(4); // 2^4 bytes.
   1245   benefitFromCodePlacementOpt = true;
   1246 
   1247   setPrefFunctionAlignment(4); // 2^4 bytes.
   1248 }
   1249 
   1250 
   1251 EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
   1252   if (!VT.isVector()) return MVT::i8;
   1253   return VT.changeVectorElementTypeToInteger();
   1254 }
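        // Illustrative note (added; not in the original source):
        //   e.g. getSetCCResultType(MVT::f32)   -> MVT::i8
        //        getSetCCResultType(MVT::v4f32) -> MVT::v4i32
        // i.e. vector compares keep the element count and switch to integer
        // elements of matching width.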
   1255 
   1256 
   1257 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
   1258 /// the desired ByVal argument alignment.
   1259 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1260   if (MaxAlign == 16)
   1261     return;
   1262   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1263     if (VTy->getBitWidth() == 128)
   1264       MaxAlign = 16;
   1265   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1266     unsigned EltAlign = 0;
   1267     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1268     if (EltAlign > MaxAlign)
   1269       MaxAlign = EltAlign;
   1270   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1271     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
   1272       unsigned EltAlign = 0;
   1273       getMaxByValAlign(STy->getElementType(i), EltAlign);
   1274       if (EltAlign > MaxAlign)
   1275         MaxAlign = EltAlign;
   1276       if (MaxAlign == 16)
   1277         break;
   1278     }
   1279   }
   1280   return;
   1281 }
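        // Illustrative example (assumption, not in the original source): for a
        // struct such as { float; <4 x float> }, the 128-bit vector member drives
        // MaxAlign up to 16, while a struct of plain scalars leaves it unchanged.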
   1282 
   1283 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
   1284 /// function arguments in the caller parameter area. For X86, aggregates
   1285 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1286 /// are at 4-byte boundaries.
   1287 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   1288   if (Subtarget->is64Bit()) {
   1289     // Max of 8 and alignment of type.
   1290     unsigned TyAlign = TD->getABITypeAlignment(Ty);
   1291     if (TyAlign > 8)
   1292       return TyAlign;
   1293     return 8;
   1294   }
   1295 
   1296   unsigned Align = 4;
   1297   if (Subtarget->hasSSE1())
   1298     getMaxByValAlign(Ty, Align);
   1299   return Align;
   1300 }
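        // Illustrative example (assumption): on x86-64 a byval aggregate of i32
        // fields is aligned to max(8, its ABI alignment); on 32-bit with SSE
        // enabled, an aggregate containing a 128-bit vector gets 16 bytes, and
        // everything else stays at 4.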
   1301 
   1302 /// getOptimalMemOpType - Returns the target-specific optimal type for load
   1303 /// and store operations as a result of memset, memcpy, and memmove
   1304 /// lowering. If DstAlign is zero, the destination alignment can satisfy
   1305 /// any constraint. Similarly, if SrcAlign is zero there is no need to
   1306 /// check it against the alignment requirement, probably because the
   1307 /// source does not need to be loaded. If 'IsZeroVal' is true, it is safe
   1308 /// to return a non-scalar-integer type, e.g. when the source is an empty
   1309 /// string, a constant, or loaded from memory. 'MemcpyStrSrc' indicates
   1310 /// whether the memcpy source is constant so it does not need to be
   1311 /// loaded.
   1312 /// It returns EVT::Other if the type should be determined using generic
   1313 /// target-independent logic.
   1314 EVT
   1315 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1316                                        unsigned DstAlign, unsigned SrcAlign,
   1317                                        bool IsZeroVal,
   1318                                        bool MemcpyStrSrc,
   1319                                        MachineFunction &MF) const {
   1320   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
   1321   // linux.  This is because the stack realignment code can't handle certain
   1322   // cases like PR2962.  This should be removed when PR2962 is fixed.
   1323   const Function *F = MF.getFunction();
   1324   if (IsZeroVal &&
   1325       !F->hasFnAttr(Attribute::NoImplicitFloat)) {
   1326     if (Size >= 16 &&
   1327         (Subtarget->isUnalignedMemAccessFast() ||
   1328          ((DstAlign == 0 || DstAlign >= 16) &&
   1329           (SrcAlign == 0 || SrcAlign >= 16))) &&
   1330         Subtarget->getStackAlignment() >= 16) {
   1331       if (Subtarget->getStackAlignment() >= 32) {
   1332         if (Subtarget->hasAVX2())
   1333           return MVT::v8i32;
   1334         if (Subtarget->hasAVX())
   1335           return MVT::v8f32;
   1336       }
   1337       if (Subtarget->hasSSE2())
   1338         return MVT::v4i32;
   1339       if (Subtarget->hasSSE1())
   1340         return MVT::v4f32;
   1341     } else if (!MemcpyStrSrc && Size >= 8 &&
   1342                !Subtarget->is64Bit() &&
   1343                Subtarget->getStackAlignment() >= 8 &&
   1344                Subtarget->hasSSE2()) {
   1345       // Do not use f64 to lower memcpy if source is string constant. It's
   1346       // better to use i32 to avoid the loads.
   1347       return MVT::f64;
   1348     }
   1349   }
   1350   if (Subtarget->is64Bit() && Size >= 8)
   1351     return MVT::i64;
   1352   return MVT::i32;
   1353 }
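        // Rough summary of the choices above (informal, for orientation only):
        // a large zero memset with a 16-byte aligned destination and stack uses
        // v4i32 stores on SSE2 (v8i32/v8f32 with a 32-byte stack and AVX2/AVX),
        // a small non-constant-string memcpy on 32-bit SSE2 uses f64, and
        // everything else falls back to i64 on 64-bit targets or i32 elsewhere.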
   1354 
   1355 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
   1356 /// current function.  The returned value is a member of the
   1357 /// MachineJumpTableInfo::JTEntryKind enum.
   1358 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1359   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1360   // symbol.
   1361   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1362       Subtarget->isPICStyleGOT())
   1363     return MachineJumpTableInfo::EK_Custom32;
   1364 
   1365   // Otherwise, use the normal jump table encoding heuristics.
   1366   return TargetLowering::getJumpTableEncoding();
   1367 }
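        // Note (cross-reference added for clarity): with EK_Custom32 each entry is
        // emitted as a 32-bit @GOTOFF expression, produced by
        // LowerCustomJumpTableEntry below.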
   1368 
   1369 const MCExpr *
   1370 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   1371                                              const MachineBasicBlock *MBB,
   1372                                              unsigned uid,MCContext &Ctx) const{
   1373   assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1374          Subtarget->isPICStyleGOT());
   1375   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   1376   // entries.
   1377   return MCSymbolRefExpr::Create(MBB->getSymbol(),
   1378                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   1379 }
   1380 
   1381 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
   1382 /// jumptable.
   1383 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1384                                                     SelectionDAG &DAG) const {
   1385   if (!Subtarget->is64Bit())
   1386     // This doesn't have DebugLoc associated with it, but is not really the
   1387     // same as a Register.
   1388     return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
   1389   return Table;
   1390 }
   1391 
   1392 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
   1393 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
   1394 /// MCExpr.
   1395 const MCExpr *X86TargetLowering::
   1396 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   1397                              MCContext &Ctx) const {
   1398   // X86-64 uses RIP relative addressing based on the jump table label.
   1399   if (Subtarget->isPICStyleRIPRel())
   1400     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   1401 
   1402   // Otherwise, the reference is relative to the PIC base.
   1403   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
   1404 }
   1405 
   1406 // FIXME: Why is this routine here? Move it to RegInfo!
   1407 std::pair<const TargetRegisterClass*, uint8_t>
   1408 X86TargetLowering::findRepresentativeClass(EVT VT) const{
   1409   const TargetRegisterClass *RRC = 0;
   1410   uint8_t Cost = 1;
   1411   switch (VT.getSimpleVT().SimpleTy) {
   1412   default:
   1413     return TargetLowering::findRepresentativeClass(VT);
   1414   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   1415     RRC = (Subtarget->is64Bit()
   1416            ? X86::GR64RegisterClass : X86::GR32RegisterClass);
   1417     break;
   1418   case MVT::x86mmx:
   1419     RRC = X86::VR64RegisterClass;
   1420     break;
   1421   case MVT::f32: case MVT::f64:
   1422   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   1423   case MVT::v4f32: case MVT::v2f64:
   1424   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   1425   case MVT::v4f64:
   1426     RRC = X86::VR128RegisterClass;
   1427     break;
   1428   }
   1429   return std::make_pair(RRC, Cost);
   1430 }
   1431 
   1432 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   1433                                                unsigned &Offset) const {
   1434   if (!Subtarget->isTargetLinux())
   1435     return false;
   1436 
   1437   if (Subtarget->is64Bit()) {
   1438     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
   1439     Offset = 0x28;
   1440     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   1441       AddressSpace = 256;
   1442     else
   1443       AddressSpace = 257;
   1444   } else {
   1445     // %gs:0x14 on i386
   1446     Offset = 0x14;
   1447     AddressSpace = 256;
   1448   }
   1449   return true;
   1450 }
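        // Note (added for clarity; mirrors the constants above): in the X86
        // backend address space 256 denotes the GS segment and 257 the FS segment,
        // so the cookie is %fs:0x28 on 64-bit (%gs:0x28 under the kernel code
        // model) and %gs:0x14 on i386.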
   1451 
   1452 
   1453 //===----------------------------------------------------------------------===//
   1454 //               Return Value Calling Convention Implementation
   1455 //===----------------------------------------------------------------------===//
   1456 
   1457 #include "X86GenCallingConv.inc"
   1458 
   1459 bool
   1460 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   1461                                   MachineFunction &MF, bool isVarArg,
   1462                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
   1463                                   LLVMContext &Context) const {
   1464   SmallVector<CCValAssign, 16> RVLocs;
   1465   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1466                  RVLocs, Context);
   1467   return CCInfo.CheckReturn(Outs, RetCC_X86);
   1468 }
   1469 
   1470 SDValue
   1471 X86TargetLowering::LowerReturn(SDValue Chain,
   1472                                CallingConv::ID CallConv, bool isVarArg,
   1473                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1474                                const SmallVectorImpl<SDValue> &OutVals,
   1475                                DebugLoc dl, SelectionDAG &DAG) const {
   1476   MachineFunction &MF = DAG.getMachineFunction();
   1477   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1478 
   1479   SmallVector<CCValAssign, 16> RVLocs;
   1480   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1481                  RVLocs, *DAG.getContext());
   1482   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   1483 
   1484   // Add the regs to the liveout set for the function.
   1485   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
   1486   for (unsigned i = 0; i != RVLocs.size(); ++i)
   1487     if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
   1488       MRI.addLiveOut(RVLocs[i].getLocReg());
   1489 
   1490   SDValue Flag;
   1491 
   1492   SmallVector<SDValue, 6> RetOps;
   1493   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   1494   // Operand #1 = Bytes To Pop
   1495   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
   1496                    MVT::i16));
   1497 
   1498   // Copy the result values into the output registers.
   1499   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1500     CCValAssign &VA = RVLocs[i];
   1501     assert(VA.isRegLoc() && "Can only return in registers!");
   1502     SDValue ValToCopy = OutVals[i];
   1503     EVT ValVT = ValToCopy.getValueType();
   1504 
   1505     // If this is x86-64, and we disabled SSE, we can't return FP values,
   1506     // or SSE or MMX vectors.
   1507     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   1508          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   1509           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
   1510       report_fatal_error("SSE register return with SSE disabled");
   1511     }
   1512     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   1513     // llvm-gcc has never done it right and no one has noticed, so this
   1514     // should be OK for now.
   1515     if (ValVT == MVT::f64 &&
   1516         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
   1517       report_fatal_error("SSE2 register return with SSE2 disabled");
   1518 
   1519     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   1520     // the RET instruction and handled by the FP Stackifier.
   1521     if (VA.getLocReg() == X86::ST0 ||
   1522         VA.getLocReg() == X86::ST1) {
   1523       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   1524       // change the value to the FP stack register class.
   1525       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   1526         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   1527       RetOps.push_back(ValToCopy);
   1528       // Don't emit a copytoreg.
   1529       continue;
   1530     }
   1531 
   1532     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   1533     // which is returned in RAX / RDX.
   1534     if (Subtarget->is64Bit()) {
   1535       if (ValVT == MVT::x86mmx) {
   1536         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   1537           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
   1538           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   1539                                   ValToCopy);
   1540           // If we don't have SSE2 available, convert to v4f32 so the generated
   1541           // register is legal.
   1542           if (!Subtarget->hasSSE2())
   1543             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
   1544         }
   1545       }
   1546     }
   1547 
   1548     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   1549     Flag = Chain.getValue(1);
   1550   }
   1551 
   1552   // The x86-64 ABI for returning structs by value requires that we copy
   1553   // the sret argument into %rax for the return. We saved the argument into
   1554   // a virtual register in the entry block, so now we copy the value out
   1555   // and into %rax.
   1556   if (Subtarget->is64Bit() &&
   1557       DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
   1558     MachineFunction &MF = DAG.getMachineFunction();
   1559     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1560     unsigned Reg = FuncInfo->getSRetReturnReg();
   1561     assert(Reg &&
   1562            "SRetReturnReg should have been set in LowerFormalArguments().");
   1563     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
   1564 
   1565     Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
   1566     Flag = Chain.getValue(1);
   1567 
   1568     // RAX now acts like a return value.
   1569     MRI.addLiveOut(X86::RAX);
   1570   }
   1571 
   1572   RetOps[0] = Chain;  // Update chain.
   1573 
   1574   // Add the flag if we have it.
   1575   if (Flag.getNode())
   1576     RetOps.push_back(Flag);
   1577 
   1578   return DAG.getNode(X86ISD::RET_FLAG, dl,
   1579                      MVT::Other, &RetOps[0], RetOps.size());
   1580 }
   1581 
   1582 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   1583   if (N->getNumValues() != 1)
   1584     return false;
   1585   if (!N->hasNUsesOfValue(1, 0))
   1586     return false;
   1587 
   1588   SDValue TCChain = Chain;
   1589   SDNode *Copy = *N->use_begin();
   1590   if (Copy->getOpcode() == ISD::CopyToReg) {
   1591     // If the copy has a glue operand, we conservatively assume it isn't safe to
   1592     // perform a tail call.
   1593     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   1594       return false;
   1595     TCChain = Copy->getOperand(0);
   1596   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   1597     return false;
   1598 
   1599   bool HasRet = false;
   1600   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   1601        UI != UE; ++UI) {
   1602     if (UI->getOpcode() != X86ISD::RET_FLAG)
   1603       return false;
   1604     HasRet = true;
   1605   }
   1606 
   1607   if (!HasRet)
   1608     return false;
   1609 
   1610   Chain = TCChain;
   1611   return true;
   1612 }
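        // In short (summary added for readability): a value can be folded into the
        // return for a tail call only if its single use is a glue-free CopyToReg or
        // an FP_EXTEND, and every user of that node is an X86ISD::RET_FLAG.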
   1613 
   1614 EVT
   1615 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
   1616                                             ISD::NodeType ExtendKind) const {
   1617   MVT ReturnMVT;
   1618   // TODO: Is this also valid on 32-bit?
   1619   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
   1620     ReturnMVT = MVT::i8;
   1621   else
   1622     ReturnMVT = MVT::i32;
   1623 
   1624   EVT MinVT = getRegisterType(Context, ReturnMVT);
   1625   return VT.bitsLT(MinVT) ? MinVT : VT;
   1626 }
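        // Illustrative example (not in the original source):
        //   getTypeForExtArgOrReturn(Ctx, MVT::i1, ISD::ZERO_EXTEND) -> i8
        // on x86-64; in every other case the minimum promoted width for an
        // extended return value is i32.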
   1627 
   1628 /// LowerCallResult - Lower the result values of a call into the
   1629 /// appropriate copies out of appropriate physical registers.
   1630 ///
   1631 SDValue
   1632 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   1633                                    CallingConv::ID CallConv, bool isVarArg,
   1634                                    const SmallVectorImpl<ISD::InputArg> &Ins,
   1635                                    DebugLoc dl, SelectionDAG &DAG,
   1636                                    SmallVectorImpl<SDValue> &InVals) const {
   1637 
   1638   // Assign locations to each value returned by this call.
   1639   SmallVector<CCValAssign, 16> RVLocs;
   1640   bool Is64Bit = Subtarget->is64Bit();
   1641   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1642                  getTargetMachine(), RVLocs, *DAG.getContext());
   1643   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   1644 
   1645   // Copy all of the result registers out of their specified physreg.
   1646   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1647     CCValAssign &VA = RVLocs[i];
   1648     EVT CopyVT = VA.getValVT();
   1649 
   1650     // If this is x86-64, and we disabled SSE, we can't return FP values
   1651     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
   1652         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
   1653       report_fatal_error("SSE register return with SSE disabled");
   1654     }
   1655 
   1656     SDValue Val;
   1657 
   1658     // If this is a call to a function that returns an fp value on the floating
   1659     // point stack, we must guarantee that the value is popped from the stack, so
   1660     // a CopyFromReg is not good enough - the copy instruction may be eliminated
   1661     // if the return value is not used. We use the FpPOP_RETVAL instruction
   1662     // instead.
   1663     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
   1664       // If we prefer to use the value in xmm registers, copy it out as f80 and
   1665       // use a truncate to move it from fp stack reg to xmm reg.
   1666       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
   1667       SDValue Ops[] = { Chain, InFlag };
   1668       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
   1669                                          MVT::Other, MVT::Glue, Ops, 2), 1);
   1670       Val = Chain.getValue(0);
   1671 
   1672       // Round the f80 to the right size, which also moves it to the appropriate
   1673       // xmm register.
   1674       if (CopyVT != VA.getValVT())
   1675         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   1676                           // This truncation won't change the value.
   1677                           DAG.getIntPtrConstant(1));
   1678     } else {
   1679       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   1680                                  CopyVT, InFlag).getValue(1);
   1681       Val = Chain.getValue(0);
   1682     }
   1683     InFlag = Chain.getValue(2);
   1684     InVals.push_back(Val);
   1685   }
   1686 
   1687   return Chain;
   1688 }
   1689 
   1690 
   1691 //===----------------------------------------------------------------------===//
   1692 //                C & StdCall & Fast Calling Convention implementation
   1693 //===----------------------------------------------------------------------===//
   1694 //  The StdCall calling convention is the standard for many Windows API
   1695 //  routines. It differs from the C calling convention just a little: the
   1696 //  callee cleans up the stack, not the caller, and symbols are decorated
   1697 //  in some fancy way :) It doesn't support any vector arguments.
   1698 //  For info on fast calling convention see Fast Calling Convention (tail call)
   1699 //  implementation LowerX86_32FastCCCallTo.
   1700 
   1701 /// CallIsStructReturn - Determines whether a call uses struct return
   1702 /// semantics.
   1703 static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   1704   if (Outs.empty())
   1705     return false;
   1706 
   1707   return Outs[0].Flags.isSRet();
   1708 }
   1709 
   1710 /// ArgsAreStructReturn - Determines whether a function uses struct
   1711 /// return semantics.
   1712 static bool
   1713 ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   1714   if (Ins.empty())
   1715     return false;
   1716 
   1717   return Ins[0].Flags.isSRet();
   1718 }
   1719 
   1720 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
   1721 /// by "Src" to address "Dst" with size and alignment information specified by
   1722 /// the specific parameter attribute. The copy will be passed as a byval
   1723 /// function parameter.
   1724 static SDValue
   1725 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   1726                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
   1727                           DebugLoc dl) {
   1728   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   1729 
   1730   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   1731                        /*isVolatile*/false, /*AlwaysInline=*/true,
   1732                        MachinePointerInfo(), MachinePointerInfo());
   1733 }
   1734 
   1735 /// IsTailCallConvention - Return true if the calling convention is one that
   1736 /// supports tail call optimization.
   1737 static bool IsTailCallConvention(CallingConv::ID CC) {
   1738   return (CC == CallingConv::Fast || CC == CallingConv::GHC);
   1739 }
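        // Note (summary added for readability): only fastcc and the GHC convention
        // opt in to guaranteed tail calls; ordinary C calls can still be emitted as
        // sibcalls when no ABI change is required (see the IsSibcall handling in
        // LowerCall below).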
   1740 
   1741 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   1742   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
   1743     return false;
   1744 
   1745   CallSite CS(CI);
   1746   CallingConv::ID CalleeCC = CS.getCallingConv();
   1747   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
   1748     return false;
   1749 
   1750   return true;
   1751 }
   1752 
   1753 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
   1754 /// a tailcall target by changing its ABI.
   1755 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
   1756                                    bool GuaranteedTailCallOpt) {
   1757   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
   1758 }
   1759 
   1760 SDValue
   1761 X86TargetLowering::LowerMemArgument(SDValue Chain,
   1762                                     CallingConv::ID CallConv,
   1763                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   1764                                     DebugLoc dl, SelectionDAG &DAG,
   1765                                     const CCValAssign &VA,
   1766                                     MachineFrameInfo *MFI,
   1767                                     unsigned i) const {
   1768   // Create the nodes corresponding to a load from this parameter slot.
   1769   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   1770   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
   1771                               getTargetMachine().Options.GuaranteedTailCallOpt);
   1772   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   1773   EVT ValVT;
   1774 
   1775   // If the value is passed by pointer, we have the address passed instead of
   1776   // the value itself.
   1777   if (VA.getLocInfo() == CCValAssign::Indirect)
   1778     ValVT = VA.getLocVT();
   1779   else
   1780     ValVT = VA.getValVT();
   1781 
   1782   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   1783   // changed with more analysis.
   1784   // In case of tail call optimization, mark all arguments mutable, since they
   1785   // could be overwritten by the lowering of arguments in case of a tail call.
   1786   if (Flags.isByVal()) {
   1787     unsigned Bytes = Flags.getByValSize();
   1788     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   1789     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   1790     return DAG.getFrameIndex(FI, getPointerTy());
   1791   } else {
   1792     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   1793                                     VA.getLocMemOffset(), isImmutable);
   1794     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
   1795     return DAG.getLoad(ValVT, dl, Chain, FIN,
   1796                        MachinePointerInfo::getFixedStack(FI),
   1797                        false, false, false, 0);
   1798   }
   1799 }
   1800 
   1801 SDValue
   1802 X86TargetLowering::LowerFormalArguments(SDValue Chain,
   1803                                         CallingConv::ID CallConv,
   1804                                         bool isVarArg,
   1805                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   1806                                         DebugLoc dl,
   1807                                         SelectionDAG &DAG,
   1808                                         SmallVectorImpl<SDValue> &InVals)
   1809                                           const {
   1810   MachineFunction &MF = DAG.getMachineFunction();
   1811   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1812 
   1813   const Function* Fn = MF.getFunction();
   1814   if (Fn->hasExternalLinkage() &&
   1815       Subtarget->isTargetCygMing() &&
   1816       Fn->getName() == "main")
   1817     FuncInfo->setForceFramePointer(true);
   1818 
   1819   MachineFrameInfo *MFI = MF.getFrameInfo();
   1820   bool Is64Bit = Subtarget->is64Bit();
   1821   bool IsWindows = Subtarget->isTargetWindows();
   1822   bool IsWin64 = Subtarget->isTargetWin64();
   1823 
   1824   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   1825          "Var args not supported with calling convention fastcc or ghc");
   1826 
   1827   // Assign locations to all of the incoming arguments.
   1828   SmallVector<CCValAssign, 16> ArgLocs;
   1829   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1830                  ArgLocs, *DAG.getContext());
   1831 
   1832   // Allocate shadow area for Win64
   1833   if (IsWin64) {
   1834     CCInfo.AllocateStack(32, 8);
   1835   }
   1836 
   1837   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   1838 
   1839   unsigned LastVal = ~0U;
   1840   SDValue ArgValue;
   1841   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   1842     CCValAssign &VA = ArgLocs[i];
   1843     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   1844     // places.
   1845     assert(VA.getValNo() != LastVal &&
   1846            "Don't support value assigned to multiple locs yet");
   1847     (void)LastVal;
   1848     LastVal = VA.getValNo();
   1849 
   1850     if (VA.isRegLoc()) {
   1851       EVT RegVT = VA.getLocVT();
   1852       const TargetRegisterClass *RC;
   1853       if (RegVT == MVT::i32)
   1854         RC = X86::GR32RegisterClass;
   1855       else if (Is64Bit && RegVT == MVT::i64)
   1856         RC = X86::GR64RegisterClass;
   1857       else if (RegVT == MVT::f32)
   1858         RC = X86::FR32RegisterClass;
   1859       else if (RegVT == MVT::f64)
   1860         RC = X86::FR64RegisterClass;
   1861       else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
   1862         RC = X86::VR256RegisterClass;
   1863       else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
   1864         RC = X86::VR128RegisterClass;
   1865       else if (RegVT == MVT::x86mmx)
   1866         RC = X86::VR64RegisterClass;
   1867       else
   1868         llvm_unreachable("Unknown argument type!");
   1869 
   1870       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   1871       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   1872 
   1873       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   1874       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   1875       // right size.
   1876       if (VA.getLocInfo() == CCValAssign::SExt)
   1877         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   1878                                DAG.getValueType(VA.getValVT()));
   1879       else if (VA.getLocInfo() == CCValAssign::ZExt)
   1880         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   1881                                DAG.getValueType(VA.getValVT()));
   1882       else if (VA.getLocInfo() == CCValAssign::BCvt)
   1883         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
   1884 
   1885       if (VA.isExtInLoc()) {
   1886         // Handle MMX values passed in XMM regs.
   1887         if (RegVT.isVector()) {
   1888           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
   1889                                  ArgValue);
   1890         } else
   1891           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   1892       }
   1893     } else {
   1894       assert(VA.isMemLoc());
   1895       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   1896     }
   1897 
   1898     // If the value is passed via a pointer, do a load.
   1899     if (VA.getLocInfo() == CCValAssign::Indirect)
   1900       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   1901                              MachinePointerInfo(), false, false, false, 0);
   1902 
   1903     InVals.push_back(ArgValue);
   1904   }
   1905 
   1906   // The x86-64 ABI for returning structs by value requires that we copy
   1907   // the sret argument into %rax for the return. Save the argument into
   1908   // a virtual register so that we can access it from the return points.
   1909   if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
   1910     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1911     unsigned Reg = FuncInfo->getSRetReturnReg();
   1912     if (!Reg) {
   1913       Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
   1914       FuncInfo->setSRetReturnReg(Reg);
   1915     }
   1916     SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
   1917     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   1918   }
   1919 
   1920   unsigned StackSize = CCInfo.getNextStackOffset();
   1921   // Align stack specially for tail calls.
   1922   if (FuncIsMadeTailCallSafe(CallConv,
   1923                              MF.getTarget().Options.GuaranteedTailCallOpt))
   1924     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   1925 
   1926   // If the function takes a variable number of arguments, make a frame index for
   1927   // the start of the first vararg value... for expansion of llvm.va_start.
   1928   if (isVarArg) {
   1929     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   1930                     CallConv != CallingConv::X86_ThisCall)) {
   1931       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
   1932     }
   1933     if (Is64Bit) {
   1934       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
   1935 
   1936       // FIXME: We should really autogenerate these arrays
   1937       static const uint16_t GPR64ArgRegsWin64[] = {
   1938         X86::RCX, X86::RDX, X86::R8,  X86::R9
   1939       };
   1940       static const uint16_t GPR64ArgRegs64Bit[] = {
   1941         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   1942       };
   1943       static const uint16_t XMMArgRegs64Bit[] = {
   1944         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   1945         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   1946       };
   1947       const uint16_t *GPR64ArgRegs;
   1948       unsigned NumXMMRegs = 0;
   1949 
   1950       if (IsWin64) {
   1951         // The XMM registers which might contain var arg parameters are shadowed
   1952         // in their paired GPR.  So we only need to save the GPRs to their home
   1953         // slots.
   1954         TotalNumIntRegs = 4;
   1955         GPR64ArgRegs = GPR64ArgRegsWin64;
   1956       } else {
   1957         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
   1958         GPR64ArgRegs = GPR64ArgRegs64Bit;
   1959 
   1960         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
   1961                                                 TotalNumXMMRegs);
   1962       }
   1963       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
   1964                                                        TotalNumIntRegs);
   1965 
   1966       bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
   1967       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
   1968              "SSE register cannot be used when SSE is disabled!");
   1969       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
   1970                NoImplicitFloatOps) &&
   1971              "SSE register cannot be used when SSE is disabled!");
   1972       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
   1973           !Subtarget->hasSSE1())
   1974         // Kernel mode asks for SSE to be disabled, so don't push them
   1975         // on the stack.
   1976         TotalNumXMMRegs = 0;
   1977 
   1978       if (IsWin64) {
   1979         const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
   1980         // Get to the caller-allocated home save location.  Add 8 to account
   1981         // for the return address.
   1982         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   1983         FuncInfo->setRegSaveFrameIndex(
   1984           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   1985         // Fixup to set vararg frame on shadow area (4 x i64).
   1986         if (NumIntRegs < 4)
   1987           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   1988       } else {
   1989         // For X86-64, if there are vararg parameters that are passed via
   1990         // registers, then we must store them to their spots on the stack so
   1991         // they may be loaded by dereferencing the result of va_next.
   1992         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   1993         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
   1994         FuncInfo->setRegSaveFrameIndex(
   1995           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
   1996                                false));
   1997       }
   1998 
   1999       // Store the integer parameter registers.
   2000       SmallVector<SDValue, 8> MemOps;
   2001       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2002                                         getPointerTy());
   2003       unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2004       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
   2005         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
   2006                                   DAG.getIntPtrConstant(Offset));
   2007         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
   2008                                      X86::GR64RegisterClass);
   2009         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
   2010         SDValue Store =
   2011           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2012                        MachinePointerInfo::getFixedStack(
   2013                          FuncInfo->getRegSaveFrameIndex(), Offset),
   2014                        false, false, 0);
   2015         MemOps.push_back(Store);
   2016         Offset += 8;
   2017       }
   2018 
   2019       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
   2020         // Now store the XMM (fp + vector) parameter registers.
   2021         SmallVector<SDValue, 11> SaveXMMOps;
   2022         SaveXMMOps.push_back(Chain);
   2023 
   2024         unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
   2025         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
   2026         SaveXMMOps.push_back(ALVal);
   2027 
   2028         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2029                                FuncInfo->getRegSaveFrameIndex()));
   2030         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2031                                FuncInfo->getVarArgsFPOffset()));
   2032 
   2033         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
   2034           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
   2035                                        X86::VR128RegisterClass);
   2036           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
   2037           SaveXMMOps.push_back(Val);
   2038         }
   2039         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2040                                      MVT::Other,
   2041                                      &SaveXMMOps[0], SaveXMMOps.size()));
   2042       }
   2043 
   2044       if (!MemOps.empty())
   2045         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2046                             &MemOps[0], MemOps.size());
   2047     }
   2048   }
   2049 
   2050   // Some CCs need callee pop.
   2051   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2052                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2053     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2054   } else {
   2055     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2056     // If this is an sret function, the return should pop the hidden pointer.
   2057     if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
   2058         ArgsAreStructReturn(Ins))
   2059       FuncInfo->setBytesToPopOnReturn(4);
   2060   }
   2061 
   2062   if (!Is64Bit) {
   2063     // RegSaveFrameIndex is X86-64 only.
   2064     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   2065     if (CallConv == CallingConv::X86_FastCall ||
   2066         CallConv == CallingConv::X86_ThisCall)
   2067       // fastcc functions can't have varargs.
   2068       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2069   }
   2070 
   2071   FuncInfo->setArgumentStackSize(StackSize);
   2072 
   2073   return Chain;
   2074 }
   2075 
   2076 SDValue
   2077 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
   2078                                     SDValue StackPtr, SDValue Arg,
   2079                                     DebugLoc dl, SelectionDAG &DAG,
   2080                                     const CCValAssign &VA,
   2081                                     ISD::ArgFlagsTy Flags) const {
   2082   unsigned LocMemOffset = VA.getLocMemOffset();
   2083   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   2084   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
   2085   if (Flags.isByVal())
   2086     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2087 
   2088   return DAG.getStore(Chain, dl, Arg, PtrOff,
   2089                       MachinePointerInfo::getStack(LocMemOffset),
   2090                       false, false, 0);
   2091 }
   2092 
   2093 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
   2094 /// optimization is performed and it is required.
   2095 SDValue
   2096 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   2097                                            SDValue &OutRetAddr, SDValue Chain,
   2098                                            bool IsTailCall, bool Is64Bit,
   2099                                            int FPDiff, DebugLoc dl) const {
   2100   // Adjust the Return address stack slot.
   2101   EVT VT = getPointerTy();
   2102   OutRetAddr = getReturnAddressFrameIndex(DAG);
   2103 
   2104   // Load the "old" Return address.
   2105   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   2106                            false, false, false, 0);
   2107   return SDValue(OutRetAddr.getNode(), 1);
   2108 }
   2109 
   2110 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
   2111 /// optimization is performed and it is required (FPDiff!=0).
   2112 static SDValue
   2113 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
   2114                          SDValue Chain, SDValue RetAddrFrIdx,
   2115                          bool Is64Bit, int FPDiff, DebugLoc dl) {
   2116   // Store the return address to the appropriate stack slot.
   2117   if (!FPDiff) return Chain;
   2118   // Calculate the new stack slot for the return address.
   2119   int SlotSize = Is64Bit ? 8 : 4;
   2120   int NewReturnAddrFI =
   2121     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
   2122   EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
   2123   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
   2124   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   2125                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
   2126                        false, false, 0);
   2127   return Chain;
   2128 }
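        // Worked example (hypothetical numbers, for illustration only): if the
        // caller pops 32 bytes of arguments on return but the tail callee needs
        // only 16, FPDiff is 16 and the saved return address is re-stored into a
        // fixed object at offset FPDiff - SlotSize, i.e. 8 on x86-64, so it ends
        // up in the slot the adjusted stack expects.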
   2129 
   2130 SDValue
   2131 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   2132                              CallingConv::ID CallConv, bool isVarArg,
   2133                              bool doesNotRet, bool &isTailCall,
   2134                              const SmallVectorImpl<ISD::OutputArg> &Outs,
   2135                              const SmallVectorImpl<SDValue> &OutVals,
   2136                              const SmallVectorImpl<ISD::InputArg> &Ins,
   2137                              DebugLoc dl, SelectionDAG &DAG,
   2138                              SmallVectorImpl<SDValue> &InVals) const {
   2139   MachineFunction &MF = DAG.getMachineFunction();
   2140   bool Is64Bit        = Subtarget->is64Bit();
   2141   bool IsWin64        = Subtarget->isTargetWin64();
   2142   bool IsWindows      = Subtarget->isTargetWindows();
   2143   bool IsStructRet    = CallIsStructReturn(Outs);
   2144   bool IsSibcall      = false;
   2145 
   2146   if (MF.getTarget().Options.DisableTailCalls)
   2147     isTailCall = false;
   2148 
   2149   if (isTailCall) {
   2150     // Check if it's really possible to do a tail call.
   2151     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   2152                     isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
   2153                                                    Outs, OutVals, Ins, DAG);
   2154 
   2155     // Sibcalls are automatically detected tailcalls which do not require
   2156     // ABI changes.
   2157     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   2158       IsSibcall = true;
   2159 
   2160     if (isTailCall)
   2161       ++NumTailCalls;
   2162   }
   2163 
   2164   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2165          "Var args not supported with calling convention fastcc or ghc");
   2166 
   2167   // Analyze operands of the call, assigning locations to each operand.
   2168   SmallVector<CCValAssign, 16> ArgLocs;
   2169   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   2170                  ArgLocs, *DAG.getContext());
   2171 
   2172   // Allocate shadow area for Win64
   2173   if (IsWin64) {
   2174     CCInfo.AllocateStack(32, 8);
   2175   }
   2176 
   2177   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2178 
   2179   // Get a count of how many bytes are to be pushed on the stack.
   2180   unsigned NumBytes = CCInfo.getNextStackOffset();
   2181   if (IsSibcall)
   2182     // This is a sibcall. The memory operands are already available in the
   2183     // caller's own incoming argument area.
   2184     NumBytes = 0;
   2185   else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
   2186            IsTailCallConvention(CallConv))
   2187     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   2188 
   2189   int FPDiff = 0;
   2190   if (isTailCall && !IsSibcall) {
   2191     // Lower arguments at fp - stackoffset + fpdiff.
   2192     unsigned NumBytesCallerPushed =
   2193       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
   2194     FPDiff = NumBytesCallerPushed - NumBytes;
   2195 
   2196     // Set the delta of movement of the returnaddr stackslot.
   2197     // But only set if delta is greater than previous delta.
   2198     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
   2199       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
   2200   }
   2201 
   2202   if (!IsSibcall)
   2203     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
   2204 
   2205   SDValue RetAddrFrIdx;
   2206   // Load return address for tail calls.
   2207   if (isTailCall && FPDiff)
   2208     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   2209                                     Is64Bit, FPDiff, dl);
   2210 
   2211   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2212   SmallVector<SDValue, 8> MemOpChains;
   2213   SDValue StackPtr;
   2214 
   2215   // Walk the register/memloc assignments, inserting copies/loads.  In the case
    2216   // of tail call optimization, arguments are handled later.
   2217   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2218     CCValAssign &VA = ArgLocs[i];
   2219     EVT RegVT = VA.getLocVT();
   2220     SDValue Arg = OutVals[i];
   2221     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2222     bool isByVal = Flags.isByVal();
   2223 
   2224     // Promote the value if needed.
   2225     switch (VA.getLocInfo()) {
   2226     default: llvm_unreachable("Unknown loc info!");
   2227     case CCValAssign::Full: break;
   2228     case CCValAssign::SExt:
   2229       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   2230       break;
   2231     case CCValAssign::ZExt:
   2232       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   2233       break;
   2234     case CCValAssign::AExt:
   2235       if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
   2236         // Special case: passing MMX values in XMM registers.
   2237         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
   2238         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   2239         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   2240       } else
   2241         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   2242       break;
   2243     case CCValAssign::BCvt:
   2244       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
   2245       break;
   2246     case CCValAssign::Indirect: {
   2247       // Store the argument.
   2248       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   2249       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   2250       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
   2251                            MachinePointerInfo::getFixedStack(FI),
   2252                            false, false, 0);
   2253       Arg = SpillSlot;
   2254       break;
   2255     }
   2256     }
   2257 
   2258     if (VA.isRegLoc()) {
   2259       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   2260       if (isVarArg && IsWin64) {
    2261         // The Win64 ABI requires an argument passed in an XMM register to also be
    2262         // copied to the corresponding shadow (integer) register if the callee takes varargs.
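                 // For example, on Win64 a varargs call like printf("%f", X) passes the
                 // double in XMM1 and, because of the copy below, in RDX as well, so a
                 // callee that accesses the argument through va_arg (which reads the
                 // integer register home area) still sees the value.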
   2263         unsigned ShadowReg = 0;
   2264         switch (VA.getLocReg()) {
   2265         case X86::XMM0: ShadowReg = X86::RCX; break;
   2266         case X86::XMM1: ShadowReg = X86::RDX; break;
   2267         case X86::XMM2: ShadowReg = X86::R8; break;
   2268         case X86::XMM3: ShadowReg = X86::R9; break;
   2269         }
   2270         if (ShadowReg)
   2271           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   2272       }
   2273     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   2274       assert(VA.isMemLoc());
   2275       if (StackPtr.getNode() == 0)
   2276         StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
   2277       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   2278                                              dl, DAG, VA, Flags));
   2279     }
   2280   }
   2281 
   2282   if (!MemOpChains.empty())
   2283     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2284                         &MemOpChains[0], MemOpChains.size());
   2285 
   2286   // Build a sequence of copy-to-reg nodes chained together with token chain
   2287   // and flag operands which copy the outgoing args into registers.
   2288   SDValue InFlag;
    2289   // Tail call byval lowering might overwrite argument registers, so in the case
    2290   // of tail call optimization the copies to registers are lowered later.
   2291   if (!isTailCall)
   2292     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2293       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2294                                RegsToPass[i].second, InFlag);
   2295       InFlag = Chain.getValue(1);
   2296     }
   2297 
   2298   if (Subtarget->isPICStyleGOT()) {
    2299     // ELF / PIC requires the GOT pointer to be in the EBX register before
    2300     // making function calls via the PLT.
   2301     if (!isTailCall) {
   2302       Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
   2303                                DAG.getNode(X86ISD::GlobalBaseReg,
   2304                                            DebugLoc(), getPointerTy()),
   2305                                InFlag);
   2306       InFlag = Chain.getValue(1);
   2307     } else {
   2308       // If we are tail calling and generating PIC/GOT style code load the
   2309       // address of the callee into ECX. The value in ecx is used as target of
   2310       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   2311       // for tail calls on PIC/GOT architectures. Normally we would just put the
   2312       // address of GOT into ebx and then call target@PLT. But for tail calls
   2313       // ebx would be restored (since ebx is callee saved) before jumping to the
   2314       // target@PLT.
   2315 
   2316       // Note: The actual moving to ECX is done further down.
   2317       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   2318       if (G && !G->getGlobal()->hasHiddenVisibility() &&
   2319           !G->getGlobal()->hasProtectedVisibility())
   2320         Callee = LowerGlobalAddress(Callee, DAG);
   2321       else if (isa<ExternalSymbolSDNode>(Callee))
   2322         Callee = LowerExternalSymbol(Callee, DAG);
   2323     }
   2324   }
   2325 
   2326   if (Is64Bit && isVarArg && !IsWin64) {
   2327     // From AMD64 ABI document:
   2328     // For calls that may call functions that use varargs or stdargs
   2329     // (prototype-less calls or calls to functions containing ellipsis (...) in
   2330     // the declaration) %al is used as hidden argument to specify the number
   2331     // of SSE registers used. The contents of %al do not need to match exactly
    2332     // the number of registers, but must be an upper bound on the number of SSE
   2333     // registers used and is in the range 0 - 8 inclusive.
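             // For example, a call like printf("%f %f", A, B) uses two XMM registers
             // for the floating-point arguments, so the code below ends up materializing
             // the constant 2 into %al (e.g. "movb $2, %al") right before the call.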
   2334 
   2335     // Count the number of XMM registers allocated.
   2336     static const uint16_t XMMArgRegs[] = {
   2337       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2338       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2339     };
   2340     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
   2341     assert((Subtarget->hasSSE1() || !NumXMMRegs)
   2342            && "SSE registers cannot be used when SSE is disabled");
   2343 
   2344     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
   2345                              DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
   2346     InFlag = Chain.getValue(1);
   2347   }
   2348 
   2349 
   2350   // For tail calls lower the arguments to the 'real' stack slot.
   2351   if (isTailCall) {
   2352     // Force all the incoming stack arguments to be loaded from the stack
   2353     // before any new outgoing arguments are stored to the stack, because the
   2354     // outgoing stack slots may alias the incoming argument stack slots, and
   2355     // the alias isn't otherwise explicit. This is slightly more conservative
   2356     // than necessary, because it means that each store effectively depends
   2357     // on every argument instead of just those arguments it would clobber.
   2358     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   2359 
   2360     SmallVector<SDValue, 8> MemOpChains2;
   2361     SDValue FIN;
   2362     int FI = 0;
   2363     // Do not flag preceding copytoreg stuff together with the following stuff.
   2364     InFlag = SDValue();
   2365     if (getTargetMachine().Options.GuaranteedTailCallOpt) {
   2366       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2367         CCValAssign &VA = ArgLocs[i];
   2368         if (VA.isRegLoc())
   2369           continue;
   2370         assert(VA.isMemLoc());
   2371         SDValue Arg = OutVals[i];
   2372         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2373         // Create frame index.
   2374         int32_t Offset = VA.getLocMemOffset()+FPDiff;
   2375         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   2376         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   2377         FIN = DAG.getFrameIndex(FI, getPointerTy());
   2378 
   2379         if (Flags.isByVal()) {
   2380           // Copy relative to framepointer.
   2381           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
   2382           if (StackPtr.getNode() == 0)
   2383             StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
   2384                                           getPointerTy());
   2385           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
   2386 
   2387           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   2388                                                            ArgChain,
   2389                                                            Flags, DAG, dl));
   2390         } else {
   2391           // Store relative to framepointer.
   2392           MemOpChains2.push_back(
   2393             DAG.getStore(ArgChain, dl, Arg, FIN,
   2394                          MachinePointerInfo::getFixedStack(FI),
   2395                          false, false, 0));
   2396         }
   2397       }
   2398     }
   2399 
   2400     if (!MemOpChains2.empty())
   2401       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2402                           &MemOpChains2[0], MemOpChains2.size());
   2403 
   2404     // Copy arguments to their registers.
   2405     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2406       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2407                                RegsToPass[i].second, InFlag);
   2408       InFlag = Chain.getValue(1);
   2409     }
    2410     InFlag = SDValue();
   2411 
   2412     // Store the return address to the appropriate stack slot.
   2413     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
   2414                                      FPDiff, dl);
   2415   }
   2416 
   2417   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
   2418     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   2419     // In the 64-bit large code model, we have to make all calls
   2420     // through a register, since the call instruction's 32-bit
   2421     // pc-relative offset may not be large enough to hold the whole
   2422     // address.
   2423   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   2424     // If the callee is a GlobalAddress node (quite common, every direct call
   2425     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   2426     // it.
   2427 
    2428     // We should use an extra load for direct calls to dllimported functions in
   2429     // non-JIT mode.
   2430     const GlobalValue *GV = G->getGlobal();
   2431     if (!GV->hasDLLImportLinkage()) {
   2432       unsigned char OpFlags = 0;
   2433       bool ExtraLoad = false;
   2434       unsigned WrapperKind = ISD::DELETED_NODE;
   2435 
   2436       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
    2437       // external symbols must go through the PLT in PIC mode.  If the symbol
   2438       // has hidden or protected visibility, or if it is static or local, then
   2439       // we don't need to use the PLT - we can directly call it.
   2440       if (Subtarget->isTargetELF() &&
   2441           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   2442           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
   2443         OpFlags = X86II::MO_PLT;
   2444       } else if (Subtarget->isPICStyleStubAny() &&
   2445                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
   2446                  (!Subtarget->getTargetTriple().isMacOSX() ||
   2447                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2448         // PC-relative references to external symbols should go through $stub,
   2449         // unless we're building with the leopard linker or later, which
   2450         // automatically synthesizes these stubs.
   2451         OpFlags = X86II::MO_DARWIN_STUB;
   2452       } else if (Subtarget->isPICStyleRIPRel() &&
   2453                  isa<Function>(GV) &&
   2454                  cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
   2455         // If the function is marked as non-lazy, generate an indirect call
   2456         // which loads from the GOT directly. This avoids runtime overhead
   2457         // at the cost of eager binding (and one extra byte of encoding).
   2458         OpFlags = X86II::MO_GOTPCREL;
   2459         WrapperKind = X86ISD::WrapperRIP;
   2460         ExtraLoad = true;
   2461       }
   2462 
   2463       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
   2464                                           G->getOffset(), OpFlags);
   2465 
   2466       // Add a wrapper if needed.
   2467       if (WrapperKind != ISD::DELETED_NODE)
   2468         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
   2469       // Add extra indirection if needed.
   2470       if (ExtraLoad)
   2471         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
   2472                              MachinePointerInfo::getGOT(),
   2473                              false, false, false, 0);
   2474     }
   2475   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   2476     unsigned char OpFlags = 0;
   2477 
   2478     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
   2479     // external symbols should go through the PLT.
   2480     if (Subtarget->isTargetELF() &&
   2481         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
   2482       OpFlags = X86II::MO_PLT;
   2483     } else if (Subtarget->isPICStyleStubAny() &&
   2484                (!Subtarget->getTargetTriple().isMacOSX() ||
   2485                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2486       // PC-relative references to external symbols should go through $stub,
   2487       // unless we're building with the leopard linker or later, which
   2488       // automatically synthesizes these stubs.
   2489       OpFlags = X86II::MO_DARWIN_STUB;
   2490     }
   2491 
   2492     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
   2493                                          OpFlags);
   2494   }
   2495 
   2496   // Returns a chain & a flag for retval copy to use.
   2497   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   2498   SmallVector<SDValue, 8> Ops;
   2499 
   2500   if (!IsSibcall && isTailCall) {
   2501     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
   2502                            DAG.getIntPtrConstant(0, true), InFlag);
   2503     InFlag = Chain.getValue(1);
   2504   }
   2505 
   2506   Ops.push_back(Chain);
   2507   Ops.push_back(Callee);
   2508 
   2509   if (isTailCall)
   2510     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
   2511 
   2512   // Add argument registers to the end of the list so that they are known live
   2513   // into the call.
   2514   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   2515     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   2516                                   RegsToPass[i].second.getValueType()));
   2517 
    2518   // Add an implicit use of the GOT pointer in EBX.
   2519   if (!isTailCall && Subtarget->isPICStyleGOT())
   2520     Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
   2521 
   2522   // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
   2523   if (Is64Bit && isVarArg && !IsWin64)
   2524     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
   2525 
   2526   // Add a register mask operand representing the call-preserved registers.
   2527   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   2528   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   2529   assert(Mask && "Missing call preserved mask for calling convention");
   2530   Ops.push_back(DAG.getRegisterMask(Mask));
   2531 
   2532   if (InFlag.getNode())
   2533     Ops.push_back(InFlag);
   2534 
   2535   if (isTailCall) {
   2536     // We used to do:
   2537     //// If this is the first return lowered for this function, add the regs
   2538     //// to the liveout set for the function.
   2539     // This isn't right, although it's probably harmless on x86; liveouts
   2540     // should be computed from returns not tail calls.  Consider a void
   2541     // function making a tail call to a function returning int.
   2542     return DAG.getNode(X86ISD::TC_RETURN, dl,
   2543                        NodeTys, &Ops[0], Ops.size());
   2544   }
   2545 
   2546   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
   2547   InFlag = Chain.getValue(1);
   2548 
   2549   // Create the CALLSEQ_END node.
   2550   unsigned NumBytesForCalleeToPush;
   2551   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2552                        getTargetMachine().Options.GuaranteedTailCallOpt))
   2553     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
   2554   else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
   2555            IsStructRet)
   2556     // If this is a call to a struct-return function, the callee
   2557     // pops the hidden struct pointer, so we have to push it back.
   2558     // This is common for Darwin/X86, Linux & Mingw32 targets.
   2559     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   2560     NumBytesForCalleeToPush = 4;
   2561   else
   2562     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
   2563 
   2564   // Returns a flag for retval copy to use.
   2565   if (!IsSibcall) {
   2566     Chain = DAG.getCALLSEQ_END(Chain,
   2567                                DAG.getIntPtrConstant(NumBytes, true),
   2568                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
   2569                                                      true),
   2570                                InFlag);
   2571     InFlag = Chain.getValue(1);
   2572   }
   2573 
   2574   // Handle result values, copying them out of physregs into vregs that we
   2575   // return.
   2576   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   2577                          Ins, dl, DAG, InVals);
   2578 }
   2579 
   2580 
   2581 //===----------------------------------------------------------------------===//
   2582 //                Fast Calling Convention (tail call) implementation
   2583 //===----------------------------------------------------------------------===//
   2584 
    2585 //  Like stdcall, the callee cleans up the arguments, except that ECX is
    2586 //  reserved for storing the address of the tail-called function. Only 2 registers are
   2587 //  free for argument passing (inreg). Tail call optimization is performed
   2588 //  provided:
   2589 //                * tailcallopt is enabled
   2590 //                * caller/callee are fastcc
   2591 //  On X86_64 architecture with GOT-style position independent code only local
   2592 //  (within module) calls are supported at the moment.
    2593 //  To keep the stack aligned according to the platform ABI, the function
    2594 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
    2595 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
    2596 //  for example.) If a tail-called callee has more arguments than the caller,
    2597 //  the caller needs to make sure that there is room to move the RETADDR to.
    2598 //  This is achieved by reserving an area the size of the argument delta right
    2599 //  after the original RETADDR, but before the saved frame pointer or the
    2600 //  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
   2601 //  stack layout:
   2602 //    arg1
   2603 //    arg2
   2604 //    RETADDR
   2605 //    [ new RETADDR
   2606 //      move area ]
   2607 //    (possible EBP)
   2608 //    ESI
   2609 //    EDI
   2610 //    local1 ..
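         //  For instance, if the caller's arguments occupy 8 bytes but the callee's need
         //  16, FPDiff is -8 and the return address is moved 8 bytes further down into
         //  the reserved move area before jumping to the callee.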
   2611 
    2612 /// GetAlignedArgumentStackSize - Round the stack size up so that it stays
    2613 /// aligned, e.g. to 16n + 12 for a 16-byte alignment requirement.
   2614 unsigned
   2615 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   2616                                                SelectionDAG& DAG) const {
   2617   MachineFunction &MF = DAG.getMachineFunction();
   2618   const TargetMachine &TM = MF.getTarget();
   2619   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   2620   unsigned StackAlignment = TFI.getStackAlignment();
   2621   uint64_t AlignMask = StackAlignment - 1;
   2622   int64_t Offset = StackSize;
   2623   uint64_t SlotSize = TD->getPointerSize();
   2624   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    2625     // The low bits are at most (StackAlignment - SlotSize), so just add the difference.
   2626     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   2627   } else {
    2628     // Mask out the low bits, then add one StackAlignment plus (StackAlignment - SlotSize).
   2629     Offset = ((~AlignMask) & Offset) + StackAlignment +
   2630       (StackAlignment-SlotSize);
   2631   }
   2632   return Offset;
   2633 }
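
         // Worked example of the computation above, assuming StackAlignment = 16 and
         // SlotSize = 4 (a 32-bit target): StackSize = 20 has low bits 4 <= 12, so it
         // becomes 20 + (12 - 4) = 28; StackSize = 30 has low bits 14 > 12, so it
         // becomes (30 & ~15) + 16 + 12 = 44. Both results are of the form 16n + 12,
         // i.e. the stack is 16-byte aligned again once the return address is pushed.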
   2634 
    2635 /// MatchingStackOffset - Return true if the given stack call argument is
    2636 /// already available at the same (relative) position in the caller's
    2637 /// incoming argument stack.
   2638 static
   2639 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   2640                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   2641                          const X86InstrInfo *TII) {
   2642   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   2643   int FI = INT_MAX;
   2644   if (Arg.getOpcode() == ISD::CopyFromReg) {
   2645     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   2646     if (!TargetRegisterInfo::isVirtualRegister(VR))
   2647       return false;
   2648     MachineInstr *Def = MRI->getVRegDef(VR);
   2649     if (!Def)
   2650       return false;
   2651     if (!Flags.isByVal()) {
   2652       if (!TII->isLoadFromStackSlot(Def, FI))
   2653         return false;
   2654     } else {
   2655       unsigned Opcode = Def->getOpcode();
   2656       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
   2657           Def->getOperand(1).isFI()) {
   2658         FI = Def->getOperand(1).getIndex();
   2659         Bytes = Flags.getByValSize();
   2660       } else
   2661         return false;
   2662     }
   2663   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   2664     if (Flags.isByVal())
   2665       // ByVal argument is passed in as a pointer but it's now being
   2666       // dereferenced. e.g.
   2667       // define @foo(%struct.X* %A) {
   2668       //   tail call @bar(%struct.X* byval %A)
   2669       // }
   2670       return false;
   2671     SDValue Ptr = Ld->getBasePtr();
   2672     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   2673     if (!FINode)
   2674       return false;
   2675     FI = FINode->getIndex();
   2676   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   2677     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   2678     FI = FINode->getIndex();
   2679     Bytes = Flags.getByValSize();
   2680   } else
   2681     return false;
   2682 
   2683   assert(FI != INT_MAX);
   2684   if (!MFI->isFixedObjectIndex(FI))
   2685     return false;
   2686   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
   2687 }
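
         // As an illustration, in a sibcall like
         //   define i32 @caller(i32 %x, i32 %y) {
         //     %r = tail call i32 @callee(i32 %x, i32 %y)
         //     ret i32 %r
         //   }
         // on 32-bit targets %x and %y are loaded from the caller's fixed incoming
         // stack slots and passed at the same offsets, so MatchingStackOffset returns
         // true for them and the call can be emitted as a sibcall without copying
         // the arguments.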
   2688 
   2689 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
   2690 /// for tail call optimization. Targets which want to do tail call
   2691 /// optimization should implement this function.
   2692 bool
   2693 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   2694                                                      CallingConv::ID CalleeCC,
   2695                                                      bool isVarArg,
   2696                                                      bool isCalleeStructRet,
   2697                                                      bool isCallerStructRet,
   2698                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
   2699                                     const SmallVectorImpl<SDValue> &OutVals,
   2700                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2701                                                      SelectionDAG& DAG) const {
   2702   if (!IsTailCallConvention(CalleeCC) &&
   2703       CalleeCC != CallingConv::C)
   2704     return false;
   2705 
   2706   // If -tailcallopt is specified, make fastcc functions tail-callable.
   2707   const MachineFunction &MF = DAG.getMachineFunction();
   2708   const Function *CallerF = DAG.getMachineFunction().getFunction();
   2709   CallingConv::ID CallerCC = CallerF->getCallingConv();
   2710   bool CCMatch = CallerCC == CalleeCC;
   2711 
   2712   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
   2713     if (IsTailCallConvention(CalleeCC) && CCMatch)
   2714       return true;
   2715     return false;
   2716   }
   2717 
   2718   // Look for obvious safe cases to perform tail call optimization that do not
   2719   // require ABI changes. This is what gcc calls sibcall.
   2720 
   2721   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   2722   // emit a special epilogue.
   2723   if (RegInfo->needsStackRealignment(MF))
   2724     return false;
   2725 
   2726   // Also avoid sibcall optimization if either caller or callee uses struct
   2727   // return semantics.
   2728   if (isCalleeStructRet || isCallerStructRet)
   2729     return false;
   2730 
    2731   // A stdcall caller is expected to clean up its arguments; the callee
   2732   // isn't going to do that.
   2733   if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
   2734     return false;
   2735 
   2736   // Do not sibcall optimize vararg calls unless all arguments are passed via
   2737   // registers.
   2738   if (isVarArg && !Outs.empty()) {
   2739 
   2740     // Optimizing for varargs on Win64 is unlikely to be safe without
   2741     // additional testing.
   2742     if (Subtarget->isTargetWin64())
   2743       return false;
   2744 
   2745     SmallVector<CCValAssign, 16> ArgLocs;
   2746     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
    2747                    getTargetMachine(), ArgLocs, *DAG.getContext());
   2748 
   2749     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2750     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   2751       if (!ArgLocs[i].isRegLoc())
   2752         return false;
   2753   }
   2754 
   2755   // If the call result is in ST0 / ST1, it needs to be popped off the x87
    2756   // stack.  Therefore, if it's not used by the caller it is not safe to optimize
    2757   // this into a sibcall.
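           // For instance (illustrative IR), a caller that does
           //   %r = tail call double @callee()
           // and never uses %r cannot be sibcall-optimized on x87 targets, because the
           // result would be left on the FP stack.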
   2758   bool Unused = false;
   2759   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   2760     if (!Ins[i].Used) {
   2761       Unused = true;
   2762       break;
   2763     }
   2764   }
   2765   if (Unused) {
   2766     SmallVector<CCValAssign, 16> RVLocs;
   2767     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
    2768                    getTargetMachine(), RVLocs, *DAG.getContext());
   2769     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2770     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2771       CCValAssign &VA = RVLocs[i];
   2772       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
   2773         return false;
   2774     }
   2775   }
   2776 
   2777   // If the calling conventions do not match, then we'd better make sure the
   2778   // results are returned in the same way as what the caller expects.
   2779   if (!CCMatch) {
   2780     SmallVector<CCValAssign, 16> RVLocs1;
   2781     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
    2782                     getTargetMachine(), RVLocs1, *DAG.getContext());
   2783     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
   2784 
   2785     SmallVector<CCValAssign, 16> RVLocs2;
   2786     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
    2787                     getTargetMachine(), RVLocs2, *DAG.getContext());
   2788     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
   2789 
   2790     if (RVLocs1.size() != RVLocs2.size())
   2791       return false;
   2792     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   2793       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   2794         return false;
   2795       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   2796         return false;
   2797       if (RVLocs1[i].isRegLoc()) {
   2798         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   2799           return false;
   2800       } else {
   2801         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   2802           return false;
   2803       }
   2804     }
   2805   }
   2806 
   2807   // If the callee takes no arguments then go on to check the results of the
   2808   // call.
   2809   if (!Outs.empty()) {
   2810     // Check if stack adjustment is needed. For now, do not do this if any
   2811     // argument is passed on the stack.
   2812     SmallVector<CCValAssign, 16> ArgLocs;
   2813     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
    2814                    getTargetMachine(), ArgLocs, *DAG.getContext());
   2815 
   2816     // Allocate shadow area for Win64
   2817     if (Subtarget->isTargetWin64()) {
   2818       CCInfo.AllocateStack(32, 8);
   2819     }
   2820 
   2821     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2822     if (CCInfo.getNextStackOffset()) {
   2823       MachineFunction &MF = DAG.getMachineFunction();
   2824       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
   2825         return false;
   2826 
   2827       // Check if the arguments are already laid out in the right way as
   2828       // the caller's fixed stack objects.
   2829       MachineFrameInfo *MFI = MF.getFrameInfo();
   2830       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   2831       const X86InstrInfo *TII =
   2832         ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
   2833       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2834         CCValAssign &VA = ArgLocs[i];
   2835         SDValue Arg = OutVals[i];
   2836         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2837         if (VA.getLocInfo() == CCValAssign::Indirect)
   2838           return false;
   2839         if (!VA.isRegLoc()) {
   2840           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   2841                                    MFI, MRI, TII))
   2842             return false;
   2843         }
   2844       }
   2845     }
   2846 
   2847     // If the tailcall address may be in a register, then make sure it's
   2848     // possible to register allocate for it. In 32-bit, the call address can
   2849     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   2850     // callee-saved registers are restored. These happen to be the same
   2851     // registers used to pass 'inreg' arguments so watch out for those.
   2852     if (!Subtarget->is64Bit() &&
   2853         !isa<GlobalAddressSDNode>(Callee) &&
   2854         !isa<ExternalSymbolSDNode>(Callee)) {
   2855       unsigned NumInRegs = 0;
   2856       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2857         CCValAssign &VA = ArgLocs[i];
   2858         if (!VA.isRegLoc())
   2859           continue;
   2860         unsigned Reg = VA.getLocReg();
   2861         switch (Reg) {
   2862         default: break;
   2863         case X86::EAX: case X86::EDX: case X86::ECX:
   2864           if (++NumInRegs == 3)
   2865             return false;
   2866           break;
   2867         }
   2868       }
   2869     }
   2870   }
   2871 
   2872   return true;
   2873 }
   2874 
   2875 FastISel *
   2876 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
   2877   return X86::createFastISel(funcInfo);
   2878 }
   2879 
   2880 
   2881 //===----------------------------------------------------------------------===//
   2882 //                           Other Lowering Hooks
   2883 //===----------------------------------------------------------------------===//
   2884 
   2885 static bool MayFoldLoad(SDValue Op) {
   2886   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   2887 }
   2888 
   2889 static bool MayFoldIntoStore(SDValue Op) {
   2890   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   2891 }
   2892 
   2893 static bool isTargetShuffle(unsigned Opcode) {
   2894   switch(Opcode) {
   2895   default: return false;
   2896   case X86ISD::PSHUFD:
   2897   case X86ISD::PSHUFHW:
   2898   case X86ISD::PSHUFLW:
   2899   case X86ISD::SHUFP:
   2900   case X86ISD::PALIGN:
   2901   case X86ISD::MOVLHPS:
   2902   case X86ISD::MOVLHPD:
   2903   case X86ISD::MOVHLPS:
   2904   case X86ISD::MOVLPS:
   2905   case X86ISD::MOVLPD:
   2906   case X86ISD::MOVSHDUP:
   2907   case X86ISD::MOVSLDUP:
   2908   case X86ISD::MOVDDUP:
   2909   case X86ISD::MOVSS:
   2910   case X86ISD::MOVSD:
   2911   case X86ISD::UNPCKL:
   2912   case X86ISD::UNPCKH:
   2913   case X86ISD::VPERMILP:
   2914   case X86ISD::VPERM2X128:
   2915     return true;
   2916   }
   2917 }
   2918 
   2919 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   2920                                     SDValue V1, SelectionDAG &DAG) {
   2921   switch(Opc) {
   2922   default: llvm_unreachable("Unknown x86 shuffle node");
   2923   case X86ISD::MOVSHDUP:
   2924   case X86ISD::MOVSLDUP:
   2925   case X86ISD::MOVDDUP:
   2926     return DAG.getNode(Opc, dl, VT, V1);
   2927   }
   2928 }
   2929 
   2930 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   2931                                     SDValue V1, unsigned TargetMask,
   2932                                     SelectionDAG &DAG) {
   2933   switch(Opc) {
   2934   default: llvm_unreachable("Unknown x86 shuffle node");
   2935   case X86ISD::PSHUFD:
   2936   case X86ISD::PSHUFHW:
   2937   case X86ISD::PSHUFLW:
   2938   case X86ISD::VPERMILP:
   2939   case X86ISD::VPERMI:
   2940     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   2941   }
   2942 }
   2943 
   2944 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   2945                                     SDValue V1, SDValue V2, unsigned TargetMask,
   2946                                     SelectionDAG &DAG) {
   2947   switch(Opc) {
   2948   default: llvm_unreachable("Unknown x86 shuffle node");
   2949   case X86ISD::PALIGN:
   2950   case X86ISD::SHUFP:
   2951   case X86ISD::VPERM2X128:
   2952     return DAG.getNode(Opc, dl, VT, V1, V2,
   2953                        DAG.getConstant(TargetMask, MVT::i8));
   2954   }
   2955 }
   2956 
   2957 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   2958                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   2959   switch(Opc) {
   2960   default: llvm_unreachable("Unknown x86 shuffle node");
   2961   case X86ISD::MOVLHPS:
   2962   case X86ISD::MOVLHPD:
   2963   case X86ISD::MOVHLPS:
   2964   case X86ISD::MOVLPS:
   2965   case X86ISD::MOVLPD:
   2966   case X86ISD::MOVSS:
   2967   case X86ISD::MOVSD:
   2968   case X86ISD::UNPCKL:
   2969   case X86ISD::UNPCKH:
   2970     return DAG.getNode(Opc, dl, VT, V1, V2);
   2971   }
   2972 }
   2973 
   2974 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   2975   MachineFunction &MF = DAG.getMachineFunction();
   2976   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2977   int ReturnAddrIndex = FuncInfo->getRAIndex();
   2978 
   2979   if (ReturnAddrIndex == 0) {
   2980     // Set up a frame object for the return address.
   2981     uint64_t SlotSize = TD->getPointerSize();
   2982     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
   2983                                                            false);
   2984     FuncInfo->setRAIndex(ReturnAddrIndex);
   2985   }
   2986 
   2987   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
   2988 }
   2989 
   2990 
   2991 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   2992                                        bool hasSymbolicDisplacement) {
   2993   // Offset should fit into 32 bit immediate field.
   2994   if (!isInt<32>(Offset))
   2995     return false;
   2996 
   2997   // If we don't have a symbolic displacement - we don't have any extra
   2998   // restrictions.
   2999   if (!hasSymbolicDisplacement)
   3000     return true;
   3001 
   3002   // FIXME: Some tweaks might be needed for medium code model.
   3003   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3004     return false;
   3005 
    3006   // For the small code model we assume that the last object ends at least 16MB
    3007   // before the 31-bit boundary. We may also accept pretty large negative offsets,
    3008   // knowing that all objects are in the positive half of the address space.
   3009   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3010     return true;
   3011 
    3012   // For the kernel code model we know that all objects reside in the negative
    3013   // half of the 32-bit address space. We must not accept negative offsets, since
    3014   // they may fall just outside that half, but we may accept pretty large positive ones.
   3015   if (M == CodeModel::Kernel && Offset > 0)
   3016     return true;
   3017 
   3018   return false;
   3019 }
   3020 
   3021 /// isCalleePop - Determines whether the callee is required to pop its
   3022 /// own arguments. Callee pop is necessary to support tail calls.
   3023 bool X86::isCalleePop(CallingConv::ID CallingConv,
   3024                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
   3025   if (IsVarArg)
   3026     return false;
   3027 
   3028   switch (CallingConv) {
   3029   default:
   3030     return false;
   3031   case CallingConv::X86_StdCall:
   3032     return !is64Bit;
   3033   case CallingConv::X86_FastCall:
   3034     return !is64Bit;
   3035   case CallingConv::X86_ThisCall:
   3036     return !is64Bit;
   3037   case CallingConv::Fast:
   3038     return TailCallOpt;
   3039   case CallingConv::GHC:
   3040     return TailCallOpt;
   3041   }
   3042 }
   3043 
    3044 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
   3045 /// specific condition code, returning the condition code and the LHS/RHS of the
   3046 /// comparison to make.
   3047 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
   3048                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
   3049   if (!isFP) {
   3050     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   3051       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
   3052         // X > -1   -> X == 0, jump !sign.
   3053         RHS = DAG.getConstant(0, RHS.getValueType());
   3054         return X86::COND_NS;
   3055       } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
   3056         // X < 0   -> X == 0, jump on sign.
   3057         return X86::COND_S;
   3058       } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   3059         // X < 1   -> X <= 0
   3060         RHS = DAG.getConstant(0, RHS.getValueType());
   3061         return X86::COND_LE;
   3062       }
   3063     }
   3064 
   3065     switch (SetCCOpcode) {
   3066     default: llvm_unreachable("Invalid integer condition!");
   3067     case ISD::SETEQ:  return X86::COND_E;
   3068     case ISD::SETGT:  return X86::COND_G;
   3069     case ISD::SETGE:  return X86::COND_GE;
   3070     case ISD::SETLT:  return X86::COND_L;
   3071     case ISD::SETLE:  return X86::COND_LE;
   3072     case ISD::SETNE:  return X86::COND_NE;
   3073     case ISD::SETULT: return X86::COND_B;
   3074     case ISD::SETUGT: return X86::COND_A;
   3075     case ISD::SETULE: return X86::COND_BE;
   3076     case ISD::SETUGE: return X86::COND_AE;
   3077     }
   3078   }
   3079 
   3080   // First determine if it is required or is profitable to flip the operands.
   3081 
   3082   // If LHS is a foldable load, but RHS is not, flip the condition.
   3083   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   3084       !ISD::isNON_EXTLoad(RHS.getNode())) {
   3085     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   3086     std::swap(LHS, RHS);
   3087   }
   3088 
   3089   switch (SetCCOpcode) {
   3090   default: break;
   3091   case ISD::SETOLT:
   3092   case ISD::SETOLE:
   3093   case ISD::SETUGT:
   3094   case ISD::SETUGE:
   3095     std::swap(LHS, RHS);
   3096     break;
   3097   }
   3098 
   3099   // On a floating point condition, the flags are set as follows:
   3100   // ZF  PF  CF   op
   3101   //  0 | 0 | 0 | X > Y
   3102   //  0 | 0 | 1 | X < Y
   3103   //  1 | 0 | 0 | X == Y
   3104   //  1 | 1 | 1 | unordered
   3105   switch (SetCCOpcode) {
   3106   default: llvm_unreachable("Condcode should be pre-legalized away");
   3107   case ISD::SETUEQ:
   3108   case ISD::SETEQ:   return X86::COND_E;
   3109   case ISD::SETOLT:              // flipped
   3110   case ISD::SETOGT:
   3111   case ISD::SETGT:   return X86::COND_A;
   3112   case ISD::SETOLE:              // flipped
   3113   case ISD::SETOGE:
   3114   case ISD::SETGE:   return X86::COND_AE;
   3115   case ISD::SETUGT:              // flipped
   3116   case ISD::SETULT:
   3117   case ISD::SETLT:   return X86::COND_B;
   3118   case ISD::SETUGE:              // flipped
   3119   case ISD::SETULE:
   3120   case ISD::SETLE:   return X86::COND_BE;
   3121   case ISD::SETONE:
   3122   case ISD::SETNE:   return X86::COND_NE;
   3123   case ISD::SETUO:   return X86::COND_P;
   3124   case ISD::SETO:    return X86::COND_NP;
   3125   case ISD::SETOEQ:
   3126   case ISD::SETUNE:  return X86::COND_INVALID;
   3127   }
   3128 }
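
         // For example, an ordered "x < y" (ISD::SETOLT) is handled above by swapping
         // the operands and returning X86::COND_A: after comparing (y, x) with
         // ucomiss/ucomisd, "above" (CF==0 and ZF==0) holds exactly when y > x and
         // neither operand is NaN.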
   3129 
    3130 /// hasFPCMov - Is there a floating point cmov for the specific X86 condition
    3131 /// code? The current x86 ISA includes the following FP cmov instructions:
    3132 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   3133 static bool hasFPCMov(unsigned X86CC) {
   3134   switch (X86CC) {
   3135   default:
   3136     return false;
   3137   case X86::COND_B:
   3138   case X86::COND_BE:
   3139   case X86::COND_E:
   3140   case X86::COND_P:
   3141   case X86::COND_A:
   3142   case X86::COND_AE:
   3143   case X86::COND_NE:
   3144   case X86::COND_NP:
   3145     return true;
   3146   }
   3147 }
   3148 
   3149 /// isFPImmLegal - Returns true if the target can instruction select the
   3150 /// specified FP immediate natively. If false, the legalizer will
   3151 /// materialize the FP immediate as a load from a constant pool.
   3152 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   3153   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   3154     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   3155       return true;
   3156   }
   3157   return false;
   3158 }
   3159 
   3160 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
    3161 /// the specified range [Low, Hi).
   3162 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   3163   return (Val < 0) || (Val >= Low && Val < Hi);
   3164 }
   3165 
   3166 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
   3167 /// specified value.
   3168 static bool isUndefOrEqual(int Val, int CmpVal) {
   3169   if (Val < 0 || Val == CmpVal)
   3170     return true;
   3171   return false;
   3172 }
   3173 
    3174 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
    3175 /// at position Pos and ending at Pos+Size, falls within the specified
    3176 /// sequential range [Low, Low+Size), or is undef.
   3177 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   3178                                        int Pos, int Size, int Low) {
   3179   for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   3180     if (!isUndefOrEqual(Mask[i], Low))
   3181       return false;
   3182   return true;
   3183 }
   3184 
   3185 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
   3186 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
   3187 /// the second operand.
   3188 static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
   3189   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
   3190     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   3191   if (VT == MVT::v2f64 || VT == MVT::v2i64)
   3192     return (Mask[0] < 2 && Mask[1] < 2);
   3193   return false;
   3194 }
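
         // For example, for v4i32 the mask <2,3,0,1> is a valid PSHUFD mask: it swaps the
         // two 64-bit halves and corresponds to the immediate 0x4E (0b01001110).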
   3195 
   3196 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
   3197 /// is suitable for input to PSHUFHW.
   3198 static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
   3199   if (VT != MVT::v8i16)
   3200     return false;
   3201 
   3202   // Lower quadword copied in order or undef.
   3203   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
   3204     return false;
   3205 
   3206   // Upper quadword shuffled.
   3207   for (unsigned i = 4; i != 8; ++i)
   3208     if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
   3209       return false;
   3210 
   3211   return true;
   3212 }
   3213 
   3214 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
   3215 /// is suitable for input to PSHUFLW.
   3216 static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
   3217   if (VT != MVT::v8i16)
   3218     return false;
   3219 
   3220   // Upper quadword copied in order.
   3221   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
   3222     return false;
   3223 
   3224   // Lower quadword shuffled.
   3225   for (unsigned i = 0; i != 4; ++i)
   3226     if (Mask[i] >= 4)
   3227       return false;
   3228 
   3229   return true;
   3230 }
   3231 
   3232 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
   3233 /// is suitable for input to PALIGNR.
   3234 static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
   3235                           const X86Subtarget *Subtarget) {
   3236   if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
   3237       (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
   3238     return false;
   3239 
   3240   unsigned NumElts = VT.getVectorNumElements();
   3241   unsigned NumLanes = VT.getSizeInBits()/128;
   3242   unsigned NumLaneElts = NumElts/NumLanes;
   3243 
   3244   // Do not handle 64-bit element shuffles with palignr.
   3245   if (NumLaneElts == 2)
   3246     return false;
   3247 
   3248   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
   3249     unsigned i;
   3250     for (i = 0; i != NumLaneElts; ++i) {
   3251       if (Mask[i+l] >= 0)
   3252         break;
   3253     }
   3254 
   3255     // Lane is all undef, go to next lane
   3256     if (i == NumLaneElts)
   3257       continue;
   3258 
   3259     int Start = Mask[i+l];
   3260 
    3261     // Make sure it's in this lane in one of the sources
   3262     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
   3263         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
   3264       return false;
   3265 
   3266     // If not lane 0, then we must match lane 0
   3267     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
   3268       return false;
   3269 
   3270     // Correct second source to be contiguous with first source
   3271     if (Start >= (int)NumElts)
   3272       Start -= NumElts - NumLaneElts;
   3273 
   3274     // Make sure we're shifting in the right direction.
   3275     if (Start <= (int)(i+l))
   3276       return false;
   3277 
   3278     Start -= i;
   3279 
   3280     // Check the rest of the elements to see if they are consecutive.
   3281     for (++i; i != NumLaneElts; ++i) {
   3282       int Idx = Mask[i+l];
   3283 
    3284       // Make sure it's in this lane
   3285       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
   3286           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
   3287         return false;
   3288 
   3289       // If not lane 0, then we must match lane 0
   3290       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
   3291         return false;
   3292 
   3293       if (Idx >= (int)NumElts)
   3294         Idx -= NumElts - NumLaneElts;
   3295 
   3296       if (!isUndefOrEqual(Idx, Start+i))
   3297         return false;
   3298 
   3299     }
   3300   }
   3301 
   3302   return true;
   3303 }
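
         // For example, for v8i16 the mask <1,2,3,4,5,6,7,8> is accepted: the result is
         // elements 1..7 of the first source followed by element 0 of the second, i.e. a
         // PALIGNR of the concatenated sources by 2 bytes.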
   3304 
   3305 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
   3306 /// the two vector operands have swapped position.
   3307 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
   3308                                      unsigned NumElems) {
   3309   for (unsigned i = 0; i != NumElems; ++i) {
   3310     int idx = Mask[i];
   3311     if (idx < 0)
   3312       continue;
   3313     else if (idx < (int)NumElems)
   3314       Mask[i] = idx + NumElems;
   3315     else
   3316       Mask[i] = idx - NumElems;
   3317   }
   3318 }
   3319 
   3320 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
   3321 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
   3322 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
   3323 /// reverse of what x86 shuffles want.
   3324 static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
   3325                         bool Commuted = false) {
   3326   if (!HasAVX && VT.getSizeInBits() == 256)
   3327     return false;
   3328 
   3329   unsigned NumElems = VT.getVectorNumElements();
   3330   unsigned NumLanes = VT.getSizeInBits()/128;
   3331   unsigned NumLaneElems = NumElems/NumLanes;
   3332 
   3333   if (NumLaneElems != 2 && NumLaneElems != 4)
   3334     return false;
   3335 
   3336   // VSHUFPSY divides the resulting vector into 4 chunks.
    3337   // The sources are also split into 4 chunks, and each destination
   3338   // chunk must come from a different source chunk.
   3339   //
   3340   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
    3341   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
   3342   //
   3343   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
   3344   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
   3345   //
   3346   // VSHUFPDY divides the resulting vector into 4 chunks.
    3347   // The sources are also split into 4 chunks, and each destination
   3348   // chunk must come from a different source chunk.
   3349   //
   3350   //  SRC1 =>      X3       X2       X1       X0
   3351   //  SRC2 =>      Y3       Y2       Y1       Y0
   3352   //
   3353   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   3354   //
   3355   unsigned HalfLaneElems = NumLaneElems/2;
   3356   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
   3357     for (unsigned i = 0; i != NumLaneElems; ++i) {
   3358       int Idx = Mask[i+l];
   3359       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
   3360       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
   3361         return false;
   3362       // For VSHUFPSY, the mask of the second half must be the same as the
   3363       // first but with the appropriate offsets. This works in the same way as
   3364       // VPERMILPS works with masks.
   3365       if (NumElems != 8 || l == 0 || Mask[i] < 0)
   3366         continue;
   3367       if (!isUndefOrEqual(Idx, Mask[i]+l))
   3368         return false;
   3369     }
   3370   }
   3371 
   3372   return true;
   3373 }
   3374 
   3375 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3376 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
   3377 static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
   3378   unsigned NumElems = VT.getVectorNumElements();
   3379 
   3380   if (VT.getSizeInBits() != 128)
   3381     return false;
   3382 
   3383   if (NumElems != 4)
   3384     return false;
   3385 
   3386   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
   3387   return isUndefOrEqual(Mask[0], 6) &&
   3388          isUndefOrEqual(Mask[1], 7) &&
   3389          isUndefOrEqual(Mask[2], 2) &&
   3390          isUndefOrEqual(Mask[3], 3);
   3391 }
   3392 
   3393 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
   3394 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
   3395 /// <2, 3, 2, 3>
   3396 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
   3397   unsigned NumElems = VT.getVectorNumElements();
   3398 
   3399   if (VT.getSizeInBits() != 128)
   3400     return false;
   3401 
   3402   if (NumElems != 4)
   3403     return false;
   3404 
   3405   return isUndefOrEqual(Mask[0], 2) &&
   3406          isUndefOrEqual(Mask[1], 3) &&
   3407          isUndefOrEqual(Mask[2], 2) &&
   3408          isUndefOrEqual(Mask[3], 3);
   3409 }
   3410 
   3411 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
   3412 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
   3413 static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
   3414   if (VT.getSizeInBits() != 128)
   3415     return false;
   3416 
   3417   unsigned NumElems = VT.getVectorNumElements();
   3418 
   3419   if (NumElems != 2 && NumElems != 4)
   3420     return false;
   3421 
   3422   for (unsigned i = 0; i != NumElems/2; ++i)
   3423     if (!isUndefOrEqual(Mask[i], i + NumElems))
   3424       return false;
   3425 
   3426   for (unsigned i = NumElems/2; i != NumElems; ++i)
   3427     if (!isUndefOrEqual(Mask[i], i))
   3428       return false;
   3429 
   3430   return true;
   3431 }
   3432 
   3433 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3434 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
   3435 static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
   3436   unsigned NumElems = VT.getVectorNumElements();
   3437 
   3438   if ((NumElems != 2 && NumElems != 4)
   3439       || VT.getSizeInBits() > 128)
   3440     return false;
   3441 
   3442   for (unsigned i = 0; i != NumElems/2; ++i)
   3443     if (!isUndefOrEqual(Mask[i], i))
   3444       return false;
   3445 
   3446   for (unsigned i = 0; i != NumElems/2; ++i)
   3447     if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems))
   3448       return false;
   3449 
   3450   return true;
   3451 }
   3452 
   3453 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
   3454 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
   3455 static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
   3456                          bool HasAVX2, bool V2IsSplat = false) {
   3457   unsigned NumElts = VT.getVectorNumElements();
   3458 
   3459   assert((VT.is128BitVector() || VT.is256BitVector()) &&
    3460          "Unsupported vector type for unpckl");
   3461 
   3462   if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
   3463       (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
   3464     return false;
   3465 
   3466   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3467   // independently on 128-bit lanes.
   3468   unsigned NumLanes = VT.getSizeInBits()/128;
   3469   unsigned NumLaneElts = NumElts/NumLanes;
   3470 
   3471   for (unsigned l = 0; l != NumLanes; ++l) {
   3472     for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
   3473          i != (l+1)*NumLaneElts;
   3474          i += 2, ++j) {
   3475       int BitI  = Mask[i];
   3476       int BitI1 = Mask[i+1];
   3477       if (!isUndefOrEqual(BitI, j))
   3478         return false;
   3479       if (V2IsSplat) {
   3480         if (!isUndefOrEqual(BitI1, NumElts))
   3481           return false;
   3482       } else {
   3483         if (!isUndefOrEqual(BitI1, j + NumElts))
   3484           return false;
   3485       }
   3486     }
   3487   }
   3488 
   3489   return true;
   3490 }
   3491 
   3492 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
   3493 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
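        /// For example, for v4i32 the expected mask is <2, 6, 3, 7>. For v8i32 each
        /// 128-bit lane is matched independently, so the expected mask becomes
        /// <2, 10, 3, 11, 6, 14, 7, 15>.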
   3494 static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
   3495                          bool HasAVX2, bool V2IsSplat = false) {
   3496   unsigned NumElts = VT.getVectorNumElements();
   3497 
   3498   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3499          "Unsupported vector type for unpckh");
   3500 
   3501   if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
   3502       (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
   3503     return false;
   3504 
   3505   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3506   // independently on 128-bit lanes.
   3507   unsigned NumLanes = VT.getSizeInBits()/128;
   3508   unsigned NumLaneElts = NumElts/NumLanes;
   3509 
   3510   for (unsigned l = 0; l != NumLanes; ++l) {
   3511     for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
   3512          i != (l+1)*NumLaneElts; i += 2, ++j) {
   3513       int BitI  = Mask[i];
   3514       int BitI1 = Mask[i+1];
   3515       if (!isUndefOrEqual(BitI, j))
   3516         return false;
   3517       if (V2IsSplat) {
   3518         if (isUndefOrEqual(BitI1, NumElts))
   3519           return false;
   3520       } else {
   3521         if (!isUndefOrEqual(BitI1, j+NumElts))
   3522           return false;
   3523       }
   3524     }
   3525   }
   3526   return true;
   3527 }
   3528 
   3529 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
   3530 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
   3531 /// <0, 0, 1, 1>
   3532 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
   3533                                   bool HasAVX2) {
   3534   unsigned NumElts = VT.getVectorNumElements();
   3535 
   3536   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3537          "Unsupported vector type for unpckh");
   3538 
   3539   if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
   3540       (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
   3541     return false;
   3542 
   3543   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
   3544   // FIXME: Need a better way to get rid of this, there's no latency difference
   3545   // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
   3546   // the former later. We should also remove the "_undef" special mask.
   3547   if (NumElts == 4 && VT.getSizeInBits() == 256)
   3548     return false;
   3549 
   3550   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3551   // independently on 128-bit lanes.
   3552   unsigned NumLanes = VT.getSizeInBits()/128;
   3553   unsigned NumLaneElts = NumElts/NumLanes;
   3554 
   3555   for (unsigned l = 0; l != NumLanes; ++l) {
   3556     for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
   3557          i != (l+1)*NumLaneElts;
   3558          i += 2, ++j) {
   3559       int BitI  = Mask[i];
   3560       int BitI1 = Mask[i+1];
   3561 
   3562       if (!isUndefOrEqual(BitI, j))
   3563         return false;
   3564       if (!isUndefOrEqual(BitI1, j))
   3565         return false;
   3566     }
   3567   }
   3568 
   3569   return true;
   3570 }
   3571 
   3572 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
   3573 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
   3574 /// <2, 2, 3, 3>
   3575 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
   3576   unsigned NumElts = VT.getVectorNumElements();
   3577 
   3578   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3579          "Unsupported vector type for unpckh");
   3580 
   3581   if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
   3582       (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
   3583     return false;
   3584 
   3585   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3586   // independently on 128-bit lanes.
   3587   unsigned NumLanes = VT.getSizeInBits()/128;
   3588   unsigned NumLaneElts = NumElts/NumLanes;
   3589 
   3590   for (unsigned l = 0; l != NumLanes; ++l) {
   3591     for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
   3592          i != (l+1)*NumLaneElts; i += 2, ++j) {
   3593       int BitI  = Mask[i];
   3594       int BitI1 = Mask[i+1];
   3595       if (!isUndefOrEqual(BitI, j))
   3596         return false;
   3597       if (!isUndefOrEqual(BitI1, j))
   3598         return false;
   3599     }
   3600   }
   3601   return true;
   3602 }
   3603 
   3604 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
   3605 /// specifies a shuffle of elements that is suitable for input to MOVSS,
   3606 /// MOVSD, and MOVD, i.e. setting the lowest element.
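        /// For example, for v4f32 the expected mask is <4, 1, 2, 3>: the lowest
        /// element comes from V2 and the remaining elements stay in place from V1.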
   3607 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
   3608   if (VT.getVectorElementType().getSizeInBits() < 32)
   3609     return false;
   3610   if (VT.getSizeInBits() == 256)
   3611     return false;
   3612 
   3613   unsigned NumElts = VT.getVectorNumElements();
   3614 
   3615   if (!isUndefOrEqual(Mask[0], NumElts))
   3616     return false;
   3617 
   3618   for (unsigned i = 1; i != NumElts; ++i)
   3619     if (!isUndefOrEqual(Mask[i], i))
   3620       return false;
   3621 
   3622   return true;
   3623 }
   3624 
   3625 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
   3626 /// as permutations between 128-bit chunks or halves. As an example, the
   3627 /// shuffle below:
   3628 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
   3629 /// takes its first half from the second half of V1 and its second half from
   3630 /// the second half of V2.
   3631 static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
   3632   if (!HasAVX || VT.getSizeInBits() != 256)
   3633     return false;
   3634 
   3635   // The shuffle result is divided into half A and half B. In total the two
   3636   // sources have 4 halves, namely: C, D, E, F. The final values of A and
   3637   // B must come from C, D, E or F.
   3638   unsigned HalfSize = VT.getVectorNumElements()/2;
   3639   bool MatchA = false, MatchB = false;
   3640 
   3641   // Check if A comes from one of C, D, E, F.
   3642   for (unsigned Half = 0; Half != 4; ++Half) {
   3643     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
   3644       MatchA = true;
   3645       break;
   3646     }
   3647   }
   3648 
   3649   // Check if B comes from one of C, D, E, F.
   3650   for (unsigned Half = 0; Half != 4; ++Half) {
   3651     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
   3652       MatchB = true;
   3653       break;
   3654     }
   3655   }
   3656 
   3657   return MatchA && MatchB;
   3658 }
   3659 
   3660 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
   3661 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
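        /// For example, for the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> the low half
        /// is taken from half 1 (the high half of V1) and the high half from half 3
        /// (the high half of V2), giving the immediate 1 | (3 << 4) = 0x31.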
   3662 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   3663   EVT VT = SVOp->getValueType(0);
   3664 
   3665   unsigned HalfSize = VT.getVectorNumElements()/2;
   3666 
   3667   unsigned FstHalf = 0, SndHalf = 0;
   3668   for (unsigned i = 0; i < HalfSize; ++i) {
   3669     if (SVOp->getMaskElt(i) > 0) {
   3670       FstHalf = SVOp->getMaskElt(i)/HalfSize;
   3671       break;
   3672     }
   3673   }
   3674   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
   3675     if (SVOp->getMaskElt(i) > 0) {
   3676       SndHalf = SVOp->getMaskElt(i)/HalfSize;
   3677       break;
   3678     }
   3679   }
   3680 
   3681   return (FstHalf | (SndHalf << 4));
   3682 }
   3683 
   3684 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
   3685 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
   3686 /// Note that VPERMIL mask matching differs depending on whether the underlying
   3687 /// element type is 32 or 64 bits. For VPERMILPS the high half of the mask must
   3688 /// select the same elements as the low half, but from the upper half of the
   3689 /// source. For VPERMILPD the two lanes can be shuffled independently of each
   3690 /// other, with the restriction that lanes can't be crossed. Also handles PSHUFDY.
   3691 static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
   3692   if (!HasAVX)
   3693     return false;
   3694 
   3695   unsigned NumElts = VT.getVectorNumElements();
   3696   // Only match 256-bit with 32/64-bit types
   3697   if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
   3698     return false;
   3699 
   3700   unsigned NumLanes = VT.getSizeInBits()/128;
   3701   unsigned LaneSize = NumElts/NumLanes;
   3702   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   3703     for (unsigned i = 0; i != LaneSize; ++i) {
   3704       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   3705         return false;
   3706       if (NumElts != 8 || l == 0)
   3707         continue;
   3708       // VPERMILPS handling
   3709       if (Mask[i] < 0)
   3710         continue;
   3711       if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
   3712         return false;
   3713     }
   3714   }
   3715 
   3716   return true;
   3717 }
   3718 
   3719 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
   3720 /// what x86 movss wants: the lowest element must be the lowest element of
   3721 /// vector 2, and the other elements must come from vector 1 in order.
   3722 static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
   3723                                bool V2IsSplat = false, bool V2IsUndef = false) {
   3724   unsigned NumOps = VT.getVectorNumElements();
   3725   if (VT.getSizeInBits() == 256)
   3726     return false;
   3727   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
   3728     return false;
   3729 
   3730   if (!isUndefOrEqual(Mask[0], 0))
   3731     return false;
   3732 
   3733   for (unsigned i = 1; i != NumOps; ++i)
   3734     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
   3735           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
   3736           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
   3737       return false;
   3738 
   3739   return true;
   3740 }
   3741 
   3742 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   3743 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
   3744 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
   3745 static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
   3746                            const X86Subtarget *Subtarget) {
   3747   if (!Subtarget->hasSSE3())
   3748     return false;
   3749 
   3750   unsigned NumElems = VT.getVectorNumElements();
   3751 
   3752   if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
   3753       (VT.getSizeInBits() == 256 && NumElems != 8))
   3754     return false;
   3755 
   3756   // "i+1" is the value the indexed mask element must have
   3757   for (unsigned i = 0; i != NumElems; i += 2)
   3758     if (!isUndefOrEqual(Mask[i], i+1) ||
   3759         !isUndefOrEqual(Mask[i+1], i+1))
   3760       return false;
   3761 
   3762   return true;
   3763 }
   3764 
   3765 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   3766 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
   3767 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
   3768 static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
   3769                            const X86Subtarget *Subtarget) {
   3770   if (!Subtarget->hasSSE3())
   3771     return false;
   3772 
   3773   unsigned NumElems = VT.getVectorNumElements();
   3774 
   3775   if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
   3776       (VT.getSizeInBits() == 256 && NumElems != 8))
   3777     return false;
   3778 
   3779   // "i" is the value the indexed mask element must have
   3780   for (unsigned i = 0; i != NumElems; i += 2)
   3781     if (!isUndefOrEqual(Mask[i], i) ||
   3782         !isUndefOrEqual(Mask[i+1], i))
   3783       return false;
   3784 
   3785   return true;
   3786 }
   3787 
   3788 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
   3789 /// specifies a shuffle of elements that is suitable for input to 256-bit
   3790 /// version of MOVDDUP.
   3791 static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
   3792   unsigned NumElts = VT.getVectorNumElements();
   3793 
   3794   if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
   3795     return false;
   3796 
   3797   for (unsigned i = 0; i != NumElts/2; ++i)
   3798     if (!isUndefOrEqual(Mask[i], 0))
   3799       return false;
   3800   for (unsigned i = NumElts/2; i != NumElts; ++i)
   3801     if (!isUndefOrEqual(Mask[i], NumElts/2))
   3802       return false;
   3803   return true;
   3804 }
   3805 
   3806 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   3807 /// specifies a shuffle of elements that is suitable for input to 128-bit
   3808 /// version of MOVDDUP.
   3809 static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
   3810   if (VT.getSizeInBits() != 128)
   3811     return false;
   3812 
   3813   unsigned e = VT.getVectorNumElements() / 2;
   3814   for (unsigned i = 0; i != e; ++i)
   3815     if (!isUndefOrEqual(Mask[i], i))
   3816       return false;
   3817   for (unsigned i = 0; i != e; ++i)
   3818     if (!isUndefOrEqual(Mask[e+i], i))
   3819       return false;
   3820   return true;
   3821 }
   3822 
   3823 /// isVEXTRACTF128Index - Return true if the specified
   3824 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
   3825 /// suitable for input to VEXTRACTF128.
   3826 bool X86::isVEXTRACTF128Index(SDNode *N) {
   3827   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   3828     return false;
   3829 
   3830   // The index should be aligned on a 128-bit boundary.
   3831   uint64_t Index =
   3832     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   3833 
   3834   unsigned VL = N->getValueType(0).getVectorNumElements();
   3835   unsigned VBits = N->getValueType(0).getSizeInBits();
   3836   unsigned ElSize = VBits / VL;
   3837   bool Result = (Index * ElSize) % 128 == 0;
   3838 
   3839   return Result;
   3840 }
   3841 
   3842 /// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
   3843 /// operand specifies a subvector insert that is suitable for input to
   3844 /// VINSERTF128.
   3845 bool X86::isVINSERTF128Index(SDNode *N) {
   3846   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   3847     return false;
   3848 
   3849   // The index should be aligned on a 128-bit boundary.
   3850   uint64_t Index =
   3851     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   3852 
   3853   unsigned VL = N->getValueType(0).getVectorNumElements();
   3854   unsigned VBits = N->getValueType(0).getSizeInBits();
   3855   unsigned ElSize = VBits / VL;
   3856   bool Result = (Index * ElSize) % 128 == 0;
   3857 
   3858   return Result;
   3859 }
   3860 
   3861 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
   3862 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
   3863 /// Handles 128-bit and 256-bit.
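        /// For example, the v4f32 mask <1, 0, 3, 2> produces the immediate
        /// 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xB1.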
   3864 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   3865   EVT VT = N->getValueType(0);
   3866 
   3867   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3868          "Unsupported vector type for PSHUF/SHUFP");
   3869 
   3870   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
   3871   // independently on 128-bit lanes.
   3872   unsigned NumElts = VT.getVectorNumElements();
   3873   unsigned NumLanes = VT.getSizeInBits()/128;
   3874   unsigned NumLaneElts = NumElts/NumLanes;
   3875 
   3876   assert((NumLaneElts == 2 || NumLaneElts == 4) &&
   3877          "Only supports 2 or 4 elements per lane");
   3878 
   3879   unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
   3880   unsigned Mask = 0;
   3881   for (unsigned i = 0; i != NumElts; ++i) {
   3882     int Elt = N->getMaskElt(i);
   3883     if (Elt < 0) continue;
   3884     Elt %= NumLaneElts;
   3885     unsigned ShAmt = i << Shift;
   3886     if (ShAmt >= 8) ShAmt -= 8;
   3887     Mask |= Elt << ShAmt;
   3888   }
   3889 
   3890   return Mask;
   3891 }
   3892 
   3893 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
   3894 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
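        /// For example, the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> produces the
        /// immediate 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.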
   3895 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
   3896   unsigned Mask = 0;
   3897   // 8 nodes, but we only care about the last 4.
   3898   for (unsigned i = 7; i >= 4; --i) {
   3899     int Val = N->getMaskElt(i);
   3900     if (Val >= 0)
   3901       Mask |= (Val - 4);
   3902     if (i != 4)
   3903       Mask <<= 2;
   3904   }
   3905   return Mask;
   3906 }
   3907 
   3908 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
   3909 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
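        /// For example, the v8i16 mask <3, 2, 1, 0, 4, 5, 6, 7> produces the
        /// immediate 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.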
   3910 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
   3911   unsigned Mask = 0;
   3912   // 8 nodes, but we only care about the first 4.
   3913   for (int i = 3; i >= 0; --i) {
   3914     int Val = N->getMaskElt(i);
   3915     if (Val >= 0)
   3916       Mask |= Val;
   3917     if (i != 0)
   3918       Mask <<= 2;
   3919   }
   3920   return Mask;
   3921 }
   3922 
   3923 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
   3924 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
   3925 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   3926   EVT VT = SVOp->getValueType(0);
   3927   unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
   3928 
   3929   unsigned NumElts = VT.getVectorNumElements();
   3930   unsigned NumLanes = VT.getSizeInBits()/128;
   3931   unsigned NumLaneElts = NumElts/NumLanes;
   3932 
   3933   int Val = 0;
   3934   unsigned i;
   3935   for (i = 0; i != NumElts; ++i) {
   3936     Val = SVOp->getMaskElt(i);
   3937     if (Val >= 0)
   3938       break;
   3939   }
   3940   if (Val >= (int)NumElts)
   3941     Val -= NumElts - NumLaneElts;
   3942 
   3943   assert(Val - i > 0 && "PALIGNR imm should be positive");
   3944   return (Val - i) * EltSize;
   3945 }
   3946 
   3947 /// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
   3948 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
   3949 /// instructions.
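        /// For example, extracting the subvector that starts at element 4 of a v8f32
        /// gives 4 / (128 / 32) = 1, i.e. the upper 128-bit half.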
   3950 unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
   3951   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   3952     llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
   3953 
   3954   uint64_t Index =
   3955     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   3956 
   3957   EVT VecVT = N->getOperand(0).getValueType();
   3958   EVT ElVT = VecVT.getVectorElementType();
   3959 
   3960   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
   3961   return Index / NumElemsPerChunk;
   3962 }
   3963 
   3964 /// getInsertVINSERTF128Immediate - Return the appropriate immediate
   3965 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
   3966 /// instructions.
   3967 unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
   3968   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   3969     llvm_unreachable("Illegal insert subvector for VINSERTF128");
   3970 
   3971   uint64_t Index =
   3972     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   3973 
   3974   EVT VecVT = N->getValueType(0);
   3975   EVT ElVT = VecVT.getVectorElementType();
   3976 
   3977   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
   3978   return Index / NumElemsPerChunk;
   3979 }
   3980 
   3981 /// getShuffleCLImmediate - Return the appropriate immediate to shuffle
   3982 /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
   3983 /// Handles 256-bit.
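        /// For example, the v4i64 mask <3, 0, 1, 2> produces the immediate
        /// 3 | (0 << 2) | (1 << 4) | (2 << 6) = 0x93.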
   3984 static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
   3985   EVT VT = N->getValueType(0);
   3986 
   3987   unsigned NumElts = VT.getVectorNumElements();
   3988 
   3989   assert((VT.is256BitVector() && NumElts == 4) &&
   3990          "Unsupported vector type for VPERMQ/VPERMPD");
   3991 
   3992   unsigned Mask = 0;
   3993   for (unsigned i = 0; i != NumElts; ++i) {
   3994     int Elt = N->getMaskElt(i);
   3995     if (Elt < 0)
   3996       continue;
   3997     Mask |= Elt << (i*2);
   3998   }
   3999 
   4000   return Mask;
   4001 }

   4002 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
   4003 /// constant +0.0.
   4004 bool X86::isZeroNode(SDValue Elt) {
   4005   return ((isa<ConstantSDNode>(Elt) &&
   4006            cast<ConstantSDNode>(Elt)->isNullValue()) ||
   4007           (isa<ConstantFPSDNode>(Elt) &&
   4008            cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
   4009 }
   4010 
   4011 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
   4012 /// their permute mask.
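        /// For example, vector_shuffle V1, V2, <0, 5, 2, 7> becomes
        /// vector_shuffle V2, V1, <4, 1, 6, 3>.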
   4013 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
   4014                                     SelectionDAG &DAG) {
   4015   EVT VT = SVOp->getValueType(0);
   4016   unsigned NumElems = VT.getVectorNumElements();
   4017   SmallVector<int, 8> MaskVec;
   4018 
   4019   for (unsigned i = 0; i != NumElems; ++i) {
   4020     int idx = SVOp->getMaskElt(i);
   4021     if (idx < 0)
   4022       MaskVec.push_back(idx);
   4023     else if (idx < (int)NumElems)
   4024       MaskVec.push_back(idx + NumElems);
   4025     else
   4026       MaskVec.push_back(idx - NumElems);
   4027   }
   4028   return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
   4029                               SVOp->getOperand(0), &MaskVec[0]);
   4030 }
   4031 
   4032 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
   4033 /// match movhlps. The lower half elements should come from upper half of
   4034 /// V1 (and in order), and the upper half elements should come from the upper
   4035 /// half of V2 (and in order).
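        /// For a 4-element vector this means the mask must be <2, 3, 6, 7>, with
        /// undef allowed in any position.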
   4036 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
   4037   if (VT.getSizeInBits() != 128)
   4038     return false;
   4039   if (VT.getVectorNumElements() != 4)
   4040     return false;
   4041   for (unsigned i = 0, e = 2; i != e; ++i)
   4042     if (!isUndefOrEqual(Mask[i], i+2))
   4043       return false;
   4044   for (unsigned i = 2; i != 4; ++i)
   4045     if (!isUndefOrEqual(Mask[i], i+4))
   4046       return false;
   4047   return true;
   4048 }
   4049 
   4050 /// isScalarLoadToVector - Returns true if the node is a scalar load that
   4051 /// is promoted to a vector. It also returns the LoadSDNode by reference if
   4052 /// required.
   4053 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
   4054   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
   4055     return false;
   4056   N = N->getOperand(0).getNode();
   4057   if (!ISD::isNON_EXTLoad(N))
   4058     return false;
   4059   if (LD)
   4060     *LD = cast<LoadSDNode>(N);
   4061   return true;
   4062 }
   4063 
   4064 // Test whether the given value is a vector value which will be legalized
   4065 // into a load.
   4066 static bool WillBeConstantPoolLoad(SDNode *N) {
   4067   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4068     return false;
   4069 
   4070   // Check for any non-constant elements.
   4071   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
   4072     switch (N->getOperand(i).getNode()->getOpcode()) {
   4073     case ISD::UNDEF:
   4074     case ISD::ConstantFP:
   4075     case ISD::Constant:
   4076       break;
   4077     default:
   4078       return false;
   4079     }
   4080 
   4081   // Vectors of all-zeros and all-ones are materialized with special
   4082   // instructions rather than being loaded.
   4083   return !ISD::isBuildVectorAllZeros(N) &&
   4084          !ISD::isBuildVectorAllOnes(N);
   4085 }
   4086 
   4087 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
   4088 /// match movlp{s|d}. The lower half elements should come from lower half of
   4089 /// V1 (and in order), and the upper half elements should come from the upper
   4090 /// half of V2 (and in order). And since V1 will become the source of the
   4091 /// MOVLP, it must be either a vector load or a scalar load to vector.
   4092 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
   4093                                ArrayRef<int> Mask, EVT VT) {
   4094   if (VT.getSizeInBits() != 128)
   4095     return false;
   4096 
   4097   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
   4098     return false;
   4099   // If V2 is a vector load, don't do this transformation. We will try to use
   4100   // a load-folding shufps op instead.
   4101   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
   4102     return false;
   4103 
   4104   unsigned NumElems = VT.getVectorNumElements();
   4105 
   4106   if (NumElems != 2 && NumElems != 4)
   4107     return false;
   4108   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   4109     if (!isUndefOrEqual(Mask[i], i))
   4110       return false;
   4111   for (unsigned i = NumElems/2; i != NumElems; ++i)
   4112     if (!isUndefOrEqual(Mask[i], i+NumElems))
   4113       return false;
   4114   return true;
   4115 }
   4116 
   4117 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
   4118 /// all the same.
   4119 static bool isSplatVector(SDNode *N) {
   4120   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4121     return false;
   4122 
   4123   SDValue SplatValue = N->getOperand(0);
   4124   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
   4125     if (N->getOperand(i) != SplatValue)
   4126       return false;
   4127   return true;
   4128 }
   4129 
   4130 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
   4131 /// to a zero vector.
   4132 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
   4133 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
   4134   SDValue V1 = N->getOperand(0);
   4135   SDValue V2 = N->getOperand(1);
   4136   unsigned NumElems = N->getValueType(0).getVectorNumElements();
   4137   for (unsigned i = 0; i != NumElems; ++i) {
   4138     int Idx = N->getMaskElt(i);
   4139     if (Idx >= (int)NumElems) {
   4140       unsigned Opc = V2.getOpcode();
   4141       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
   4142         continue;
   4143       if (Opc != ISD::BUILD_VECTOR ||
   4144           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
   4145         return false;
   4146     } else if (Idx >= 0) {
   4147       unsigned Opc = V1.getOpcode();
   4148       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
   4149         continue;
   4150       if (Opc != ISD::BUILD_VECTOR ||
   4151           !X86::isZeroNode(V1.getOperand(Idx)))
   4152         return false;
   4153     }
   4154   }
   4155   return true;
   4156 }
   4157 
   4158 /// getZeroVector - Returns a vector of specified type with all zero elements.
   4159 ///
   4160 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
   4161                              SelectionDAG &DAG, DebugLoc dl) {
   4162   assert(VT.isVector() && "Expected a vector type");
   4163 
   4164   // Always build SSE zero vectors as <4 x i32> bitcasted
   4165   // to their dest type. This ensures they get CSE'd.
   4166   SDValue Vec;
   4167   if (VT.getSizeInBits() == 128) {  // SSE
   4168     if (Subtarget->hasSSE2()) {  // SSE2
   4169       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4170       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4171     } else { // SSE1
   4172       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
   4173       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
   4174     }
   4175   } else if (VT.getSizeInBits() == 256) { // AVX
   4176     if (Subtarget->hasAVX2()) { // AVX2
   4177       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4178       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4179       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
   4180     } else {
   4181       // 256-bit logic and arithmetic instructions in AVX are all
   4182       // floating-point, no support for integer ops. Emit fp zeroed vectors.
   4183       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
   4184       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4185       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
   4186     }
   4187   }
   4188   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4189 }
   4190 
   4191 /// getOnesVector - Returns a vector of specified type with all bits set.
   4192 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
   4193 /// no AVX2 support, use two <4 x i32> inserted into an <8 x i32> appropriately.
   4194 /// Then bitcast to their original type, ensuring they get CSE'd.
   4195 static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
   4196                              DebugLoc dl) {
   4197   assert(VT.isVector() && "Expected a vector type");
   4198   assert((VT.is128BitVector() || VT.is256BitVector())
   4199          && "Expected a 128-bit or 256-bit vector type");
   4200 
   4201   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
   4202   SDValue Vec;
   4203   if (VT.getSizeInBits() == 256) {
   4204     if (HasAVX2) { // AVX2
   4205       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4206       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
   4207     } else { // AVX
   4208       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4209       SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
   4210                                 Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
   4211       Vec = Insert128BitVector(InsV, Vec,
   4212                     DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
   4213     }
   4214   } else {
   4215     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4216   }
   4217 
   4218   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4219 }
   4220 
   4221 /// NormalizeMask - V2 is a splat; modify the mask (if needed) so all elements
   4222 /// that point to V2 point to its first element.
   4223 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
   4224   for (unsigned i = 0; i != NumElems; ++i) {
   4225     if (Mask[i] > (int)NumElems) {
   4226       Mask[i] = NumElems;
   4227     }
   4228   }
   4229 }
   4230 
   4231 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
   4232 /// operation of the specified width.
   4233 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4234                        SDValue V2) {
   4235   unsigned NumElems = VT.getVectorNumElements();
   4236   SmallVector<int, 8> Mask;
   4237   Mask.push_back(NumElems);
   4238   for (unsigned i = 1; i != NumElems; ++i)
   4239     Mask.push_back(i);
   4240   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4241 }
   4242 
   4243 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
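        /// For example, for v8i16 the generated mask is <0, 8, 1, 9, 2, 10, 3, 11>.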
   4244 static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4245                           SDValue V2) {
   4246   unsigned NumElems = VT.getVectorNumElements();
   4247   SmallVector<int, 8> Mask;
   4248   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
   4249     Mask.push_back(i);
   4250     Mask.push_back(i + NumElems);
   4251   }
   4252   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4253 }
   4254 
   4255 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
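        /// For example, for v8i16 the generated mask is <4, 12, 5, 13, 6, 14, 7, 15>.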
   4256 static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4257                           SDValue V2) {
   4258   unsigned NumElems = VT.getVectorNumElements();
   4259   unsigned Half = NumElems/2;
   4260   SmallVector<int, 8> Mask;
   4261   for (unsigned i = 0; i != Half; ++i) {
   4262     Mask.push_back(i + Half);
   4263     Mask.push_back(i + NumElems + Half);
   4264   }
   4265   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4266 }
   4267 
   4268 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
   4269 // a generic shuffle instruction because the target has no such instructions.
   4270 // Generate shuffles which repeat i16 and i8 several times until they can be
   4271 // represented by v4f32 and then be manipulated by target supported shuffles.
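        // For example, splatting element 5 of a v16i8 does an unpackl (element 5 is
        // in the low half) followed by an unpackh, leaving element index 1 once the
        // vector is viewed as four 32-bit elements.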
   4272 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
   4273   EVT VT = V.getValueType();
   4274   int NumElems = VT.getVectorNumElements();
   4275   DebugLoc dl = V.getDebugLoc();
   4276 
   4277   while (NumElems > 4) {
   4278     if (EltNo < NumElems/2) {
   4279       V = getUnpackl(DAG, dl, VT, V, V);
   4280     } else {
   4281       V = getUnpackh(DAG, dl, VT, V, V);
   4282       EltNo -= NumElems/2;
   4283     }
   4284     NumElems >>= 1;
   4285   }
   4286   return V;
   4287 }
   4288 
   4289 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
   4290 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
   4291   EVT VT = V.getValueType();
   4292   DebugLoc dl = V.getDebugLoc();
   4293   assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
   4294          && "Vector size not supported");
   4295 
   4296   if (VT.getSizeInBits() == 128) {
   4297     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
   4298     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
   4299     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
   4300                              &SplatMask[0]);
   4301   } else {
   4302     // To use VPERMILPS to splat scalars, the second half of indices must
   4303     // refer to the higher part, which is a duplication of the lower one,
   4304     // because VPERMILPS can only handle in-lane permutations.
   4305     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
   4306                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
   4307 
   4308     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
   4309     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
   4310                              &SplatMask[0]);
   4311   }
   4312 
   4313   return DAG.getNode(ISD::BITCAST, dl, VT, V);
   4314 }
   4315 
   4316 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
   4317 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   4318   EVT SrcVT = SV->getValueType(0);
   4319   SDValue V1 = SV->getOperand(0);
   4320   DebugLoc dl = SV->getDebugLoc();
   4321 
   4322   int EltNo = SV->getSplatIndex();
   4323   int NumElems = SrcVT.getVectorNumElements();
   4324   unsigned Size = SrcVT.getSizeInBits();
   4325 
   4326   assert(((Size == 128 && NumElems > 4) || Size == 256) &&
   4327           "Unknown how to promote splat for type");
   4328 
   4329   // Extract the 128-bit part containing the splat element and update
   4330   // the splat element index when it refers to the higher register.
   4331   if (Size == 256) {
   4332     unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
   4333     V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
   4334     if (Idx > 0)
   4335       EltNo -= NumElems/2;
   4336   }
   4337 
   4338   // All i16 and i8 vector types can't be used directly by a generic shuffle
   4339   // instruction because the target has no such instruction. Generate shuffles
   4340   // which repeat i16 and i8 several times until they fit in i32, and then can
   4341   // be manipulated by target supported shuffles.
   4342   EVT EltVT = SrcVT.getVectorElementType();
   4343   if (EltVT == MVT::i8 || EltVT == MVT::i16)
   4344     V1 = PromoteSplati8i16(V1, DAG, EltNo);
   4345 
   4346   // Recreate the 256-bit vector and place the same 128-bit vector
   4347   // into the low and high part. This is necessary because we want
   4348   // to use VPERM* to shuffle the vectors
   4349   if (Size == 256) {
   4350     SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
   4351                          DAG.getConstant(0, MVT::i32), DAG, dl);
   4352     V1 = Insert128BitVector(InsV, V1,
   4353                DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
   4354   }
   4355 
   4356   return getLegalSplat(DAG, V1, EltNo);
   4357 }
   4358 
   4359 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
   4360 /// vector of zero or undef vector.  This produces a shuffle where the low
   4361 /// element of V2 is swizzled into the zero/undef vector, landing at element
   4362 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
   4363 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
   4364                                            bool IsZero,
   4365                                            const X86Subtarget *Subtarget,
   4366                                            SelectionDAG &DAG) {
   4367   EVT VT = V2.getValueType();
   4368   SDValue V1 = IsZero
   4369     ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
   4370   unsigned NumElems = VT.getVectorNumElements();
   4371   SmallVector<int, 16> MaskVec;
   4372   for (unsigned i = 0; i != NumElems; ++i)
   4373     // If this is the insertion idx, put the low elt of V2 here.
   4374     MaskVec.push_back(i == Idx ? NumElems : i);
   4375   return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
   4376 }
   4377 
   4378 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
   4379 /// target specific opcode. Returns true if the Mask could be calculated.
   4380 /// Sets IsUnary to true if the node only uses one source.
   4381 static bool getTargetShuffleMask(SDNode *N, EVT VT,
   4382                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   4383   unsigned NumElems = VT.getVectorNumElements();
   4384   SDValue ImmN;
   4385 
   4386   IsUnary = false;
   4387   switch(N->getOpcode()) {
   4388   case X86ISD::SHUFP:
   4389     ImmN = N->getOperand(N->getNumOperands()-1);
   4390     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4391     break;
   4392   case X86ISD::UNPCKH:
   4393     DecodeUNPCKHMask(VT, Mask);
   4394     break;
   4395   case X86ISD::UNPCKL:
   4396     DecodeUNPCKLMask(VT, Mask);
   4397     break;
   4398   case X86ISD::MOVHLPS:
   4399     DecodeMOVHLPSMask(NumElems, Mask);
   4400     break;
   4401   case X86ISD::MOVLHPS:
   4402     DecodeMOVLHPSMask(NumElems, Mask);
   4403     break;
   4404   case X86ISD::PSHUFD:
   4405   case X86ISD::VPERMILP:
   4406     ImmN = N->getOperand(N->getNumOperands()-1);
   4407     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4408     IsUnary = true;
   4409     break;
   4410   case X86ISD::PSHUFHW:
   4411     ImmN = N->getOperand(N->getNumOperands()-1);
   4412     DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4413     IsUnary = true;
   4414     break;
   4415   case X86ISD::PSHUFLW:
   4416     ImmN = N->getOperand(N->getNumOperands()-1);
   4417     DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4418     IsUnary = true;
   4419     break;
   4420   case X86ISD::MOVSS:
   4421   case X86ISD::MOVSD: {
   4422     // Index 0 always comes from the first element of the second source;
   4423     // this is why MOVSS and MOVSD are used in the first place. The other
   4424     // elements come from the corresponding positions of the first source vector.
   4425     Mask.push_back(NumElems);
   4426     for (unsigned i = 1; i != NumElems; ++i) {
   4427       Mask.push_back(i);
   4428     }
   4429     break;
   4430   }
   4431   case X86ISD::VPERM2X128:
   4432     ImmN = N->getOperand(N->getNumOperands()-1);
   4433     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4434     if (Mask.empty()) return false;
   4435     break;
   4436   case X86ISD::MOVDDUP:
   4437   case X86ISD::MOVLHPD:
   4438   case X86ISD::MOVLPD:
   4439   case X86ISD::MOVLPS:
   4440   case X86ISD::MOVSHDUP:
   4441   case X86ISD::MOVSLDUP:
   4442   case X86ISD::PALIGN:
   4443     // Not yet implemented
   4444     return false;
   4445   default: llvm_unreachable("unknown target shuffle node");
   4446   }
   4447 
   4448   return true;
   4449 }
   4450 
   4451 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
   4452 /// element of the result of the vector shuffle.
   4453 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   4454                                    unsigned Depth) {
   4455   if (Depth == 6)
   4456     return SDValue();  // Limit search depth.
   4457 
   4458   SDValue V = SDValue(N, 0);
   4459   EVT VT = V.getValueType();
   4460   unsigned Opcode = V.getOpcode();
   4461 
   4462   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
   4463   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
   4464     int Elt = SV->getMaskElt(Index);
   4465 
   4466     if (Elt < 0)
   4467       return DAG.getUNDEF(VT.getVectorElementType());
   4468 
   4469     unsigned NumElems = VT.getVectorNumElements();
   4470     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
   4471                                          : SV->getOperand(1);
   4472     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   4473   }
   4474 
   4475   // Recurse into target specific vector shuffles to find scalars.
   4476   if (isTargetShuffle(Opcode)) {
   4477     unsigned NumElems = VT.getVectorNumElements();
   4478     SmallVector<int, 16> ShuffleMask;
   4479     SDValue ImmN;
   4480     bool IsUnary;
   4481 
   4482     if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary))
   4483       return SDValue();
   4484 
   4485     int Elt = ShuffleMask[Index];
   4486     if (Elt < 0)
   4487       return DAG.getUNDEF(VT.getVectorElementType());
   4488 
   4489     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
   4490                                            : N->getOperand(1);
   4491     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
   4492                                Depth+1);
   4493   }
   4494 
   4495   // Actual nodes that may contain scalar elements
   4496   if (Opcode == ISD::BITCAST) {
   4497     V = V.getOperand(0);
   4498     EVT SrcVT = V.getValueType();
   4499     unsigned NumElems = VT.getVectorNumElements();
   4500 
   4501     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
   4502       return SDValue();
   4503   }
   4504 
   4505   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   4506     return (Index == 0) ? V.getOperand(0)
   4507                         : DAG.getUNDEF(VT.getVectorElementType());
   4508 
   4509   if (V.getOpcode() == ISD::BUILD_VECTOR)
   4510     return V.getOperand(Index);
   4511 
   4512   return SDValue();
   4513 }
   4514 
   4515 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
   4516 /// shuffle operation which consecutively come from zero. The search can
   4517 /// start in two different directions, from the left or from the right.
   4518 static
   4519 unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
   4520                                   bool ZerosFromLeft, SelectionDAG &DAG) {
   4521   unsigned i;
   4522   for (i = 0; i != NumElems; ++i) {
   4523     unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
   4524     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
   4525     if (!(Elt.getNode() &&
   4526          (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
   4527       break;
   4528   }
   4529 
   4530   return i;
   4531 }
   4532 
   4533 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
   4534 /// correspond consecutively to elements from one of the vector operands,
   4535 /// starting from its index OpIdx. Also sets OpNum to the source vector operand.
   4536 static
   4537 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
   4538                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
   4539                               unsigned NumElems, unsigned &OpNum) {
   4540   bool SeenV1 = false;
   4541   bool SeenV2 = false;
   4542 
   4543   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
   4544     int Idx = SVOp->getMaskElt(i);
   4545     // Ignore undef indices
   4546     if (Idx < 0)
   4547       continue;
   4548 
   4549     if (Idx < (int)NumElems)
   4550       SeenV1 = true;
   4551     else
   4552       SeenV2 = true;
   4553 
   4554     // Only accept consecutive elements from the same vector
   4555     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
   4556       return false;
   4557   }
   4558 
   4559   OpNum = SeenV1 ? 0 : 1;
   4560   return true;
   4561 }
   4562 
   4563 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
   4564 /// logical right shift of a vector.
   4565 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4566                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   4567   unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
   4568   unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
   4569               false /* check zeros from right */, DAG);
   4570   unsigned OpSrc;
   4571 
   4572   if (!NumZeros)
   4573     return false;
   4574 
   4575   // Considering the elements in the mask that are not consecutive zeros,
   4576   // check if they consecutively come from only one of the source vectors.
   4577   //
   4578   //               V1 = {X, A, B, C}     0
   4579   //                         \  \  \    /
   4580   //   vector_shuffle V1, V2 <1, 2, 3, X>
   4581   //
   4582   if (!isShuffleMaskConsecutive(SVOp,
   4583             0,                   // Mask Start Index
   4584             NumElems-NumZeros,   // Mask End Index(exclusive)
   4585             NumZeros,            // Where to start looking in the src vector
   4586             NumElems,            // Number of elements in vector
   4587             OpSrc))              // Which source operand ?
   4588     return false;
   4589 
   4590   isLeft = false;
   4591   ShAmt = NumZeros;
   4592   ShVal = SVOp->getOperand(OpSrc);
   4593   return true;
   4594 }
   4595 
   4596 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
   4597 /// logical left shift of a vector.
   4598 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4599                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   4600   unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
   4601   unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
   4602               true /* check zeros from left */, DAG);
   4603   unsigned OpSrc;
   4604 
   4605   if (!NumZeros)
   4606     return false;
   4607 
   4608   // Considering the elements in the mask that are not consecutive zeros,
   4609   // check if they consecutively come from only one of the source vectors.
   4610   //
   4611   //                           0    { A, B, X, X } = V2
   4612   //                          / \    /  /
   4613   //   vector_shuffle V1, V2 <X, X, 4, 5>
   4614   //
   4615   if (!isShuffleMaskConsecutive(SVOp,
   4616             NumZeros,     // Mask Start Index
   4617             NumElems,     // Mask End Index(exclusive)
   4618             0,            // Where to start looking in the src vector
   4619             NumElems,     // Number of elements in vector
   4620             OpSrc))       // Which source operand ?
   4621     return false;
   4622 
   4623   isLeft = true;
   4624   ShAmt = NumZeros;
   4625   ShVal = SVOp->getOperand(OpSrc);
   4626   return true;
   4627 }
   4628 
   4629 /// isVectorShift - Returns true if the shuffle can be implemented as a
   4630 /// logical left or right shift of a vector.
   4631 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4632                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   4633   // Although the logic below supports any bit width, there are no
   4634   // shift instructions which handle more than 128-bit vectors.
   4635   if (SVOp->getValueType(0).getSizeInBits() > 128)
   4636     return false;
   4637 
   4638   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
   4639       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
   4640     return true;
   4641 
   4642   return false;
   4643 }
   4644 
   4645 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
   4646 ///
   4647 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   4648                                        unsigned NumNonZero, unsigned NumZero,
   4649                                        SelectionDAG &DAG,
   4650                                        const X86Subtarget* Subtarget,
   4651                                        const TargetLowering &TLI) {
   4652   if (NumNonZero > 8)
   4653     return SDValue();
   4654 
   4655   DebugLoc dl = Op.getDebugLoc();
   4656   SDValue V(0, 0);
   4657   bool First = true;
   4658   for (unsigned i = 0; i < 16; ++i) {
   4659     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
   4660     if (ThisIsNonZero && First) {
   4661       if (NumZero)
   4662         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   4663       else
   4664         V = DAG.getUNDEF(MVT::v8i16);
   4665       First = false;
   4666     }
   4667 
   4668     if ((i & 1) != 0) {
   4669       SDValue ThisElt(0, 0), LastElt(0, 0);
   4670       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
   4671       if (LastIsNonZero) {
   4672         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
   4673                               MVT::i16, Op.getOperand(i-1));
   4674       }
   4675       if (ThisIsNonZero) {
   4676         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
   4677         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
   4678                               ThisElt, DAG.getConstant(8, MVT::i8));
   4679         if (LastIsNonZero)
   4680           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
   4681       } else
   4682         ThisElt = LastElt;
   4683 
   4684       if (ThisElt.getNode())
   4685         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
   4686                         DAG.getIntPtrConstant(i/2));
   4687     }
   4688   }
   4689 
   4690   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
   4691 }
   4692 
   4693 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
   4694 ///
   4695 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   4696                                      unsigned NumNonZero, unsigned NumZero,
   4697                                      SelectionDAG &DAG,
   4698                                      const X86Subtarget* Subtarget,
   4699                                      const TargetLowering &TLI) {
   4700   if (NumNonZero > 4)
   4701     return SDValue();
   4702 
   4703   DebugLoc dl = Op.getDebugLoc();
   4704   SDValue V(0, 0);
   4705   bool First = true;
   4706   for (unsigned i = 0; i < 8; ++i) {
   4707     bool isNonZero = (NonZeros & (1 << i)) != 0;
   4708     if (isNonZero) {
   4709       if (First) {
   4710         if (NumZero)
   4711           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   4712         else
   4713           V = DAG.getUNDEF(MVT::v8i16);
   4714         First = false;
   4715       }
   4716       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   4717                       MVT::v8i16, V, Op.getOperand(i),
   4718                       DAG.getIntPtrConstant(i));
   4719     }
   4720   }
   4721 
   4722   return V;
   4723 }
   4724 
   4725 /// getVShift - Return a vector logical shift node.
   4726 ///
   4727 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   4728                          unsigned NumBits, SelectionDAG &DAG,
   4729                          const TargetLowering &TLI, DebugLoc dl) {
   4730   assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
   4731   EVT ShVT = MVT::v2i64;
   4732   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   4733   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
   4734   return DAG.getNode(ISD::BITCAST, dl, VT,
   4735                      DAG.getNode(Opc, dl, ShVT, SrcOp,
   4736                              DAG.getConstant(NumBits,
   4737                                   TLI.getShiftAmountTy(SrcOp.getValueType()))));
   4738 }
   4739 
   4740 SDValue
   4741 X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
   4742                                           SelectionDAG &DAG) const {
   4743 
   4744   // Check if the scalar load can be widened into a vector load. If the
   4745   // address is "base + cst", see if the cst can be "absorbed" into
   4746   // the shuffle mask.
   4747   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
   4748     SDValue Ptr = LD->getBasePtr();
   4749     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
   4750       return SDValue();
   4751     EVT PVT = LD->getValueType(0);
   4752     if (PVT != MVT::i32 && PVT != MVT::f32)
   4753       return SDValue();
   4754 
   4755     int FI = -1;
   4756     int64_t Offset = 0;
   4757     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
   4758       FI = FINode->getIndex();
   4759       Offset = 0;
   4760     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
   4761                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
   4762       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
   4763       Offset = Ptr.getConstantOperandVal(1);
   4764       Ptr = Ptr.getOperand(0);
   4765     } else {
   4766       return SDValue();
   4767     }
   4768 
   4769     // FIXME: 256-bit vector instructions don't require a strict alignment,
   4770     // improve this code to support it better.
   4771     unsigned RequiredAlign = VT.getSizeInBits()/8;
   4772     SDValue Chain = LD->getChain();
   4773     // Make sure the stack object alignment is at least 16 or 32.
   4774     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   4775     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
   4776       if (MFI->isFixedObjectIndex(FI)) {
    4777         // Can't change the alignment. FIXME: It's possible to compute
    4778         // the exact stack offset and reference FI + adjust offset instead;
    4779         // if someone *really* cares about this, that's the way to implement it.
   4780         return SDValue();
   4781       } else {
   4782         MFI->setObjectAlignment(FI, RequiredAlign);
   4783       }
   4784     }
   4785 
    4786     // (Offset % RequiredAlign) must be a multiple of 4. The address is then
    4787     // Ptr + (Offset & ~(RequiredAlign-1)).
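             // For example, a 32-bit scalar at FI+20 with a 16-byte requirement is
             // handled by loading a full vector at FI+16 and splatting element
             // (20 - 16) / 4 == 1 with the shuffle built below.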
   4788     if (Offset < 0)
   4789       return SDValue();
   4790     if ((Offset % RequiredAlign) & 3)
   4791       return SDValue();
   4792     int64_t StartOffset = Offset & ~(RequiredAlign-1);
   4793     if (StartOffset)
   4794       Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
   4795                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
   4796 
   4797     int EltNo = (Offset - StartOffset) >> 2;
   4798     int NumElems = VT.getVectorNumElements();
   4799 
   4800     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
   4801     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
   4802                              LD->getPointerInfo().getWithOffset(StartOffset),
   4803                              false, false, false, 0);
   4804 
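             // Splat the desired element across the whole vector, e.g. a mask of
             // <1, 1, 1, 1> for a v4 load when EltNo == 1.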
   4805     SmallVector<int, 8> Mask;
   4806     for (int i = 0; i < NumElems; ++i)
   4807       Mask.push_back(EltNo);
   4808 
   4809     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   4810   }
   4811 
   4812   return SDValue();
   4813 }
   4814 
   4815 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
   4816 /// vector of type 'VT', see if the elements can be replaced by a single large
   4817 /// load which has the same value as a build_vector whose operands are 'elts'.
   4818 ///
   4819 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
   4820 ///
   4821 /// FIXME: we'd also like to handle the case where the last elements are zero
   4822 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
   4823 /// There's even a handy isZeroNode for that purpose.
   4824 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   4825                                         DebugLoc &DL, SelectionDAG &DAG) {
   4826   EVT EltVT = VT.getVectorElementType();
   4827   unsigned NumElems = Elts.size();
   4828 
   4829   LoadSDNode *LDBase = NULL;
   4830   unsigned LastLoadedElt = -1U;
   4831 
   4832   // For each element in the initializer, see if we've found a load or an undef.
   4833   // If we don't find an initial load element, or later load elements are
   4834   // non-consecutive, bail out.
   4835   for (unsigned i = 0; i < NumElems; ++i) {
   4836     SDValue Elt = Elts[i];
   4837 
   4838     if (!Elt.getNode() ||
   4839         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
   4840       return SDValue();
   4841     if (!LDBase) {
   4842       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
   4843         return SDValue();
   4844       LDBase = cast<LoadSDNode>(Elt.getNode());
   4845       LastLoadedElt = i;
   4846       continue;
   4847     }
   4848     if (Elt.getOpcode() == ISD::UNDEF)
   4849       continue;
   4850 
   4851     LoadSDNode *LD = cast<LoadSDNode>(Elt);
   4852     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
   4853       return SDValue();
   4854     LastLoadedElt = i;
   4855   }
   4856 
   4857   // If we have found an entire vector of loads and undefs, then return a large
   4858   // load of the entire vector width starting at the base pointer.  If we found
   4859   // consecutive loads for the low half, generate a vzext_load node.
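           // For example, a v4i32 build_vector <load a, load a+4, undef, undef>
           // becomes a VZEXT_LOAD that reads a single i64 at 'a' and is bitcast
           // back to v4i32.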
   4860   if (LastLoadedElt == NumElems - 1) {
   4861     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
   4862       return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   4863                          LDBase->getPointerInfo(),
   4864                          LDBase->isVolatile(), LDBase->isNonTemporal(),
   4865                          LDBase->isInvariant(), 0);
   4866     return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   4867                        LDBase->getPointerInfo(),
   4868                        LDBase->isVolatile(), LDBase->isNonTemporal(),
   4869                        LDBase->isInvariant(), LDBase->getAlignment());
   4870   } else if (NumElems == 4 && LastLoadedElt == 1 &&
   4871              DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
   4872     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
   4873     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
   4874     SDValue ResNode =
   4875         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
   4876                                 LDBase->getPointerInfo(),
   4877                                 LDBase->getAlignment(),
   4878                                 false/*isVolatile*/, true/*ReadMem*/,
   4879                                 false/*WriteMem*/);
   4880     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
   4881   }
   4882   return SDValue();
   4883 }
   4884 
   4885 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
   4886 /// to generate a splat value for the following cases:
   4887 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
   4888 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
   4889 /// a scalar load, or a constant.
   4890 /// The VBROADCAST node is returned when a pattern is found,
   4891 /// or SDValue() otherwise.
   4892 SDValue
   4893 X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
   4894   if (!Subtarget->hasAVX())
   4895     return SDValue();
   4896 
   4897   EVT VT = Op.getValueType();
   4898   DebugLoc dl = Op.getDebugLoc();
   4899 
   4900   SDValue Ld;
   4901   bool ConstSplatVal;
   4902 
   4903   switch (Op.getOpcode()) {
   4904     default:
   4905       // Unknown pattern found.
   4906       return SDValue();
   4907 
   4908     case ISD::BUILD_VECTOR: {
   4909       // The BUILD_VECTOR node must be a splat.
   4910       if (!isSplatVector(Op.getNode()))
   4911         return SDValue();
   4912 
   4913       Ld = Op.getOperand(0);
   4914       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   4915                      Ld.getOpcode() == ISD::ConstantFP);
   4916 
   4917       // The suspected load node has several users. Make sure that all
   4918       // of its users are from the BUILD_VECTOR node.
   4919       // Constants may have multiple users.
   4920       if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
   4921         return SDValue();
   4922       break;
   4923     }
   4924 
   4925     case ISD::VECTOR_SHUFFLE: {
   4926       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   4927 
   4928       // Shuffles must have a splat mask where the first element is
   4929       // broadcasted.
   4930       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
   4931         return SDValue();
   4932 
   4933       SDValue Sc = Op.getOperand(0);
   4934       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
   4935         return SDValue();
   4936 
   4937       Ld = Sc.getOperand(0);
   4938       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   4939                        Ld.getOpcode() == ISD::ConstantFP);
   4940 
   4941       // The scalar_to_vector node and the suspected
   4942       // load node must have exactly one user.
   4943       // Constants may have multiple users.
   4944       if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
   4945         return SDValue();
   4946       break;
   4947     }
   4948   }
   4949 
   4950   bool Is256 = VT.getSizeInBits() == 256;
   4951   bool Is128 = VT.getSizeInBits() == 128;
   4952 
    4953   // Handle broadcasting a single constant scalar from the constant pool
   4954   // into a vector. On Sandybridge it is still better to load a constant vector
   4955   // from the constant pool and not to broadcast it from a scalar.
   4956   if (ConstSplatVal && Subtarget->hasAVX2()) {
   4957     EVT CVT = Ld.getValueType();
   4958     assert(!CVT.isVector() && "Must not broadcast a vector type");
   4959     unsigned ScalarSize = CVT.getSizeInBits();
   4960 
   4961     if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) ||
   4962         (Is128 && (ScalarSize == 32))) {
   4963 
   4964       const Constant *C = 0;
   4965       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
   4966         C = CI->getConstantIntValue();
   4967       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
   4968         C = CF->getConstantFPValue();
   4969 
   4970       assert(C && "Invalid constant type");
   4971 
   4972       SDValue CP = DAG.getConstantPool(C, getPointerTy());
   4973       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   4974       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
   4975                          MachinePointerInfo::getConstantPool(),
   4976                          false, false, false, Alignment);
   4977 
   4978       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   4979     }
   4980   }
   4981 
   4982   // The scalar source must be a normal load.
   4983   if (!ISD::isNormalLoad(Ld.getNode()))
   4984     return SDValue();
   4985 
   4986   // Reject loads that have uses of the chain result
   4987   if (Ld->hasAnyUseOfValue(1))
   4988     return SDValue();
   4989 
   4990   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   4991 
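           // Without AVX2, vbroadcast only handles 32/64-bit elements into a 256-bit
           // vector and 32-bit elements into a 128-bit vector; the AVX2-only element
           // sizes are handled further below.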
   4992   // VBroadcast to YMM
   4993   if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
   4994     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   4995 
   4996   // VBroadcast to XMM
   4997   if (Is128 && (ScalarSize == 32))
   4998     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   4999 
    5000   // The integer check is needed for the 64-bit into 128-bit case, so it does
    5001   // not match double, since there is no vbroadcastsd with an xmm destination.
   5002   if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
   5003     // VBroadcast to YMM
   5004     if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
   5005       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5006 
   5007     // VBroadcast to XMM
   5008     if (Is128 && (ScalarSize ==  8 || ScalarSize == 16 || ScalarSize == 64))
   5009       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5010   }
   5011 
   5012   // Unsupported broadcast.
   5013   return SDValue();
   5014 }
   5015 
   5016 SDValue
   5017 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   5018   DebugLoc dl = Op.getDebugLoc();
   5019 
   5020   EVT VT = Op.getValueType();
   5021   EVT ExtVT = VT.getVectorElementType();
   5022   unsigned NumElems = Op.getNumOperands();
   5023 
   5024   // Vectors containing all zeros can be matched by pxor and xorps later
   5025   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   5026     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
   5027     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
   5028     if (VT == MVT::v4i32 || VT == MVT::v8i32)
   5029       return Op;
   5030 
   5031     return getZeroVector(VT, Subtarget, DAG, dl);
   5032   }
   5033 
   5034   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   5035   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   5036   // vpcmpeqd on 256-bit vectors.
   5037   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
   5038     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
   5039       return Op;
   5040 
   5041     return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
   5042   }
   5043 
   5044   SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
   5045   if (Broadcast.getNode())
   5046     return Broadcast;
   5047 
   5048   unsigned EVTBits = ExtVT.getSizeInBits();
   5049 
   5050   unsigned NumZero  = 0;
   5051   unsigned NumNonZero = 0;
   5052   unsigned NonZeros = 0;
   5053   bool IsAllConstants = true;
   5054   SmallSet<SDValue, 8> Values;
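           // Scan the operands: record which elements are non-zero in the NonZeros
           // bitmask, count zero and non-zero elements, collect the distinct values,
           // and note whether every defined element is a constant.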
   5055   for (unsigned i = 0; i < NumElems; ++i) {
   5056     SDValue Elt = Op.getOperand(i);
   5057     if (Elt.getOpcode() == ISD::UNDEF)
   5058       continue;
   5059     Values.insert(Elt);
   5060     if (Elt.getOpcode() != ISD::Constant &&
   5061         Elt.getOpcode() != ISD::ConstantFP)
   5062       IsAllConstants = false;
   5063     if (X86::isZeroNode(Elt))
   5064       NumZero++;
   5065     else {
   5066       NonZeros |= (1 << i);
   5067       NumNonZero++;
   5068     }
   5069   }
   5070 
    5071   // All-undef vector: return an UNDEF.  All-zero vectors were handled above.
   5072   if (NumNonZero == 0)
   5073     return DAG.getUNDEF(VT);
   5074 
   5075   // Special case for single non-zero, non-undef, element.
   5076   if (NumNonZero == 1) {
   5077     unsigned Idx = CountTrailingZeros_32(NonZeros);
   5078     SDValue Item = Op.getOperand(Idx);
   5079 
   5080     // If this is an insertion of an i64 value on x86-32, and if the top bits of
   5081     // the value are obviously zero, truncate the value to i32 and do the
   5082     // insertion that way.  Only do this if the value is non-constant or if the
   5083     // value is a constant being inserted into element 0.  It is cheaper to do
   5084     // a constant pool load than it is to do a movd + shuffle.
   5085     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
   5086         (!IsAllConstants || Idx == 0)) {
   5087       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
   5088         // Handle SSE only.
   5089         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
   5090         EVT VecVT = MVT::v4i32;
   5091         unsigned VecElts = 4;
   5092 
   5093         // Truncate the value (which may itself be a constant) to i32, and
   5094         // convert it to a vector with movd (S2V+shuffle to zero extend).
   5095         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
   5096         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
   5097         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5098 
   5099         // Now we have our 32-bit value zero extended in the low element of
   5100         // a vector.  If Idx != 0, swizzle it into place.
   5101         if (Idx != 0) {
   5102           SmallVector<int, 4> Mask;
   5103           Mask.push_back(Idx);
   5104           for (unsigned i = 1; i != VecElts; ++i)
   5105             Mask.push_back(i);
   5106           Item = DAG.getVectorShuffle(VecVT, dl, Item,
   5107                                       DAG.getUNDEF(Item.getValueType()),
   5108                                       &Mask[0]);
   5109         }
   5110         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   5111       }
   5112     }
   5113 
   5114     // If we have a constant or non-constant insertion into the low element of
   5115     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
   5116     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
   5117     // depending on what the source datatype is.
   5118     if (Idx == 0) {
   5119       if (NumZero == 0)
   5120         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5121 
   5122       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
   5123           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
   5124         if (VT.getSizeInBits() == 256) {
   5125           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
   5126           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
   5127                              Item, DAG.getIntPtrConstant(0));
   5128         }
   5129         assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
   5130         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5131         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
   5132         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5133       }
   5134 
   5135       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
   5136         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
   5137         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   5138         if (VT.getSizeInBits() == 256) {
   5139           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
   5140           Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
   5141                                     DAG, dl);
   5142         } else {
   5143           assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
   5144           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5145         }
   5146         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   5147       }
   5148     }
   5149 
   5150     // Is it a vector logical left shift?
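             // e.g. a v2i64 build_vector <0, x> becomes SCALAR_TO_VECTOR(x) shifted
             // left by half the vector width (lowered to a whole-register byte shift).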
   5151     if (NumElems == 2 && Idx == 1 &&
   5152         X86::isZeroNode(Op.getOperand(0)) &&
   5153         !X86::isZeroNode(Op.getOperand(1))) {
   5154       unsigned NumBits = VT.getSizeInBits();
   5155       return getVShift(true, VT,
   5156                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   5157                                    VT, Op.getOperand(1)),
   5158                        NumBits/2, DAG, *this, dl);
   5159     }
   5160 
   5161     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
   5162       return SDValue();
   5163 
   5164     // Otherwise, if this is a vector with i32 or f32 elements, and the element
   5165     // is a non-constant being inserted into an element other than the low one,
   5166     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
   5167     // movd/movss) to move this into the low element, then shuffle it into
   5168     // place.
   5169     if (EVTBits == 32) {
   5170       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5171 
   5172       // Turn it into a shuffle of zero and zero-extended scalar to vector.
   5173       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
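               // Move the scalar (element 0 of Item) into lane Idx; every other lane
               // takes element 1, which is zero (or undef) after the step above.
               // e.g. for Idx == 2 in a v4 vector the mask is <1, 1, 0, 1>.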
   5174       SmallVector<int, 8> MaskVec;
   5175       for (unsigned i = 0; i < NumElems; i++)
   5176         MaskVec.push_back(i == Idx ? 0 : 1);
   5177       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
   5178     }
   5179   }
   5180 
   5181   // Splat is obviously ok. Let legalizer expand it to a shuffle.
   5182   if (Values.size() == 1) {
   5183     if (EVTBits == 32) {
   5184       // Instead of a shuffle like this:
   5185       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
   5186       // Check if it's possible to issue this instead.
    5187       // shuffle (vload ptr), undef, <1, 1, 1, 1>
   5188       unsigned Idx = CountTrailingZeros_32(NonZeros);
   5189       SDValue Item = Op.getOperand(Idx);
   5190       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
   5191         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
   5192     }
   5193     return SDValue();
   5194   }
   5195 
   5196   // A vector full of immediates; various special cases are already
   5197   // handled, so this is best done with a single constant-pool load.
   5198   if (IsAllConstants)
   5199     return SDValue();
   5200 
   5201   // For AVX-length vectors, build the individual 128-bit pieces and use
   5202   // shuffles to put them in place.
   5203   if (VT.getSizeInBits() == 256) {
   5204     SmallVector<SDValue, 32> V;
   5205     for (unsigned i = 0; i != NumElems; ++i)
   5206       V.push_back(Op.getOperand(i));
   5207 
   5208     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
   5209 
   5210     // Build both the lower and upper subvector.
   5211     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
   5212     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
   5213                                 NumElems/2);
   5214 
   5215     // Recreate the wider vector with the lower and upper part.
   5216     SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
   5217                                 DAG.getConstant(0, MVT::i32), DAG, dl);
   5218     return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
   5219                               DAG, dl);
   5220   }
   5221 
   5222   // Let legalizer expand 2-wide build_vectors.
   5223   if (EVTBits == 64) {
   5224     if (NumNonZero == 1) {
   5225       // One half is zero or undef.
   5226       unsigned Idx = CountTrailingZeros_32(NonZeros);
   5227       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
   5228                                  Op.getOperand(Idx));
   5229       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
   5230     }
   5231     return SDValue();
   5232   }
   5233 
   5234   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   5235   if (EVTBits == 8 && NumElems == 16) {
   5236     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
   5237                                         Subtarget, *this);
   5238     if (V.getNode()) return V;
   5239   }
   5240 
   5241   if (EVTBits == 16 && NumElems == 8) {
   5242     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
   5243                                       Subtarget, *this);
   5244     if (V.getNode()) return V;
   5245   }
   5246 
   5247   // If element VT is == 32 bits, turn it into a number of shuffles.
   5248   SmallVector<SDValue, 8> V(NumElems);
   5249   if (NumElems == 4 && NumZero > 0) {
   5250     for (unsigned i = 0; i < 4; ++i) {
   5251       bool isZero = !(NonZeros & (1 << i));
   5252       if (isZero)
   5253         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
   5254       else
   5255         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   5256     }
   5257 
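             // Combine the elements pairwise. The 2-bit value for each pair encodes
             // which of its elements are non-zero: 0 = neither (keep the zero
             // vector), 1 = only the even element, 2 = only the odd element,
             // 3 = both (unpack them together).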
   5258     for (unsigned i = 0; i < 2; ++i) {
   5259       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
   5260         default: break;
   5261         case 0:
   5262           V[i] = V[i*2];  // Must be a zero vector.
   5263           break;
   5264         case 1:
   5265           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
   5266           break;
   5267         case 2:
   5268           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
   5269           break;
   5270         case 3:
   5271           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
   5272           break;
   5273       }
   5274     }
   5275 
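             // When only the odd element of a pair was non-zero (case 2 above), MOVL
             // left it in the pair's low slot, so the final shuffle swaps that pair.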
   5276     bool Reverse1 = (NonZeros & 0x3) == 2;
   5277     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
   5278     int MaskVec[] = {
   5279       Reverse1 ? 1 : 0,
   5280       Reverse1 ? 0 : 1,
   5281       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
   5282       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
   5283     };
   5284     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
   5285   }
   5286 
   5287   if (Values.size() > 1 && VT.getSizeInBits() == 128) {
   5288     // Check for a build vector of consecutive loads.
   5289     for (unsigned i = 0; i < NumElems; ++i)
   5290       V[i] = Op.getOperand(i);
   5291 
   5292     // Check for elements which are consecutive loads.
   5293     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
   5294     if (LD.getNode())
   5295       return LD;
   5296 
   5297     // For SSE 4.1, use insertps to put the high elements into the low element.
   5298     if (getSubtarget()->hasSSE41()) {
   5299       SDValue Result;
   5300       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
   5301         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
   5302       else
   5303         Result = DAG.getUNDEF(VT);
   5304 
   5305       for (unsigned i = 1; i < NumElems; ++i) {
   5306         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
   5307         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
   5308                              Op.getOperand(i), DAG.getIntPtrConstant(i));
   5309       }
   5310       return Result;
   5311     }
   5312 
   5313     // Otherwise, expand into a number of unpckl*, start by extending each of
   5314     // our (non-undef) elements to the full vector width with the element in the
   5315     // bottom slot of the vector (which generates no code for SSE).
   5316     for (unsigned i = 0; i < NumElems; ++i) {
   5317       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
   5318         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   5319       else
   5320         V[i] = DAG.getUNDEF(VT);
   5321     }
   5322 
   5323     // Next, we iteratively mix elements, e.g. for v4f32:
   5324     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
   5325     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
   5326     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
   5327     unsigned EltStride = NumElems >> 1;
   5328     while (EltStride != 0) {
   5329       for (unsigned i = 0; i < EltStride; ++i) {
   5330         // If V[i+EltStride] is undef and this is the first round of mixing,
   5331         // then it is safe to just drop this shuffle: V[i] is already in the
   5332         // right place, the one element (since it's the first round) being
   5333         // inserted as undef can be dropped.  This isn't safe for successive
   5334         // rounds because they will permute elements within both vectors.
   5335         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
   5336             EltStride == NumElems/2)
   5337           continue;
   5338 
   5339         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
   5340       }
   5341       EltStride >>= 1;
   5342     }
   5343     return V[0];
   5344   }
   5345   return SDValue();
   5346 }
   5347 
    5348 // LowerMMXCONCAT_VECTORS - We support concatenating two MMX registers and
    5349 // placing them in an MMX register.  This is better than doing a stack convert.
   5350 static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   5351   DebugLoc dl = Op.getDebugLoc();
   5352   EVT ResVT = Op.getValueType();
   5353 
   5354   assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
   5355          ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
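           // Move the first 64-bit operand into the low half of an XMM register with
           // MOVQ2DQ. If the second operand is a scalar_to_vector, insert its scalar
           // directly into the upper half; otherwise move it over as well and
           // interleave the two low quadwords with a <0, 2> shuffle.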
   5356   int Mask[2];
   5357   SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
   5358   SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
   5359   InVec = Op.getOperand(1);
   5360   if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
   5361     unsigned NumElts = ResVT.getVectorNumElements();
   5362     VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
   5363     VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
   5364                        InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
   5365   } else {
   5366     InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
   5367     SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
   5368     Mask[0] = 0; Mask[1] = 2;
   5369     VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
   5370   }
   5371   return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
   5372 }
   5373 
   5374 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
   5375 // to create 256-bit vectors from two other 128-bit ones.
   5376 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   5377   DebugLoc dl = Op.getDebugLoc();
   5378   EVT ResVT = Op.getValueType();
   5379 
   5380   assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
   5381 
   5382   SDValue V1 = Op.getOperand(0);
   5383   SDValue V2 = Op.getOperand(1);
   5384   unsigned NumElems = ResVT.getVectorNumElements();
   5385 
   5386   SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
   5387                                  DAG.getConstant(0, MVT::i32), DAG, dl);
   5388   return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
   5389                             DAG, dl);
   5390 }
   5391 
   5392 SDValue
   5393 X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   5394   EVT ResVT = Op.getValueType();
   5395 
   5396   assert(Op.getNumOperands() == 2);
   5397   assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
   5398          "Unsupported CONCAT_VECTORS for value type");
   5399 
    5400   // We support concatenating two MMX registers and placing them in an MMX
    5401   // register.  This is better than doing a stack convert.
   5402   if (ResVT.is128BitVector())
   5403     return LowerMMXCONCAT_VECTORS(Op, DAG);
   5404 
   5405   // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
   5406   // from two other 128-bit ones.
   5407   return LowerAVXCONCAT_VECTORS(Op, DAG);
   5408 }
   5409 
   5410 // Try to lower a shuffle node into a simple blend instruction.
   5411 static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op,
   5412                                           const X86Subtarget *Subtarget,
   5413                                           SelectionDAG &DAG) {
   5414   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5415   SDValue V1 = SVOp->getOperand(0);
   5416   SDValue V2 = SVOp->getOperand(1);
   5417   DebugLoc dl = SVOp->getDebugLoc();
   5418   EVT VT = Op.getValueType();
   5419   EVT InVT = V1.getValueType();
   5420   int MaskSize = VT.getVectorNumElements();
   5421   int InSize = InVT.getVectorNumElements();
   5422 
   5423   if (!Subtarget->hasSSE41())
   5424     return SDValue();
   5425 
   5426   if (MaskSize != InSize)
   5427     return SDValue();
   5428 
   5429   int ISDNo = 0;
   5430   MVT OpTy;
   5431 
   5432   switch (VT.getSimpleVT().SimpleTy) {
   5433   default: return SDValue();
   5434   case MVT::v8i16:
   5435            ISDNo = X86ISD::BLENDPW;
   5436            OpTy = MVT::v8i16;
   5437            break;
   5438   case MVT::v4i32:
   5439   case MVT::v4f32:
   5440            ISDNo = X86ISD::BLENDPS;
   5441            OpTy = MVT::v4f32;
   5442            break;
   5443   case MVT::v2i64:
   5444   case MVT::v2f64:
   5445            ISDNo = X86ISD::BLENDPD;
   5446            OpTy = MVT::v2f64;
   5447            break;
   5448   case MVT::v8i32:
   5449   case MVT::v8f32:
   5450            if (!Subtarget->hasAVX())
   5451              return SDValue();
   5452            ISDNo = X86ISD::BLENDPS;
   5453            OpTy = MVT::v8f32;
   5454            break;
   5455   case MVT::v4i64:
   5456   case MVT::v4f64:
   5457            if (!Subtarget->hasAVX())
   5458              return SDValue();
   5459            ISDNo = X86ISD::BLENDPD;
   5460            OpTy = MVT::v4f64;
   5461            break;
   5462   case MVT::v16i16:
   5463            if (!Subtarget->hasAVX2())
   5464              return SDValue();
   5465            ISDNo = X86ISD::BLENDPW;
   5466            OpTy = MVT::v16i16;
   5467            break;
   5468   }
   5469   assert(ISDNo && "Invalid Op Number");
   5470 
   5471   unsigned MaskVals = 0;
   5472 
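           // A shuffle can only become a blend when every element either stays in
           // place (comes from V1) or is taken from the same position in V2. Build
           // the blend immediate with bit i set when element i is kept from V1
           // (undef counts as V1).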
   5473   for (int i = 0; i < MaskSize; ++i) {
   5474     int EltIdx = SVOp->getMaskElt(i);
   5475     if (EltIdx == i || EltIdx == -1)
   5476       MaskVals |= (1<<i);
   5477     else if (EltIdx == (i + MaskSize))
   5478       continue; // Bit is set to zero;
   5479     else return SDValue();
   5480   }
   5481 
   5482   V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
   5483   V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
   5484   SDValue Ret =  DAG.getNode(ISDNo, dl, OpTy, V1, V2,
   5485                              DAG.getConstant(MaskVals, MVT::i32));
   5486   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
   5487 }
   5488 
   5489 // v8i16 shuffles - Prefer shuffles in the following order:
   5490 // 1. [all]   pshuflw, pshufhw, optional move
   5491 // 2. [ssse3] 1 x pshufb
   5492 // 3. [ssse3] 2 x pshufb + 1 x por
   5493 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
   5494 SDValue
   5495 X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
   5496                                             SelectionDAG &DAG) const {
   5497   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5498   SDValue V1 = SVOp->getOperand(0);
   5499   SDValue V2 = SVOp->getOperand(1);
   5500   DebugLoc dl = SVOp->getDebugLoc();
   5501   SmallVector<int, 8> MaskVals;
   5502 
   5503   // Determine if more than 1 of the words in each of the low and high quadwords
   5504   // of the result come from the same quadword of one of the two inputs.  Undef
   5505   // mask values count as coming from any quadword, for better codegen.
   5506   unsigned LoQuad[] = { 0, 0, 0, 0 };
   5507   unsigned HiQuad[] = { 0, 0, 0, 0 };
   5508   std::bitset<4> InputQuads;
   5509   for (unsigned i = 0; i < 8; ++i) {
   5510     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
   5511     int EltIdx = SVOp->getMaskElt(i);
   5512     MaskVals.push_back(EltIdx);
   5513     if (EltIdx < 0) {
   5514       ++Quad[0];
   5515       ++Quad[1];
   5516       ++Quad[2];
   5517       ++Quad[3];
   5518       continue;
   5519     }
   5520     ++Quad[EltIdx / 4];
   5521     InputQuads.set(EltIdx / 4);
   5522   }
   5523 
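           // For each half of the result, pick the input quadword that supplies the
           // most words; a quadword only qualifies if it supplies more than one word
           // (MaxQuad starts at 1).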
   5524   int BestLoQuad = -1;
   5525   unsigned MaxQuad = 1;
   5526   for (unsigned i = 0; i < 4; ++i) {
   5527     if (LoQuad[i] > MaxQuad) {
   5528       BestLoQuad = i;
   5529       MaxQuad = LoQuad[i];
   5530     }
   5531   }
   5532 
   5533   int BestHiQuad = -1;
   5534   MaxQuad = 1;
   5535   for (unsigned i = 0; i < 4; ++i) {
   5536     if (HiQuad[i] > MaxQuad) {
   5537       BestHiQuad = i;
   5538       MaxQuad = HiQuad[i];
   5539     }
   5540   }
   5541 
    5542   // For SSSE3, if all 8 words of the result come from only 1 quadword of each
    5543   // of the two input vectors, shuffle them into one input vector so only a
    5544   // single pshufb instruction is necessary. If there are more than 2 input
    5545   // quads, disable the next transformation since it does not help SSSE3.
   5546   bool V1Used = InputQuads[0] || InputQuads[1];
   5547   bool V2Used = InputQuads[2] || InputQuads[3];
   5548   if (Subtarget->hasSSSE3()) {
   5549     if (InputQuads.count() == 2 && V1Used && V2Used) {
   5550       BestLoQuad = InputQuads[0] ? 0 : 1;
   5551       BestHiQuad = InputQuads[2] ? 2 : 3;
   5552     }
   5553     if (InputQuads.count() > 2) {
   5554       BestLoQuad = -1;
   5555       BestHiQuad = -1;
   5556     }
   5557   }
   5558 
   5559   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
   5560   // the shuffle mask.  If a quad is scored as -1, that means that it contains
   5561   // words from all 4 input quadwords.
   5562   SDValue NewV;
   5563   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
   5564     int MaskV[] = {
   5565       BestLoQuad < 0 ? 0 : BestLoQuad,
   5566       BestHiQuad < 0 ? 1 : BestHiQuad
   5567     };
   5568     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
   5569                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
   5570                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
   5571     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
   5572 
   5573     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
   5574     // source words for the shuffle, to aid later transformations.
   5575     bool AllWordsInNewV = true;
   5576     bool InOrder[2] = { true, true };
   5577     for (unsigned i = 0; i != 8; ++i) {
   5578       int idx = MaskVals[i];
   5579       if (idx != (int)i)
   5580         InOrder[i/4] = false;
   5581       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
   5582         continue;
   5583       AllWordsInNewV = false;
   5584       break;
   5585     }
   5586 
   5587     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
   5588     if (AllWordsInNewV) {
   5589       for (int i = 0; i != 8; ++i) {
   5590         int idx = MaskVals[i];
   5591         if (idx < 0)
   5592           continue;
   5593         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
   5594         if ((idx != i) && idx < 4)
   5595           pshufhw = false;
   5596         if ((idx != i) && idx > 3)
   5597           pshuflw = false;
   5598       }
   5599       V1 = NewV;
   5600       V2Used = false;
   5601       BestLoQuad = 0;
   5602       BestHiQuad = 1;
   5603     }
   5604 
   5605     // If we've eliminated the use of V2, and the new mask is a pshuflw or
   5606     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
   5607     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
   5608       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
   5609       unsigned TargetMask = 0;
   5610       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
   5611                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
   5612       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5613       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
   5614                              getShufflePSHUFLWImmediate(SVOp);
   5615       V1 = NewV.getOperand(0);
   5616       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
   5617     }
   5618   }
   5619 
   5620   // If we have SSSE3, and all words of the result are from 1 input vector,
   5621   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
   5622   // is present, fall back to case 4.
   5623   if (Subtarget->hasSSSE3()) {
   5624     SmallVector<SDValue,16> pshufbMask;
   5625 
   5626     // If we have elements from both input vectors, set the high bit of the
   5627     // shuffle mask element to zero out elements that come from V2 in the V1
   5628     // mask, and elements that come from V1 in the V2 mask, so that the two
   5629     // results can be OR'd together.
   5630     bool TwoInputs = V1Used && V2Used;
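             // pshufb selects bytes, so each 16-bit mask entry expands into two byte
             // indices (EltIdx*2 and EltIdx*2+1); a control byte of 0x80 zeroes the
             // corresponding result byte.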
   5631     for (unsigned i = 0; i != 8; ++i) {
   5632       int EltIdx = MaskVals[i] * 2;
   5633       if (TwoInputs && (EltIdx >= 16)) {
   5634         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5635         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5636         continue;
   5637       }
   5638       pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
   5639       pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
   5640     }
   5641     V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
   5642     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
   5643                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5644                                  MVT::v16i8, &pshufbMask[0], 16));
   5645     if (!TwoInputs)
   5646       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   5647 
   5648     // Calculate the shuffle mask for the second input, shuffle it, and
   5649     // OR it with the first shuffled input.
   5650     pshufbMask.clear();
   5651     for (unsigned i = 0; i != 8; ++i) {
   5652       int EltIdx = MaskVals[i] * 2;
   5653       if (EltIdx < 16) {
   5654         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5655         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5656         continue;
   5657       }
   5658       pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
   5659       pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
   5660     }
   5661     V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
   5662     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
   5663                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5664                                  MVT::v16i8, &pshufbMask[0], 16));
   5665     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   5666     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   5667   }
   5668 
   5669   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
   5670   // and update MaskVals with new element order.
   5671   std::bitset<8> InOrder;
   5672   if (BestLoQuad >= 0) {
   5673     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
   5674     for (int i = 0; i != 4; ++i) {
   5675       int idx = MaskVals[i];
   5676       if (idx < 0) {
   5677         InOrder.set(i);
   5678       } else if ((idx / 4) == BestLoQuad) {
   5679         MaskV[i] = idx & 3;
   5680         InOrder.set(i);
   5681       }
   5682     }
   5683     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
   5684                                 &MaskV[0]);
   5685 
   5686     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
   5687       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5688       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
   5689                                   NewV.getOperand(0),
   5690                                   getShufflePSHUFLWImmediate(SVOp), DAG);
   5691     }
   5692   }
   5693 
   5694   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
   5695   // and update MaskVals with the new element order.
   5696   if (BestHiQuad >= 0) {
   5697     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
   5698     for (unsigned i = 4; i != 8; ++i) {
   5699       int idx = MaskVals[i];
   5700       if (idx < 0) {
   5701         InOrder.set(i);
   5702       } else if ((idx / 4) == BestHiQuad) {
   5703         MaskV[i] = (idx & 3) + 4;
   5704         InOrder.set(i);
   5705       }
   5706     }
   5707     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
   5708                                 &MaskV[0]);
   5709 
   5710     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
   5711       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5712       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
   5713                                   NewV.getOperand(0),
   5714                                   getShufflePSHUFHWImmediate(SVOp), DAG);
   5715     }
   5716   }
   5717 
   5718   // In case BestHi & BestLo were both -1, which means each quadword has a word
   5719   // from each of the four input quadwords, calculate the InOrder bitvector now
   5720   // before falling through to the insert/extract cleanup.
   5721   if (BestLoQuad == -1 && BestHiQuad == -1) {
   5722     NewV = V1;
   5723     for (int i = 0; i != 8; ++i)
   5724       if (MaskVals[i] < 0 || MaskVals[i] == i)
   5725         InOrder.set(i);
   5726   }
   5727 
   5728   // The other elements are put in the right place using pextrw and pinsrw.
   5729   for (unsigned i = 0; i != 8; ++i) {
   5730     if (InOrder[i])
   5731       continue;
   5732     int EltIdx = MaskVals[i];
   5733     if (EltIdx < 0)
   5734       continue;
   5735     SDValue ExtOp = (EltIdx < 8)
   5736     ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
   5737                   DAG.getIntPtrConstant(EltIdx))
   5738     : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
   5739                   DAG.getIntPtrConstant(EltIdx - 8));
   5740     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
   5741                        DAG.getIntPtrConstant(i));
   5742   }
   5743   return NewV;
   5744 }
   5745 
   5746 // v16i8 shuffles - Prefer shuffles in the following order:
   5747 // 1. [ssse3] 1 x pshufb
   5748 // 2. [ssse3] 2 x pshufb + 1 x por
   5749 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
   5750 static
   5751 SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   5752                                  SelectionDAG &DAG,
   5753                                  const X86TargetLowering &TLI) {
   5754   SDValue V1 = SVOp->getOperand(0);
   5755   SDValue V2 = SVOp->getOperand(1);
   5756   DebugLoc dl = SVOp->getDebugLoc();
   5757   ArrayRef<int> MaskVals = SVOp->getMask();
   5758 
   5759   // If we have SSSE3, case 1 is generated when all result bytes come from
   5760   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
   5761   // present, fall back to case 3.
    5762   // FIXME: kill V2Only once shuffles are canonicalized by getNode.
   5763   bool V1Only = true;
   5764   bool V2Only = true;
   5765   for (unsigned i = 0; i < 16; ++i) {
   5766     int EltIdx = MaskVals[i];
   5767     if (EltIdx < 0)
   5768       continue;
   5769     if (EltIdx < 16)
   5770       V2Only = false;
   5771     else
   5772       V1Only = false;
   5773   }
   5774 
   5775   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
   5776   if (TLI.getSubtarget()->hasSSSE3()) {
   5777     SmallVector<SDValue,16> pshufbMask;
   5778 
   5779     // If all result elements are from one input vector, then only translate
   5780     // undef mask values to 0x80 (zero out result) in the pshufb mask.
   5781     //
   5782     // Otherwise, we have elements from both input vectors, and must zero out
   5783     // elements that come from V2 in the first mask, and V1 in the second mask
   5784     // so that we can OR them together.
   5785     bool TwoInputs = !(V1Only || V2Only);
   5786     for (unsigned i = 0; i != 16; ++i) {
   5787       int EltIdx = MaskVals[i];
   5788       if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
   5789         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5790         continue;
   5791       }
   5792       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
   5793     }
   5794     // If all the elements are from V2, assign it to V1 and return after
   5795     // building the first pshufb.
   5796     if (V2Only)
   5797       V1 = V2;
   5798     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
   5799                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5800                                  MVT::v16i8, &pshufbMask[0], 16));
   5801     if (!TwoInputs)
   5802       return V1;
   5803 
   5804     // Calculate the shuffle mask for the second input, shuffle it, and
   5805     // OR it with the first shuffled input.
   5806     pshufbMask.clear();
   5807     for (unsigned i = 0; i != 16; ++i) {
   5808       int EltIdx = MaskVals[i];
   5809       if (EltIdx < 16) {
   5810         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5811         continue;
   5812       }
   5813       pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
   5814     }
   5815     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
   5816                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5817                                  MVT::v16i8, &pshufbMask[0], 16));
   5818     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   5819   }
   5820 
    5821   // No SSSE3 - Calculate the in-place words, then fix all out-of-place words
    5822   // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
    5823   // the 16 different words that comprise the two doublequadword input vectors.
   5824   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   5825   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
   5826   SDValue NewV = V2Only ? V2 : V1;
   5827   for (int i = 0; i != 8; ++i) {
   5828     int Elt0 = MaskVals[i*2];
   5829     int Elt1 = MaskVals[i*2+1];
   5830 
   5831     // This word of the result is all undef, skip it.
   5832     if (Elt0 < 0 && Elt1 < 0)
   5833       continue;
   5834 
   5835     // This word of the result is already in the correct place, skip it.
   5836     if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
   5837       continue;
   5838     if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
   5839       continue;
   5840 
   5841     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
   5842     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
   5843     SDValue InsElt;
   5844 
    5845     // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
    5846     // together using a single extract, load the word and insert it.
   5847     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
   5848       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
   5849                            DAG.getIntPtrConstant(Elt1 / 2));
   5850       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
   5851                         DAG.getIntPtrConstant(i));
   5852       continue;
   5853     }
   5854 
    5855     // If Elt1 is defined, extract it from the appropriate source.  If the
    5856     // source byte is not also odd, shift the extracted word left 8 bits;
    5857     // otherwise clear the bottom 8 bits if we need to do an OR.
   5858     if (Elt1 >= 0) {
   5859       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
   5860                            DAG.getIntPtrConstant(Elt1 / 2));
   5861       if ((Elt1 & 1) == 0)
   5862         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
   5863                              DAG.getConstant(8,
   5864                                   TLI.getShiftAmountTy(InsElt.getValueType())));
   5865       else if (Elt0 >= 0)
   5866         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
   5867                              DAG.getConstant(0xFF00, MVT::i16));
   5868     }
   5869     // If Elt0 is defined, extract it from the appropriate source.  If the
   5870     // source byte is not also even, shift the extracted word right 8 bits. If
   5871     // Elt1 was also defined, OR the extracted values together before
   5872     // inserting them in the result.
   5873     if (Elt0 >= 0) {
   5874       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
   5875                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
   5876       if ((Elt0 & 1) != 0)
   5877         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
   5878                               DAG.getConstant(8,
   5879                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
   5880       else if (Elt1 >= 0)
   5881         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
   5882                              DAG.getConstant(0x00FF, MVT::i16));
   5883       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
   5884                          : InsElt0;
   5885     }
   5886     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
   5887                        DAG.getIntPtrConstant(i));
   5888   }
   5889   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
   5890 }
   5891 
   5892 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
   5893 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
    5894 /// done when every pair / quad of shuffle mask elements points to elements in
   5895 /// the right sequence. e.g.
   5896 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
   5897 static
   5898 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
   5899                                  SelectionDAG &DAG, DebugLoc dl) {
   5900   EVT VT = SVOp->getValueType(0);
   5901   SDValue V1 = SVOp->getOperand(0);
   5902   SDValue V2 = SVOp->getOperand(1);
   5903   unsigned NumElems = VT.getVectorNumElements();
   5904   unsigned NewWidth = (NumElems == 4) ? 2 : 4;
   5905   EVT NewVT;
   5906   switch (VT.getSimpleVT().SimpleTy) {
   5907   default: llvm_unreachable("Unexpected!");
   5908   case MVT::v4f32: NewVT = MVT::v2f64; break;
   5909   case MVT::v4i32: NewVT = MVT::v2i64; break;
   5910   case MVT::v8i16: NewVT = MVT::v4i32; break;
   5911   case MVT::v16i8: NewVT = MVT::v4i32; break;
   5912   }
   5913 
   5914   int Scale = NumElems / NewWidth;
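           // Each group of Scale consecutive mask entries must reference Scale
           // consecutive source elements; the group then collapses to one wide index.
           // e.g. for v8i16 -> v4i32, <2, 3, 10, 11, 0, 1, 14, 15> becomes <1, 5, 0, 7>.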
   5915   SmallVector<int, 8> MaskVec;
   5916   for (unsigned i = 0; i < NumElems; i += Scale) {
   5917     int StartIdx = -1;
   5918     for (int j = 0; j < Scale; ++j) {
   5919       int EltIdx = SVOp->getMaskElt(i+j);
   5920       if (EltIdx < 0)
   5921         continue;
   5922       if (StartIdx == -1)
   5923         StartIdx = EltIdx - (EltIdx % Scale);
   5924       if (EltIdx != StartIdx + j)
   5925         return SDValue();
   5926     }
   5927     if (StartIdx == -1)
   5928       MaskVec.push_back(-1);
   5929     else
   5930       MaskVec.push_back(StartIdx / Scale);
   5931   }
   5932 
   5933   V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
   5934   V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
   5935   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
   5936 }
   5937 
   5938 /// getVZextMovL - Return a zero-extending vector move low node.
   5939 ///
   5940 static SDValue getVZextMovL(EVT VT, EVT OpVT,
   5941                             SDValue SrcOp, SelectionDAG &DAG,
   5942                             const X86Subtarget *Subtarget, DebugLoc dl) {
   5943   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
   5944     LoadSDNode *LD = NULL;
   5945     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
   5946       LD = dyn_cast<LoadSDNode>(SrcOp);
   5947     if (!LD) {
   5948       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
   5949       // instead.
   5950       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
   5951       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
   5952           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   5953           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
   5954           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
   5955         // PR2108
   5956         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
   5957         return DAG.getNode(ISD::BITCAST, dl, VT,
   5958                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
   5959                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   5960                                                    OpVT,
   5961                                                    SrcOp.getOperand(0)
   5962                                                           .getOperand(0))));
   5963       }
   5964     }
   5965   }
   5966 
   5967   return DAG.getNode(ISD::BITCAST, dl, VT,
   5968                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
   5969                                  DAG.getNode(ISD::BITCAST, dl,
   5970                                              OpVT, SrcOp)));
   5971 }
   5972 
    5973 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles that
    5974 /// could not be matched by any known target-specific shuffle.
   5975 static SDValue
   5976 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   5977   EVT VT = SVOp->getValueType(0);
   5978 
   5979   unsigned NumElems = VT.getVectorNumElements();
   5980   unsigned NumLaneElems = NumElems / 2;
   5981 
   5982   DebugLoc dl = SVOp->getDebugLoc();
   5983   MVT EltVT = VT.getVectorElementType().getSimpleVT();
   5984   EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
   5985   SDValue Shufs[2];
   5986 
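           // Handle the two 128-bit lanes of the result independently: each lane is
           // built as a 128-bit shuffle of at most two 128-bit halves of the inputs,
           // and the two lane results are concatenated at the end.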
   5987   SmallVector<int, 16> Mask;
   5988   for (unsigned l = 0; l < 2; ++l) {
   5989     // Build a shuffle mask for the output, discovering on the fly which
   5990     // input vectors to use as shuffle operands (recorded in InputUsed).
   5991     // If building a suitable shuffle vector proves too hard, then bail
   5992     // out with useBuildVector set.
   5993     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
   5994     unsigned LaneStart = l * NumLaneElems;
   5995     for (unsigned i = 0; i != NumLaneElems; ++i) {
   5996       // The mask element.  This indexes into the input.
   5997       int Idx = SVOp->getMaskElt(i+LaneStart);
   5998       if (Idx < 0) {
   5999         // the mask element does not index into any input vector.
   6000         Mask.push_back(-1);
   6001         continue;
   6002       }
   6003 
   6004       // The input vector this mask element indexes into.
   6005       int Input = Idx / NumLaneElems;
   6006 
   6007       // Turn the index into an offset from the start of the input vector.
   6008       Idx -= Input * NumLaneElems;
   6009 
   6010       // Find or create a shuffle vector operand to hold this input.
   6011       unsigned OpNo;
   6012       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
   6013         if (InputUsed[OpNo] == Input)
   6014           // This input vector is already an operand.
   6015           break;
   6016         if (InputUsed[OpNo] < 0) {
   6017           // Create a new operand for this input vector.
   6018           InputUsed[OpNo] = Input;
   6019           break;
   6020         }
   6021       }
   6022 
   6023       if (OpNo >= array_lengthof(InputUsed)) {
   6024         // More than two input vectors used! Give up.
   6025         return SDValue();
   6026       }
   6027 
   6028       // Add the mask index for the new shuffle vector.
   6029       Mask.push_back(Idx + OpNo * NumLaneElems);
   6030     }
   6031 
   6032     if (InputUsed[0] < 0) {
   6033       // No input vectors were used! The result is undefined.
   6034       Shufs[l] = DAG.getUNDEF(NVT);
   6035     } else {
   6036       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
   6037                    DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32),
   6038                                    DAG, dl);
   6039       // If only one input was used, use an undefined vector for the other.
   6040       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
   6041         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
   6042                    DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32),
   6043                                    DAG, dl);
   6044       // At least one input vector was used. Create a new shuffle vector.
   6045       Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
   6046     }
   6047 
   6048     Mask.clear();
   6049   }
   6050 
   6051   // Concatenate the result back
   6052   SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0],
   6053                                  DAG.getConstant(0, MVT::i32), DAG, dl);
   6054   return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32),
   6055                             DAG, dl);
   6056 }
   6057 
   6058 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
   6059 /// 4 elements, and match them with several different shuffle types.
   6060 static SDValue
   6061 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   6062   SDValue V1 = SVOp->getOperand(0);
   6063   SDValue V2 = SVOp->getOperand(1);
   6064   DebugLoc dl = SVOp->getDebugLoc();
   6065   EVT VT = SVOp->getValueType(0);
   6066 
   6067   assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
   6068 
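           // Locs[i] remembers where the intermediate shuffles place result element
           // i (or <-1,-1> if it is undef), so the final mask can be rebuilt later.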
   6069   std::pair<int, int> Locs[4];
   6070   int Mask1[] = { -1, -1, -1, -1 };
   6071   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
   6072 
   6073   unsigned NumHi = 0;
   6074   unsigned NumLo = 0;
   6075   for (unsigned i = 0; i != 4; ++i) {
   6076     int Idx = PermMask[i];
   6077     if (Idx < 0) {
   6078       Locs[i] = std::make_pair(-1, -1);
   6079     } else {
   6080       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
   6081       if (Idx < 4) {
   6082         Locs[i] = std::make_pair(0, NumLo);
   6083         Mask1[NumLo] = Idx;
   6084         NumLo++;
   6085       } else {
   6086         Locs[i] = std::make_pair(1, NumHi);
   6087         if (2+NumHi < 4)
   6088           Mask1[2+NumHi] = Idx;
   6089         NumHi++;
   6090       }
   6091     }
   6092   }
   6093 
   6094   if (NumLo <= 2 && NumHi <= 2) {
    6095     // No more than two elements come from either vector. This can be
    6096     // implemented with two shuffles. The first shuffle gathers the elements.
    6097     // The second shuffle, which takes the first shuffle as both of its
    6098     // vector operands, puts the elements into the right order.
   6099     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6100 
   6101     int Mask2[] = { -1, -1, -1, -1 };
   6102 
   6103     for (unsigned i = 0; i != 4; ++i)
   6104       if (Locs[i].first != -1) {
   6105         unsigned Idx = (i < 2) ? 0 : 4;
   6106         Idx += Locs[i].first * 2 + Locs[i].second;
   6107         Mask2[i] = Idx;
   6108       }
   6109 
   6110     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
   6111   } else if (NumLo == 3 || NumHi == 3) {
   6112     // Otherwise, we must have three elements from one vector, call it X, and
   6113     // one element from the other, call it Y.  First, use a shufps to build an
   6114     // intermediate vector with the one element from Y and the element from X
   6115     // that will be in the same half in the final destination (the indexes don't
   6116     // matter). Then, use a shufps to build the final vector, taking the half
   6117     // containing the element from Y from the intermediate, and the other half
   6118     // from X.
   6119     if (NumHi == 3) {
   6120       // Normalize it so the 3 elements come from V1.
   6121       CommuteVectorShuffleMask(PermMask, 4);
   6122       std::swap(V1, V2);
   6123     }
   6124 
   6125     // Find the element from V2.
   6126     unsigned HiIndex;
   6127     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
   6128       int Val = PermMask[HiIndex];
   6129       if (Val < 0)
   6130         continue;
   6131       if (Val >= 4)
   6132         break;
   6133     }
   6134 
   6135     Mask1[0] = PermMask[HiIndex];
   6136     Mask1[1] = -1;
   6137     Mask1[2] = PermMask[HiIndex^1];
   6138     Mask1[3] = -1;
   6139     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6140 
   6141     if (HiIndex >= 2) {
   6142       Mask1[0] = PermMask[0];
   6143       Mask1[1] = PermMask[1];
   6144       Mask1[2] = HiIndex & 1 ? 6 : 4;
   6145       Mask1[3] = HiIndex & 1 ? 4 : 6;
   6146       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6147     } else {
   6148       Mask1[0] = HiIndex & 1 ? 2 : 0;
   6149       Mask1[1] = HiIndex & 1 ? 0 : 2;
   6150       Mask1[2] = PermMask[2];
   6151       Mask1[3] = PermMask[3];
   6152       if (Mask1[2] >= 0)
   6153         Mask1[2] += 4;
   6154       if (Mask1[3] >= 0)
   6155         Mask1[3] += 4;
   6156       return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
   6157     }
   6158   }
   6159 
   6160   // Break it into (shuffle shuffle_hi, shuffle_lo).
   6161   int LoMask[] = { -1, -1, -1, -1 };
   6162   int HiMask[] = { -1, -1, -1, -1 };
   6163 
   6164   int *MaskPtr = LoMask;
   6165   unsigned MaskIdx = 0;
   6166   unsigned LoIdx = 0;
   6167   unsigned HiIdx = 2;
   6168   for (unsigned i = 0; i != 4; ++i) {
   6169     if (i == 2) {
   6170       MaskPtr = HiMask;
   6171       MaskIdx = 1;
   6172       LoIdx = 0;
   6173       HiIdx = 2;
   6174     }
   6175     int Idx = PermMask[i];
   6176     if (Idx < 0) {
   6177       Locs[i] = std::make_pair(-1, -1);
   6178     } else if (Idx < 4) {
   6179       Locs[i] = std::make_pair(MaskIdx, LoIdx);
   6180       MaskPtr[LoIdx] = Idx;
   6181       LoIdx++;
   6182     } else {
   6183       Locs[i] = std::make_pair(MaskIdx, HiIdx);
   6184       MaskPtr[HiIdx] = Idx;
   6185       HiIdx++;
   6186     }
   6187   }
   6188 
   6189   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
   6190   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
   6191   int MaskOps[] = { -1, -1, -1, -1 };
   6192   for (unsigned i = 0; i != 4; ++i)
   6193     if (Locs[i].first != -1)
   6194       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
   6195   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
   6196 }
   6197 
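         /// MayFoldVectorLoad - Return true if V is a load that may be folded into
         /// its single user, looking through one-use BITCAST, SCALAR_TO_VECTOR and
         /// (BUILD_VECTOR load, undef) wrappers.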
   6198 static bool MayFoldVectorLoad(SDValue V) {
   6199   if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
   6200     V = V.getOperand(0);
   6201   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   6202     V = V.getOperand(0);
   6203   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
   6204       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
   6205     // BUILD_VECTOR (load), undef
   6206     V = V.getOperand(0);
   6207   if (MayFoldLoad(V))
   6208     return true;
   6209   return false;
   6210 }
   6211 
   6212 // FIXME: the version above should always be used. Since there's
   6213 // a bug where several vector shuffles can't be folded because the
   6214 // DAG is not updated during lowering and a node claims to have two
   6215 // uses while it only has one, use this version, and let isel match
   6216 // another instruction if the load really happens to have more than
    6217 // one use. Remove this version after this bug gets fixed.
   6218 // rdar://8434668, PR8156
   6219 static bool RelaxedMayFoldVectorLoad(SDValue V) {
   6220   if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
   6221     V = V.getOperand(0);
   6222   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   6223     V = V.getOperand(0);
   6224   if (ISD::isNormalLoad(V.getNode()))
   6225     return true;
   6226   return false;
   6227 }
   6228 
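         /// getMOVDDup - Emit a MOVDDUP for V1 by canonicalizing the operand to
         /// v2f64, duplicating the low element, and bitcasting the result back to
         /// the original type.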
   6229 static
   6230 SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
   6231   EVT VT = Op.getValueType();
   6232 
    6233   // Canonicalize to v2f64.
   6234   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
   6235   return DAG.getNode(ISD::BITCAST, dl, VT,
   6236                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
   6237                                           V1, DAG));
   6238 }
   6239 
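         /// getMOVLowToHigh - Emit a shuffle that places the low quadword of V2 into
         /// the high quadword of the result, using MOVLHPD for v2f64 with SSE2 and
         /// MOVLHPS on v4f32 otherwise.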
   6240 static
   6241 SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
   6242                         bool HasSSE2) {
   6243   SDValue V1 = Op.getOperand(0);
   6244   SDValue V2 = Op.getOperand(1);
   6245   EVT VT = Op.getValueType();
   6246 
   6247   assert(VT != MVT::v2i64 && "unsupported shuffle type");
   6248 
   6249   if (HasSSE2 && VT == MVT::v2f64)
   6250     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
   6251 
    6252   // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
   6253   return DAG.getNode(ISD::BITCAST, dl, VT,
   6254                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
   6255                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
   6256                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
   6257 }
   6258 
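         /// getMOVHighToLow - Emit a MOVHLPS for a v4i32/v4f32 shuffle that moves
         /// the high quadword of V2 into the low quadword of the result.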
   6259 static
   6260 SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
   6261   SDValue V1 = Op.getOperand(0);
   6262   SDValue V2 = Op.getOperand(1);
   6263   EVT VT = Op.getValueType();
   6264 
   6265   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
   6266          "unsupported shuffle type");
   6267 
   6268   if (V2.getOpcode() == ISD::UNDEF)
   6269     V2 = V1;
   6270 
   6271   // v4i32 or v4f32
   6272   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
   6273 }
   6274 
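         /// getMOVLP - Lower a shuffle matching a MOVLP mask: prefer MOVLPS/MOVLPD
         /// when a load can be folded, otherwise fall back to MOVSD/MOVSS (SSE2) or
         /// a commuted SHUFP.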
   6275 static
   6276 SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   6277   SDValue V1 = Op.getOperand(0);
   6278   SDValue V2 = Op.getOperand(1);
   6279   EVT VT = Op.getValueType();
   6280   unsigned NumElems = VT.getVectorNumElements();
   6281 
   6282   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
   6283   // operand of these instructions is only memory, so check if there's a
    6284   // potential load folding here, otherwise use SHUFPS or MOVSD to match the
   6285   // same masks.
   6286   bool CanFoldLoad = false;
   6287 
   6288   // Trivial case, when V2 comes from a load.
   6289   if (MayFoldVectorLoad(V2))
   6290     CanFoldLoad = true;
   6291 
   6292   // When V1 is a load, it can be folded later into a store in isel, example:
   6293   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
   6294   //    turns into:
   6295   //  (MOVLPSmr addr:$src1, VR128:$src2)
   6296   // So, recognize this potential and also use MOVLPS or MOVLPD
   6297   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
   6298     CanFoldLoad = true;
   6299 
   6300   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6301   if (CanFoldLoad) {
   6302     if (HasSSE2 && NumElems == 2)
   6303       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
   6304 
   6305     if (NumElems == 4)
    6306       // If we don't care about the second element, proceed to use movss.
   6307       if (SVOp->getMaskElt(1) != -1)
   6308         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
   6309   }
   6310 
   6311   // movl and movlp will both match v2i64, but v2i64 is never matched by
   6312   // movl earlier because we make it strict to avoid messing with the movlp load
    6313   // folding logic (see the code above the getMOVLP call). Match it here then;
    6314   // this is horrible, but it will stay like this until we move all shuffle
    6315   // matching to x86-specific nodes. Note that for the 1st condition all
   6316   // types are matched with movsd.
   6317   if (HasSSE2) {
   6318     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
   6319     // as to remove this logic from here, as much as possible
   6320     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
   6321       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
   6322     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
   6323   }
   6324 
   6325   assert(VT != MVT::v4i32 && "unsupported shuffle type");
   6326 
   6327   // Invert the operand order and use SHUFPS to match it.
   6328   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
   6329                               getShuffleSHUFImmediate(SVOp), DAG);
   6330 }
   6331 
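         /// NormalizeVectorShuffle - Simplify a generic VECTOR_SHUFFLE before target
         /// matching: handle zero shuffles, splats (broadcast or promotion) and
         /// profitable narrowing. Returns an empty SDValue if nothing applies.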
   6332 SDValue
   6333 X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
   6334   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6335   EVT VT = Op.getValueType();
   6336   DebugLoc dl = Op.getDebugLoc();
   6337   SDValue V1 = Op.getOperand(0);
   6338   SDValue V2 = Op.getOperand(1);
   6339 
   6340   if (isZeroShuffle(SVOp))
   6341     return getZeroVector(VT, Subtarget, DAG, dl);
   6342 
   6343   // Handle splat operations
   6344   if (SVOp->isSplat()) {
   6345     unsigned NumElem = VT.getVectorNumElements();
   6346     int Size = VT.getSizeInBits();
   6347 
   6348     // Use vbroadcast whenever the splat comes from a foldable load
   6349     SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
   6350     if (Broadcast.getNode())
   6351       return Broadcast;
   6352 
   6353     // Handle splats by matching through known shuffle masks
   6354     if ((Size == 128 && NumElem <= 4) ||
   6355         (Size == 256 && NumElem < 8))
   6356       return SDValue();
   6357 
    6358     // All remaining splats are promoted to target-supported vector shuffles.
   6359     return PromoteSplat(SVOp, DAG);
   6360   }
   6361 
   6362   // If the shuffle can be profitably rewritten as a narrower shuffle, then
   6363   // do it!
   6364   if (VT == MVT::v8i16 || VT == MVT::v16i8) {
   6365     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
   6366     if (NewOp.getNode())
   6367       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
   6368   } else if ((VT == MVT::v4i32 ||
   6369              (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
   6370     // FIXME: Figure out a cleaner way to do this.
   6371     // Try to make use of movq to zero out the top part.
   6372     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
   6373       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
   6374       if (NewOp.getNode()) {
   6375         EVT NewVT = NewOp.getValueType();
   6376         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
   6377                                NewVT, true, false))
   6378           return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
   6379                               DAG, Subtarget, dl);
   6380       }
   6381     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
   6382       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
   6383       if (NewOp.getNode()) {
   6384         EVT NewVT = NewOp.getValueType();
   6385         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
   6386           return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
   6387                               DAG, Subtarget, dl);
   6388       }
   6389     }
   6390   }
   6391   return SDValue();
   6392 }
   6393 
   6394 SDValue
   6395 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   6396   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6397   SDValue V1 = Op.getOperand(0);
   6398   SDValue V2 = Op.getOperand(1);
   6399   EVT VT = Op.getValueType();
   6400   DebugLoc dl = Op.getDebugLoc();
   6401   unsigned NumElems = VT.getVectorNumElements();
   6402   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   6403   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   6404   bool V1IsSplat = false;
   6405   bool V2IsSplat = false;
   6406   bool HasSSE2 = Subtarget->hasSSE2();
   6407   bool HasAVX    = Subtarget->hasAVX();
   6408   bool HasAVX2   = Subtarget->hasAVX2();
   6409   MachineFunction &MF = DAG.getMachineFunction();
   6410   bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
   6411 
   6412   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
   6413 
   6414   if (V1IsUndef && V2IsUndef)
   6415     return DAG.getUNDEF(VT);
   6416 
   6417   assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
   6418 
   6419   // Vector shuffle lowering takes 3 steps:
   6420   //
   6421   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
   6422   //    narrowing and commutation of operands should be handled.
   6423   // 2) Matching of shuffles with known shuffle masks to x86 target specific
   6424   //    shuffle nodes.
   6425   // 3) Rewriting of unmatched masks into new generic shuffle operations,
   6426   //    so the shuffle can be broken into other shuffles and the legalizer can
   6427   //    try the lowering again.
   6428   //
   6429   // The general idea is that no vector_shuffle operation should be left to
   6430   // be matched during isel, all of them must be converted to a target specific
   6431   // node here.
   6432 
   6433   // Normalize the input vectors. Here splats, zeroed vectors, profitable
   6434   // narrowing and commutation of operands should be handled. The actual code
   6435   // doesn't include all of those, work in progress...
   6436   SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
   6437   if (NewOp.getNode())
   6438     return NewOp;
   6439 
   6440   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
   6441 
   6442   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
   6443   // unpckh_undef). Only use pshufd if speed is more important than size.
   6444   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
   6445     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   6446   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
   6447     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   6448 
   6449   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
   6450       V2IsUndef && RelaxedMayFoldVectorLoad(V1))
   6451     return getMOVDDup(Op, dl, V1, DAG);
   6452 
   6453   if (isMOVHLPS_v_undef_Mask(M, VT))
   6454     return getMOVHighToLow(Op, dl, DAG);
   6455 
    6456   // Used to match splats
   6457   if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef &&
   6458       (VT == MVT::v2f64 || VT == MVT::v2i64))
   6459     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   6460 
   6461   if (isPSHUFDMask(M, VT)) {
    6462     // The actual implementation will match the mask in the if above, and then
    6463     // during isel it can match several different instructions, not only pshufd
    6464     // as its name suggests. Sad but true; emulate the behavior for now...
   6465     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
   6466       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
   6467 
   6468     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
   6469 
   6470     if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
   6471       return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
   6472 
   6473     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
   6474       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
   6475 
   6476     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
   6477                                 TargetMask, DAG);
   6478   }
   6479 
   6480   // Check if this can be converted into a logical shift.
   6481   bool isLeft = false;
   6482   unsigned ShAmt = 0;
   6483   SDValue ShVal;
   6484   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
   6485   if (isShift && ShVal.hasOneUse()) {
   6486     // If the shifted value has multiple uses, it may be cheaper to use
   6487     // v_set0 + movlhps or movhlps, etc.
   6488     EVT EltVT = VT.getVectorElementType();
   6489     ShAmt *= EltVT.getSizeInBits();
   6490     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
   6491   }
   6492 
   6493   if (isMOVLMask(M, VT)) {
   6494     if (ISD::isBuildVectorAllZeros(V1.getNode()))
   6495       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
   6496     if (!isMOVLPMask(M, VT)) {
   6497       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
   6498         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
   6499 
   6500       if (VT == MVT::v4i32 || VT == MVT::v4f32)
   6501         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
   6502     }
   6503   }
   6504 
   6505   // FIXME: fold these into legal mask.
   6506   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2))
   6507     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
   6508 
   6509   if (isMOVHLPSMask(M, VT))
   6510     return getMOVHighToLow(Op, dl, DAG);
   6511 
   6512   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
   6513     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
   6514 
   6515   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
   6516     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
   6517 
   6518   if (isMOVLPMask(M, VT))
   6519     return getMOVLP(Op, dl, DAG, HasSSE2);
   6520 
   6521   if (ShouldXformToMOVHLPS(M, VT) ||
   6522       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
   6523     return CommuteVectorShuffle(SVOp, DAG);
   6524 
   6525   if (isShift) {
   6526     // No better options. Use a vshldq / vsrldq.
   6527     EVT EltVT = VT.getVectorElementType();
   6528     ShAmt *= EltVT.getSizeInBits();
   6529     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
   6530   }
   6531 
   6532   bool Commuted = false;
   6533   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
   6534   // 1,1,1,1 -> v8i16 though.
   6535   V1IsSplat = isSplatVector(V1.getNode());
   6536   V2IsSplat = isSplatVector(V2.getNode());
   6537 
   6538   // Canonicalize the splat or undef, if present, to be on the RHS.
   6539   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
   6540     CommuteVectorShuffleMask(M, NumElems);
   6541     std::swap(V1, V2);
   6542     std::swap(V1IsSplat, V2IsSplat);
   6543     Commuted = true;
   6544   }
   6545 
   6546   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
   6547     // Shuffling low element of v1 into undef, just return v1.
   6548     if (V2IsUndef)
   6549       return V1;
   6550     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
   6551     // the instruction selector will not match, so get a canonical MOVL with
   6552     // swapped operands to undo the commute.
   6553     return getMOVL(DAG, dl, VT, V2, V1);
   6554   }
   6555 
   6556   if (isUNPCKLMask(M, VT, HasAVX2))
   6557     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6558 
   6559   if (isUNPCKHMask(M, VT, HasAVX2))
   6560     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6561 
   6562   if (V2IsSplat) {
    6563     // Normalize the mask so all entries that point to V2 point to its first
    6564     // element, then try to match unpck{h|l} again. If it matches, return a
    6565     // new vector_shuffle with the corrected mask.
   6566     SmallVector<int, 8> NewMask(M.begin(), M.end());
   6567     NormalizeMask(NewMask, NumElems);
   6568     if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
   6569       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6570     } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
   6571       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6572     }
   6573   }
   6574 
   6575   if (Commuted) {
    6576     // Commute it back and try unpck* again.
   6577     // FIXME: this seems wrong.
   6578     CommuteVectorShuffleMask(M, NumElems);
   6579     std::swap(V1, V2);
   6580     std::swap(V1IsSplat, V2IsSplat);
   6581     Commuted = false;
   6582 
   6583     if (isUNPCKLMask(M, VT, HasAVX2))
   6584       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6585 
   6586     if (isUNPCKHMask(M, VT, HasAVX2))
   6587       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6588   }
   6589 
   6590   // Normalize the node to match x86 shuffle ops if needed
   6591   if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
   6592     return CommuteVectorShuffle(SVOp, DAG);
   6593 
   6594   // The checks below are all present in isShuffleMaskLegal, but they are
   6595   // inlined here right now to enable us to directly emit target specific
   6596   // nodes, and remove one by one until they don't return Op anymore.
   6597 
   6598   if (isPALIGNRMask(M, VT, Subtarget))
   6599     return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
   6600                                 getShufflePALIGNRImmediate(SVOp),
   6601                                 DAG);
   6602 
   6603   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
   6604       SVOp->getSplatIndex() == 0 && V2IsUndef) {
   6605     if (VT == MVT::v2f64 || VT == MVT::v2i64)
   6606       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   6607   }
   6608 
   6609   if (isPSHUFHWMask(M, VT))
   6610     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
   6611                                 getShufflePSHUFHWImmediate(SVOp),
   6612                                 DAG);
   6613 
   6614   if (isPSHUFLWMask(M, VT))
   6615     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
   6616                                 getShufflePSHUFLWImmediate(SVOp),
   6617                                 DAG);
   6618 
   6619   if (isSHUFPMask(M, VT, HasAVX))
   6620     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
   6621                                 getShuffleSHUFImmediate(SVOp), DAG);
   6622 
   6623   if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
   6624     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   6625   if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
   6626     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   6627 
   6628   //===--------------------------------------------------------------------===//
   6629   // Generate target specific nodes for 128 or 256-bit shuffles only
   6630   // supported in the AVX instruction set.
   6631   //
   6632 
   6633   // Handle VMOVDDUPY permutations
   6634   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX))
   6635     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
   6636 
   6637   // Handle VPERMILPS/D* permutations
   6638   if (isVPERMILPMask(M, VT, HasAVX)) {
   6639     if (HasAVX2 && VT == MVT::v8i32)
   6640       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
   6641                                   getShuffleSHUFImmediate(SVOp), DAG);
   6642     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
   6643                                 getShuffleSHUFImmediate(SVOp), DAG);
   6644   }
   6645 
   6646   // Handle VPERM2F128/VPERM2I128 permutations
   6647   if (isVPERM2X128Mask(M, VT, HasAVX))
   6648     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
   6649                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
   6650 
   6651   SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG);
   6652   if (BlendOp.getNode())
   6653     return BlendOp;
   6654 
   6655   if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
   6656     SmallVector<SDValue, 8> permclMask;
   6657     for (unsigned i = 0; i != 8; ++i) {
   6658       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
   6659     }
   6660     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
   6661                                &permclMask[0], 8);
   6662     // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
   6663     return DAG.getNode(X86ISD::VPERMV, dl, VT,
   6664                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
   6665   }
   6666 
   6667   if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64))
   6668     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
   6669                                 getShuffleCLImmediate(SVOp), DAG);
   6670 
   6671 
   6672   //===--------------------------------------------------------------------===//
   6673   // Since no target specific shuffle was selected for this generic one,
   6674   // lower it into other known shuffles. FIXME: this isn't true yet, but
   6675   // this is the plan.
   6676   //
   6677 
   6678   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
   6679   if (VT == MVT::v8i16) {
   6680     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
   6681     if (NewOp.getNode())
   6682       return NewOp;
   6683   }
   6684 
   6685   if (VT == MVT::v16i8) {
   6686     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
   6687     if (NewOp.getNode())
   6688       return NewOp;
   6689   }
   6690 
   6691   // Handle all 128-bit wide vectors with 4 elements, and match them with
   6692   // several different shuffle types.
   6693   if (NumElems == 4 && VT.getSizeInBits() == 128)
   6694     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
   6695 
   6696   // Handle general 256-bit shuffles
   6697   if (VT.is256BitVector())
   6698     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
   6699 
   6700   return SDValue();
   6701 }
   6702 
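         /// LowerEXTRACT_VECTOR_ELT_SSE4 - Extract an element from a 128-bit vector
         /// using SSE4.1 nodes (PEXTRB, PEXTRW, EXTRACTPS, pextrq) where profitable;
         /// returns an empty SDValue to fall back to the generic lowering.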
   6703 SDValue
   6704 X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
   6705                                                 SelectionDAG &DAG) const {
   6706   EVT VT = Op.getValueType();
   6707   DebugLoc dl = Op.getDebugLoc();
   6708 
   6709   if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
   6710     return SDValue();
   6711 
   6712   if (VT.getSizeInBits() == 8) {
   6713     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
   6714                                     Op.getOperand(0), Op.getOperand(1));
   6715     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   6716                                     DAG.getValueType(VT));
   6717     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   6718   } else if (VT.getSizeInBits() == 16) {
   6719     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   6720     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
   6721     if (Idx == 0)
   6722       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   6723                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   6724                                      DAG.getNode(ISD::BITCAST, dl,
   6725                                                  MVT::v4i32,
   6726                                                  Op.getOperand(0)),
   6727                                      Op.getOperand(1)));
   6728     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
   6729                                     Op.getOperand(0), Op.getOperand(1));
   6730     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   6731                                     DAG.getValueType(VT));
   6732     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   6733   } else if (VT == MVT::f32) {
   6734     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
   6735     // the result back to FR32 register. It's only worth matching if the
   6736     // result has a single use which is a store or a bitcast to i32.  And in
   6737     // the case of a store, it's not worth it if the index is a constant 0,
   6738     // because a MOVSSmr can be used instead, which is smaller and faster.
   6739     if (!Op.hasOneUse())
   6740       return SDValue();
   6741     SDNode *User = *Op.getNode()->use_begin();
   6742     if ((User->getOpcode() != ISD::STORE ||
   6743          (isa<ConstantSDNode>(Op.getOperand(1)) &&
   6744           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
   6745         (User->getOpcode() != ISD::BITCAST ||
   6746          User->getValueType(0) != MVT::i32))
   6747       return SDValue();
   6748     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   6749                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
   6750                                               Op.getOperand(0)),
   6751                                               Op.getOperand(1));
   6752     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
   6753   } else if (VT == MVT::i32 || VT == MVT::i64) {
   6754     // ExtractPS/pextrq works with constant index.
   6755     if (isa<ConstantSDNode>(Op.getOperand(1)))
   6756       return Op;
   6757   }
   6758   return SDValue();
   6759 }
   6760 
   6761 
   6762 SDValue
   6763 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   6764                                            SelectionDAG &DAG) const {
   6765   if (!isa<ConstantSDNode>(Op.getOperand(1)))
   6766     return SDValue();
   6767 
   6768   SDValue Vec = Op.getOperand(0);
   6769   EVT VecVT = Vec.getValueType();
   6770 
   6771   // If this is a 256-bit vector result, first extract the 128-bit vector and
   6772   // then extract the element from the 128-bit vector.
   6773   if (VecVT.getSizeInBits() == 256) {
   6774     DebugLoc dl = Op.getNode()->getDebugLoc();
   6775     unsigned NumElems = VecVT.getVectorNumElements();
   6776     SDValue Idx = Op.getOperand(1);
   6777     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   6778 
   6779     // Get the 128-bit vector.
   6780     bool Upper = IdxVal >= NumElems/2;
   6781     Vec = Extract128BitVector(Vec,
   6782                     DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
   6783 
   6784     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
   6785                     Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
   6786   }
   6787 
   6788   assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
   6789 
   6790   if (Subtarget->hasSSE41()) {
   6791     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
   6792     if (Res.getNode())
   6793       return Res;
   6794   }
   6795 
   6796   EVT VT = Op.getValueType();
   6797   DebugLoc dl = Op.getDebugLoc();
   6798   // TODO: handle v16i8.
   6799   if (VT.getSizeInBits() == 16) {
   6800     SDValue Vec = Op.getOperand(0);
   6801     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   6802     if (Idx == 0)
   6803       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   6804                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   6805                                      DAG.getNode(ISD::BITCAST, dl,
   6806                                                  MVT::v4i32, Vec),
   6807                                      Op.getOperand(1)));
    6808     // Transform it so it matches pextrw which produces a 32-bit result.
   6809     EVT EltVT = MVT::i32;
   6810     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
   6811                                     Op.getOperand(0), Op.getOperand(1));
   6812     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
   6813                                     DAG.getValueType(VT));
   6814     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   6815   } else if (VT.getSizeInBits() == 32) {
   6816     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   6817     if (Idx == 0)
   6818       return Op;
   6819 
   6820     // SHUFPS the element to the lowest double word, then movss.
   6821     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
   6822     EVT VVT = Op.getOperand(0).getValueType();
   6823     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   6824                                        DAG.getUNDEF(VVT), Mask);
   6825     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   6826                        DAG.getIntPtrConstant(0));
   6827   } else if (VT.getSizeInBits() == 64) {
   6828     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   6829     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   6830     //        to match extract_elt for f64.
   6831     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   6832     if (Idx == 0)
   6833       return Op;
   6834 
   6835     // UNPCKHPD the element to the lowest double word, then movsd.
   6836     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
   6837     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
   6838     int Mask[2] = { 1, -1 };
   6839     EVT VVT = Op.getOperand(0).getValueType();
   6840     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   6841                                        DAG.getUNDEF(VVT), Mask);
   6842     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   6843                        DAG.getIntPtrConstant(0));
   6844   }
   6845 
   6846   return SDValue();
   6847 }
   6848 
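         /// LowerINSERT_VECTOR_ELT_SSE4 - Insert an element into a 128-bit vector
         /// using SSE4.1 PINSRB/PINSRW/INSERTPS when the index is a constant; i32/i64
         /// inserts are left for isel and 256-bit vectors are handled by the caller.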
   6849 SDValue
   6850 X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
   6851                                                SelectionDAG &DAG) const {
   6852   EVT VT = Op.getValueType();
   6853   EVT EltVT = VT.getVectorElementType();
   6854   DebugLoc dl = Op.getDebugLoc();
   6855 
   6856   SDValue N0 = Op.getOperand(0);
   6857   SDValue N1 = Op.getOperand(1);
   6858   SDValue N2 = Op.getOperand(2);
   6859 
   6860   if (VT.getSizeInBits() == 256)
   6861     return SDValue();
   6862 
   6863   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
   6864       isa<ConstantSDNode>(N2)) {
   6865     unsigned Opc;
   6866     if (VT == MVT::v8i16)
   6867       Opc = X86ISD::PINSRW;
   6868     else if (VT == MVT::v16i8)
   6869       Opc = X86ISD::PINSRB;
   6870     else
   6871       Opc = X86ISD::PINSRB;
   6872 
    6873     // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
   6874     // argument.
   6875     if (N1.getValueType() != MVT::i32)
   6876       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   6877     if (N2.getValueType() != MVT::i32)
   6878       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
   6879     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
   6880   } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
   6881     // Bits [7:6] of the constant are the source select.  This will always be
   6882     //  zero here.  The DAG Combiner may combine an extract_elt index into these
   6883     //  bits.  For example (insert (extract, 3), 2) could be matched by putting
   6884     //  the '3' into bits [7:6] of X86ISD::INSERTPS.
   6885     // Bits [5:4] of the constant are the destination select.  This is the
   6886     //  value of the incoming immediate.
   6887     // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
   6888     //   combine either bitwise AND or insert of float 0.0 to set these bits.
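             // For example, inserting into element 2 gives an immediate of
             // (2 << 4) = 0x20: source select 0, destination select 2, zero mask 0.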
   6889     N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    6890     // Create this as a scalar to vector.
   6891     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   6892     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
   6893   } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) &&
   6894              isa<ConstantSDNode>(N2)) {
   6895     // PINSR* works with constant index.
   6896     return Op;
   6897   }
   6898   return SDValue();
   6899 }
   6900 
   6901 SDValue
   6902 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
   6903   EVT VT = Op.getValueType();
   6904   EVT EltVT = VT.getVectorElementType();
   6905 
   6906   DebugLoc dl = Op.getDebugLoc();
   6907   SDValue N0 = Op.getOperand(0);
   6908   SDValue N1 = Op.getOperand(1);
   6909   SDValue N2 = Op.getOperand(2);
   6910 
   6911   // If this is a 256-bit vector result, first extract the 128-bit vector,
   6912   // insert the element into the extracted half and then place it back.
   6913   if (VT.getSizeInBits() == 256) {
   6914     if (!isa<ConstantSDNode>(N2))
   6915       return SDValue();
   6916 
   6917     // Get the desired 128-bit vector half.
   6918     unsigned NumElems = VT.getVectorNumElements();
   6919     unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
   6920     bool Upper = IdxVal >= NumElems/2;
   6921     SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
   6922     SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
   6923 
   6924     // Insert the element into the desired half.
   6925     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
   6926                  N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
   6927 
   6928     // Insert the changed part back to the 256-bit vector
   6929     return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
   6930   }
   6931 
   6932   if (Subtarget->hasSSE41())
   6933     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
   6934 
   6935   if (EltVT == MVT::i8)
   6936     return SDValue();
   6937 
   6938   if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    6939     // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
   6940     // as its second argument.
   6941     if (N1.getValueType() != MVT::i32)
   6942       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   6943     if (N2.getValueType() != MVT::i32)
   6944       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
   6945     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   6946   }
   6947   return SDValue();
   6948 }
   6949 
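         /// LowerSCALAR_TO_VECTOR - 256-bit results are built in a 128-bit half and
         /// inserted into an undef vector; most 128-bit results are built by
         /// any-extending the scalar to i32 and bitcasting a v4i32 SCALAR_TO_VECTOR.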
   6950 SDValue
   6951 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   6952   LLVMContext *Context = DAG.getContext();
   6953   DebugLoc dl = Op.getDebugLoc();
   6954   EVT OpVT = Op.getValueType();
   6955 
   6956   // If this is a 256-bit vector result, first insert into a 128-bit
   6957   // vector and then insert into the 256-bit vector.
   6958   if (OpVT.getSizeInBits() > 128) {
   6959     // Insert into a 128-bit vector.
   6960     EVT VT128 = EVT::getVectorVT(*Context,
   6961                                  OpVT.getVectorElementType(),
   6962                                  OpVT.getVectorNumElements() / 2);
   6963 
   6964     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
   6965 
   6966     // Insert the 128-bit vector.
   6967     return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
   6968                               DAG.getConstant(0, MVT::i32),
   6969                               DAG, dl);
   6970   }
   6971 
   6972   if (Op.getValueType() == MVT::v1i64 &&
   6973       Op.getOperand(0).getValueType() == MVT::i64)
   6974     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
   6975 
   6976   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   6977   assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
   6978          "Expected an SSE type!");
   6979   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
   6980                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
   6981 }
   6982 
   6983 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
   6984 // a simple subregister reference or explicit instructions to grab
   6985 // upper bits of a vector.
   6986 SDValue
   6987 X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
   6988   if (Subtarget->hasAVX()) {
   6989     DebugLoc dl = Op.getNode()->getDebugLoc();
   6990     SDValue Vec = Op.getNode()->getOperand(0);
   6991     SDValue Idx = Op.getNode()->getOperand(1);
   6992 
   6993     if (Op.getNode()->getValueType(0).getSizeInBits() == 128
   6994         && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
   6995         return Extract128BitVector(Vec, Idx, DAG, dl);
   6996     }
   6997   }
   6998   return SDValue();
   6999 }
   7000 
   7001 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
   7002 // simple superregister reference or explicit instructions to insert
   7003 // the upper bits of a vector.
   7004 SDValue
   7005 X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
   7006   if (Subtarget->hasAVX()) {
   7007     DebugLoc dl = Op.getNode()->getDebugLoc();
   7008     SDValue Vec = Op.getNode()->getOperand(0);
   7009     SDValue SubVec = Op.getNode()->getOperand(1);
   7010     SDValue Idx = Op.getNode()->getOperand(2);
   7011 
   7012     if (Op.getNode()->getValueType(0).getSizeInBits() == 256
   7013         && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
   7014       return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
   7015     }
   7016   }
   7017   return SDValue();
   7018 }
   7019 
   7020 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
    7021 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
   7022 // one of the above mentioned nodes. It has to be wrapped because otherwise
   7023 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
    7024 // be used to form an addressing mode. These wrapped nodes will be selected
   7025 // into MOV32ri.
   7026 SDValue
   7027 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   7028   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   7029 
   7030   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7031   // global base reg.
   7032   unsigned char OpFlag = 0;
   7033   unsigned WrapperKind = X86ISD::Wrapper;
   7034   CodeModel::Model M = getTargetMachine().getCodeModel();
   7035 
   7036   if (Subtarget->isPICStyleRIPRel() &&
   7037       (M == CodeModel::Small || M == CodeModel::Kernel))
   7038     WrapperKind = X86ISD::WrapperRIP;
   7039   else if (Subtarget->isPICStyleGOT())
   7040     OpFlag = X86II::MO_GOTOFF;
   7041   else if (Subtarget->isPICStyleStubPIC())
   7042     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   7043 
   7044   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
   7045                                              CP->getAlignment(),
   7046                                              CP->getOffset(), OpFlag);
   7047   DebugLoc DL = CP->getDebugLoc();
   7048   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7049   // With PIC, the address is actually $g + Offset.
   7050   if (OpFlag) {
   7051     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7052                          DAG.getNode(X86ISD::GlobalBaseReg,
   7053                                      DebugLoc(), getPointerTy()),
   7054                          Result);
   7055   }
   7056 
   7057   return Result;
   7058 }
   7059 
   7060 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   7061   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   7062 
   7063   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7064   // global base reg.
   7065   unsigned char OpFlag = 0;
   7066   unsigned WrapperKind = X86ISD::Wrapper;
   7067   CodeModel::Model M = getTargetMachine().getCodeModel();
   7068 
   7069   if (Subtarget->isPICStyleRIPRel() &&
   7070       (M == CodeModel::Small || M == CodeModel::Kernel))
   7071     WrapperKind = X86ISD::WrapperRIP;
   7072   else if (Subtarget->isPICStyleGOT())
   7073     OpFlag = X86II::MO_GOTOFF;
   7074   else if (Subtarget->isPICStyleStubPIC())
   7075     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   7076 
   7077   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
   7078                                           OpFlag);
   7079   DebugLoc DL = JT->getDebugLoc();
   7080   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7081 
   7082   // With PIC, the address is actually $g + Offset.
   7083   if (OpFlag)
   7084     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7085                          DAG.getNode(X86ISD::GlobalBaseReg,
   7086                                      DebugLoc(), getPointerTy()),
   7087                          Result);
   7088 
   7089   return Result;
   7090 }
   7091 
   7092 SDValue
   7093 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   7094   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   7095 
   7096   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7097   // global base reg.
   7098   unsigned char OpFlag = 0;
   7099   unsigned WrapperKind = X86ISD::Wrapper;
   7100   CodeModel::Model M = getTargetMachine().getCodeModel();
   7101 
   7102   if (Subtarget->isPICStyleRIPRel() &&
   7103       (M == CodeModel::Small || M == CodeModel::Kernel)) {
   7104     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
   7105       OpFlag = X86II::MO_GOTPCREL;
   7106     WrapperKind = X86ISD::WrapperRIP;
   7107   } else if (Subtarget->isPICStyleGOT()) {
   7108     OpFlag = X86II::MO_GOT;
   7109   } else if (Subtarget->isPICStyleStubPIC()) {
   7110     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
   7111   } else if (Subtarget->isPICStyleStubNoDynamic()) {
   7112     OpFlag = X86II::MO_DARWIN_NONLAZY;
   7113   }
   7114 
   7115   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
   7116 
   7117   DebugLoc DL = Op.getDebugLoc();
   7118   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7119 
   7120 
   7121   // With PIC, the address is actually $g + Offset.
   7122   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   7123       !Subtarget->is64Bit()) {
   7124     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7125                          DAG.getNode(X86ISD::GlobalBaseReg,
   7126                                      DebugLoc(), getPointerTy()),
   7127                          Result);
   7128   }
   7129 
   7130   // For symbols that require a load from a stub to get the address, emit the
   7131   // load.
   7132   if (isGlobalStubReference(OpFlag))
   7133     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
   7134                          MachinePointerInfo::getGOT(), false, false, false, 0);
   7135 
   7136   return Result;
   7137 }
   7138 
   7139 SDValue
   7140 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
    7141   // Create the TargetBlockAddress node.
   7142   unsigned char OpFlags =
   7143     Subtarget->ClassifyBlockAddressReference();
   7144   CodeModel::Model M = getTargetMachine().getCodeModel();
   7145   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   7146   DebugLoc dl = Op.getDebugLoc();
   7147   SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
   7148                                        /*isTarget=*/true, OpFlags);
   7149 
   7150   if (Subtarget->isPICStyleRIPRel() &&
   7151       (M == CodeModel::Small || M == CodeModel::Kernel))
   7152     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   7153   else
   7154     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   7155 
   7156   // With PIC, the address is actually $g + Offset.
   7157   if (isGlobalRelativeToPICBase(OpFlags)) {
   7158     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   7159                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   7160                          Result);
   7161   }
   7162 
   7163   return Result;
   7164 }
   7165 
   7166 SDValue
   7167 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
   7168                                       int64_t Offset,
   7169                                       SelectionDAG &DAG) const {
   7170   // Create the TargetGlobalAddress node, folding in the constant
   7171   // offset if it is legal.
   7172   unsigned char OpFlags =
   7173     Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
   7174   CodeModel::Model M = getTargetMachine().getCodeModel();
   7175   SDValue Result;
   7176   if (OpFlags == X86II::MO_NO_FLAG &&
   7177       X86::isOffsetSuitableForCodeModel(Offset, M)) {
   7178     // A direct static reference to a global.
   7179     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
   7180     Offset = 0;
   7181   } else {
   7182     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
   7183   }
   7184 
   7185   if (Subtarget->isPICStyleRIPRel() &&
   7186       (M == CodeModel::Small || M == CodeModel::Kernel))
   7187     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   7188   else
   7189     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   7190 
   7191   // With PIC, the address is actually $g + Offset.
   7192   if (isGlobalRelativeToPICBase(OpFlags)) {
   7193     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   7194                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   7195                          Result);
   7196   }
   7197 
   7198   // For globals that require a load from a stub to get the address, emit the
   7199   // load.
   7200   if (isGlobalStubReference(OpFlags))
   7201     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
   7202                          MachinePointerInfo::getGOT(), false, false, false, 0);
   7203 
   7204   // If there was a non-zero offset that we didn't fold, create an explicit
   7205   // addition for it.
   7206   if (Offset != 0)
   7207     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
   7208                          DAG.getConstant(Offset, getPointerTy()));
   7209 
   7210   return Result;
   7211 }
   7212 
   7213 SDValue
   7214 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   7215   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   7216   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
   7217   return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
   7218 }
   7219 
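         /// GetTLSADDR - Emit an X86ISD::TLSADDR node (which is codegen'ed as a call)
         /// for the given global and copy the result out of ReturnReg (EAX or RAX).
         /// Used by the general-dynamic TLS lowerings below.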
   7220 static SDValue
   7221 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   7222            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
   7223            unsigned char OperandFlags) {
   7224   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   7225   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   7226   DebugLoc dl = GA->getDebugLoc();
   7227   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   7228                                            GA->getValueType(0),
   7229                                            GA->getOffset(),
   7230                                            OperandFlags);
   7231   if (InFlag) {
   7232     SDValue Ops[] = { Chain,  TGA, *InFlag };
   7233     Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
   7234   } else {
   7235     SDValue Ops[]  = { Chain, TGA };
   7236     Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
   7237   }
   7238 
    7239   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
   7240   MFI->setAdjustsStack(true);
   7241 
   7242   SDValue Flag = Chain.getValue(1);
   7243   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
   7244 }
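         // Roughly, under the general-dynamic model the TLSADDR pseudo built above
         // is later expanded into the __tls_get_addr call sequence; the variable's
         // address comes back in the requested return register (EAX/RAX), which is
         // why GetTLSADDR copies from that register and marks the function as
         // adjusting the stack.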
   7245 
   7246 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
   7247 static SDValue
   7248 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   7249                                 const EVT PtrVT) {
   7250   SDValue InFlag;
   7251   DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
   7252   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   7253                                      DAG.getNode(X86ISD::GlobalBaseReg,
   7254                                                  DebugLoc(), PtrVT), InFlag);
   7255   InFlag = Chain.getValue(1);
   7256 
   7257   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
   7258 }
   7259 
   7260 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
   7261 static SDValue
   7262 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   7263                                 const EVT PtrVT) {
   7264   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
   7265                     X86::RAX, X86II::MO_TLSGD);
   7266 }
   7267 
   7268 // Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
   7269 // "local exec" model.
   7270 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   7271                                    const EVT PtrVT, TLSModel::Model model,
   7272                                    bool is64Bit) {
   7273   DebugLoc dl = GA->getDebugLoc();
   7274 
   7275   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
   7276   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
   7277                                                          is64Bit ? 257 : 256));
   7278 
   7279   SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   7280                                       DAG.getIntPtrConstant(0),
   7281                                       MachinePointerInfo(Ptr),
   7282                                       false, false, false, 0);
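           // Note: in this backend, address space 256 selects a %gs segment override
           // and address space 257 selects %fs, so the zero-offset load above reads
           // the thread pointer directly from the segment base.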
   7283 
   7284   unsigned char OperandFlags = 0;
   7285   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
   7286   // initialexec.
   7287   unsigned WrapperKind = X86ISD::Wrapper;
   7288   if (model == TLSModel::LocalExec) {
   7289     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
   7290   } else if (is64Bit) {
   7291     assert(model == TLSModel::InitialExec);
   7292     OperandFlags = X86II::MO_GOTTPOFF;
   7293     WrapperKind = X86ISD::WrapperRIP;
   7294   } else {
   7295     assert(model == TLSModel::InitialExec);
   7296     OperandFlags = X86II::MO_INDNTPOFF;
   7297   }
   7298 
   7299   // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
   7300   // exec)
   7301   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   7302                                            GA->getValueType(0),
   7303                                            GA->getOffset(), OperandFlags);
   7304   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   7305 
   7306   if (model == TLSModel::InitialExec)
   7307     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
   7308                          MachinePointerInfo::getGOT(), false, false, false, 0);
   7309 
   7310   // The address of the thread local variable is the add of the thread
   7311   // pointer with the offset of the variable.
   7312   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
   7313 }
   7314 
   7315 SDValue
   7316 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   7317 
   7318   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   7319   const GlobalValue *GV = GA->getGlobal();
   7320 
   7321   if (Subtarget->isTargetELF()) {
   7322     // TODO: implement the "local dynamic" model
    7323     // TODO: implement the "initial exec" model for PIC executables
   7324 
   7325     // If GV is an alias then use the aliasee for determining
   7326     // thread-localness.
   7327     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
   7328       GV = GA->resolveAliasedGlobal(false);
   7329 
   7330     TLSModel::Model model = getTargetMachine().getTLSModel(GV);
   7331 
   7332     switch (model) {
   7333       case TLSModel::GeneralDynamic:
   7334       case TLSModel::LocalDynamic: // not implemented
   7335         if (Subtarget->is64Bit())
   7336           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
   7337         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
   7338 
   7339       case TLSModel::InitialExec:
   7340       case TLSModel::LocalExec:
   7341         return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
   7342                                    Subtarget->is64Bit());
   7343     }
   7344   } else if (Subtarget->isTargetDarwin()) {
   7345     // Darwin only has one model of TLS.  Lower to that.
   7346     unsigned char OpFlag = 0;
   7347     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
   7348                            X86ISD::WrapperRIP : X86ISD::Wrapper;
   7349 
   7350     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7351     // global base reg.
   7352     bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
   7353                   !Subtarget->is64Bit();
   7354     if (PIC32)
   7355       OpFlag = X86II::MO_TLVP_PIC_BASE;
   7356     else
   7357       OpFlag = X86II::MO_TLVP;
   7358     DebugLoc DL = Op.getDebugLoc();
   7359     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
   7360                                                 GA->getValueType(0),
   7361                                                 GA->getOffset(), OpFlag);
   7362     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7363 
   7364     // With PIC32, the address is actually $g + Offset.
   7365     if (PIC32)
   7366       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7367                            DAG.getNode(X86ISD::GlobalBaseReg,
   7368                                        DebugLoc(), getPointerTy()),
   7369                            Offset);
   7370 
    7371     // Lowering the machine ISD node will make sure everything is in the
    7372     // right location.
   7373     SDValue Chain = DAG.getEntryNode();
   7374     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   7375     SDValue Args[] = { Chain, Offset };
   7376     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
   7377 
   7378     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
   7379     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   7380     MFI->setAdjustsStack(true);
   7381 
   7382     // And our return value (tls address) is in the standard call return value
   7383     // location.
   7384     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   7385     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
   7386                               Chain.getValue(1));
   7387   } else if (Subtarget->isTargetWindows()) {
   7388     // Just use the implicit TLS architecture
   7389     // Need to generate someting similar to:
   7390     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
   7391     //                                  ; from TEB
   7392     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
   7393     //   mov     rcx, qword [rdx+rcx*8]
   7394     //   mov     eax, .tls$:tlsvar
   7395     //   [rax+rcx] contains the address
   7396     // Windows 64bit: gs:0x58
   7397     // Windows 32bit: fs:__tls_array
   7398 
   7399     // If GV is an alias then use the aliasee for determining
   7400     // thread-localness.
   7401     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
   7402       GV = GA->resolveAliasedGlobal(false);
   7403     DebugLoc dl = GA->getDebugLoc();
   7404     SDValue Chain = DAG.getEntryNode();
   7405 
   7406     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
   7407     // %gs:0x58 (64-bit).
   7408     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
   7409                                         ? Type::getInt8PtrTy(*DAG.getContext(),
   7410                                                              256)
   7411                                         : Type::getInt32PtrTy(*DAG.getContext(),
   7412                                                               257));
   7413 
   7414     SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
   7415                                         Subtarget->is64Bit()
   7416                                         ? DAG.getIntPtrConstant(0x58)
   7417                                         : DAG.getExternalSymbol("_tls_array",
   7418                                                                 getPointerTy()),
   7419                                         MachinePointerInfo(Ptr),
   7420                                         false, false, false, 0);
   7421 
   7422     // Load the _tls_index variable
   7423     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
   7424     if (Subtarget->is64Bit())
   7425       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
   7426                            IDX, MachinePointerInfo(), MVT::i32,
   7427                            false, false, 0);
   7428     else
   7429       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
   7430                         false, false, false, 0);
   7431 
   7432     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
    7433                                     getPointerTy());
   7434     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
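             // The index loaded from _tls_index is scaled by the pointer size (a
             // shift by 3 on Win64, 2 on Win32) so it can index the array of
             // per-module TLS blocks, matching the [rdx+rcx*8] addressing in the
             // sketch above.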
   7435 
   7436     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
   7437     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
   7438                       false, false, false, 0);
   7439 
   7440     // Get the offset of start of .tls section
   7441     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   7442                                              GA->getValueType(0),
   7443                                              GA->getOffset(), X86II::MO_SECREL);
   7444     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
   7445 
   7446     // The address of the thread local variable is the add of the thread
   7447     // pointer with the offset of the variable.
   7448     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
   7449   }
   7450 
   7451   llvm_unreachable("TLS not implemented for this target.");
   7452 }
   7453 
   7454 
   7455 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
   7456 /// and take a 2 x i32 value to shift plus a shift amount.
   7457 SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
   7458   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   7459   EVT VT = Op.getValueType();
   7460   unsigned VTBits = VT.getSizeInBits();
   7461   DebugLoc dl = Op.getDebugLoc();
   7462   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   7463   SDValue ShOpLo = Op.getOperand(0);
   7464   SDValue ShOpHi = Op.getOperand(1);
   7465   SDValue ShAmt  = Op.getOperand(2);
   7466   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
   7467                                      DAG.getConstant(VTBits - 1, MVT::i8))
   7468                        : DAG.getConstant(0, VT);
   7469 
   7470   SDValue Tmp2, Tmp3;
   7471   if (Op.getOpcode() == ISD::SHL_PARTS) {
   7472     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
   7473     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
   7474   } else {
   7475     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
   7476     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
   7477   }
   7478 
   7479   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   7480                                 DAG.getConstant(VTBits, MVT::i8));
   7481   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   7482                              AndNode, DAG.getConstant(0, MVT::i8));
   7483 
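           // Note: SHLD/SHRD and the plain shifts above use the shift amount modulo
           // VTBits, so when ShAmt >= VTBits the double-shift pair is not the final
           // answer.  The (ShAmt & VTBits) test feeds CMOVs that substitute the
           // wrapped results in that case: Tmp3 for the half that receives the
           // shifted-across bits, and Tmp1 (zero, or the sign fill for SRA) for the
           // other half.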
   7484   SDValue Hi, Lo;
   7485   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   7486   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
   7487   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
   7488 
   7489   if (Op.getOpcode() == ISD::SHL_PARTS) {
   7490     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
   7491     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
   7492   } else {
   7493     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
   7494     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
   7495   }
   7496 
   7497   SDValue Ops[2] = { Lo, Hi };
   7498   return DAG.getMergeValues(Ops, 2, dl);
   7499 }
   7500 
   7501 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   7502                                            SelectionDAG &DAG) const {
   7503   EVT SrcVT = Op.getOperand(0).getValueType();
   7504 
   7505   if (SrcVT.isVector())
   7506     return SDValue();
   7507 
   7508   assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
   7509          "Unknown SINT_TO_FP to lower!");
   7510 
   7511   // These are really Legal; return the operand so the caller accepts it as
   7512   // Legal.
   7513   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
   7514     return Op;
   7515   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
   7516       Subtarget->is64Bit()) {
   7517     return Op;
   7518   }
   7519 
   7520   DebugLoc dl = Op.getDebugLoc();
   7521   unsigned Size = SrcVT.getSizeInBits()/8;
   7522   MachineFunction &MF = DAG.getMachineFunction();
   7523   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
   7524   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   7525   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   7526                                StackSlot,
   7527                                MachinePointerInfo::getFixedStack(SSFI),
   7528                                false, false, 0);
   7529   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
   7530 }
   7531 
   7532 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
   7533                                      SDValue StackSlot,
   7534                                      SelectionDAG &DAG) const {
   7535   // Build the FILD
   7536   DebugLoc DL = Op.getDebugLoc();
   7537   SDVTList Tys;
   7538   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
   7539   if (useSSE)
   7540     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
   7541   else
   7542     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
   7543 
   7544   unsigned ByteSize = SrcVT.getSizeInBits()/8;
   7545 
   7546   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
   7547   MachineMemOperand *MMO;
   7548   if (FI) {
   7549     int SSFI = FI->getIndex();
   7550     MMO =
   7551       DAG.getMachineFunction()
   7552       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7553                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
   7554   } else {
   7555     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
   7556     StackSlot = StackSlot.getOperand(1);
   7557   }
   7558   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   7559   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
   7560                                            X86ISD::FILD, DL,
   7561                                            Tys, Ops, array_lengthof(Ops),
   7562                                            SrcVT, MMO);
   7563 
   7564   if (useSSE) {
   7565     Chain = Result.getValue(1);
   7566     SDValue InFlag = Result.getValue(2);
   7567 
   7568     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
   7569     // shouldn't be necessary except that RFP cannot be live across
   7570     // multiple blocks. When stackifier is fixed, they can be uncoupled.
   7571     MachineFunction &MF = DAG.getMachineFunction();
   7572     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
   7573     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
   7574     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   7575     Tys = DAG.getVTList(MVT::Other);
   7576     SDValue Ops[] = {
   7577       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
   7578     };
   7579     MachineMemOperand *MMO =
   7580       DAG.getMachineFunction()
   7581       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7582                             MachineMemOperand::MOStore, SSFISize, SSFISize);
   7583 
   7584     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
   7585                                     Ops, array_lengthof(Ops),
   7586                                     Op.getValueType(), MMO);
   7587     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
   7588                          MachinePointerInfo::getFixedStack(SSFI),
   7589                          false, false, false, 0);
   7590   }
   7591 
   7592   return Result;
   7593 }
   7594 
   7595 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
   7596 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
   7597                                                SelectionDAG &DAG) const {
   7598   // This algorithm is not obvious. Here it is what we're trying to output:
   7599   /*
   7600      movq       %rax,  %xmm0
   7601      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
   7602      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
   7603      #ifdef __SSE3__
   7604        haddpd   %xmm0, %xmm0
   7605      #else
   7606        pshufd   $0x4e, %xmm0, %xmm1
   7607        addpd    %xmm1, %xmm0
   7608      #endif
   7609   */
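           // Why the constants work (sketch): for x = Hi * 2^32 + Lo, the punpckldq
           // builds two doubles whose bit patterns are (0x43300000 << 32) | Lo and
           // (0x45300000 << 32) | Hi, i.e. the values 2^52 + Lo and 2^84 + Hi * 2^32.
           // Subtracting c1 = { 2^52, 2^84 } leaves { Lo, Hi * 2^32 }, and the
           // horizontal add produces Lo + Hi * 2^32 = x with a single final rounding.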
   7610 
   7611   DebugLoc dl = Op.getDebugLoc();
   7612   LLVMContext *Context = DAG.getContext();
   7613 
   7614   // Build some magic constants.
   7615   const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
   7616   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   7617   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
   7618 
   7619   SmallVector<Constant*,2> CV1;
   7620   CV1.push_back(
   7621         ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
   7622   CV1.push_back(
   7623         ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
   7624   Constant *C1 = ConstantVector::get(CV1);
   7625   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
   7626 
   7627   // Load the 64-bit value into an XMM register.
   7628   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   7629                             Op.getOperand(0));
   7630   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
   7631                               MachinePointerInfo::getConstantPool(),
   7632                               false, false, false, 16);
   7633   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
   7634                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
   7635                               CLod0);
   7636 
   7637   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
   7638                               MachinePointerInfo::getConstantPool(),
   7639                               false, false, false, 16);
   7640   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
   7641   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   7642   SDValue Result;
   7643 
   7644   if (Subtarget->hasSSE3()) {
   7645     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
   7646     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   7647   } else {
   7648     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
   7649     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
   7650                                            S2F, 0x4E, DAG);
   7651     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
   7652                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
   7653                          Sub);
   7654   }
   7655 
   7656   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
   7657                      DAG.getIntPtrConstant(0));
   7658 }
   7659 
   7660 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
   7661 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
   7662                                                SelectionDAG &DAG) const {
   7663   DebugLoc dl = Op.getDebugLoc();
   7664   // FP constant to bias correct the final result.
   7665   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
   7666                                    MVT::f64);
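           // 0x4330000000000000 is the bit pattern of the double 2^52.  OR-ing a
           // 32-bit value x into the low mantissa bits (done below) produces exactly
           // 2^52 + x, so subtracting this bias recovers x as a double with no
           // rounding error.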
   7667 
   7668   // Load the 32-bit value into an XMM register.
   7669   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
   7670                              Op.getOperand(0));
   7671 
   7672   // Zero out the upper parts of the register.
   7673   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
   7674 
   7675   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   7676                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
   7677                      DAG.getIntPtrConstant(0));
   7678 
   7679   // Or the load with the bias.
   7680   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
   7681                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
   7682                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   7683                                                    MVT::v2f64, Load)),
   7684                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
   7685                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   7686                                                    MVT::v2f64, Bias)));
   7687   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   7688                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
   7689                    DAG.getIntPtrConstant(0));
   7690 
   7691   // Subtract the bias.
   7692   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
   7693 
   7694   // Handle final rounding.
   7695   EVT DestVT = Op.getValueType();
   7696 
   7697   if (DestVT.bitsLT(MVT::f64)) {
   7698     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
   7699                        DAG.getIntPtrConstant(0));
   7700   } else if (DestVT.bitsGT(MVT::f64)) {
   7701     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
   7702   }
   7703 
   7704   // Handle final rounding.
   7705   return Sub;
   7706 }
   7707 
   7708 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   7709                                            SelectionDAG &DAG) const {
   7710   SDValue N0 = Op.getOperand(0);
   7711   DebugLoc dl = Op.getDebugLoc();
   7712 
    7713   // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
   7714   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
   7715   // the optimization here.
   7716   if (DAG.SignBitIsZero(N0))
   7717     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
   7718 
   7719   EVT SrcVT = N0.getValueType();
   7720   EVT DstVT = Op.getValueType();
   7721   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
   7722     return LowerUINT_TO_FP_i64(Op, DAG);
   7723   else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
   7724     return LowerUINT_TO_FP_i32(Op, DAG);
   7725   else if (Subtarget->is64Bit() &&
   7726            SrcVT == MVT::i64 && DstVT == MVT::f32)
   7727     return SDValue();
   7728 
   7729   // Make a 64-bit buffer, and use it to build an FILD.
   7730   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
   7731   if (SrcVT == MVT::i32) {
   7732     SDValue WordOff = DAG.getConstant(4, getPointerTy());
   7733     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
   7734                                      getPointerTy(), StackSlot, WordOff);
   7735     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   7736                                   StackSlot, MachinePointerInfo(),
   7737                                   false, false, 0);
   7738     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
   7739                                   OffsetSlot, MachinePointerInfo(),
   7740                                   false, false, 0);
   7741     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
   7742     return Fild;
   7743   }
   7744 
   7745   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
   7746   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   7747                                StackSlot, MachinePointerInfo(),
   7748                                false, false, 0);
   7749   // For i64 source, we need to add the appropriate power of 2 if the input
   7750   // was negative.  This is the same as the optimization in
   7751   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
   7752   // we must be careful to do the computation in x87 extended precision, not
   7753   // in SSE. (The generic code can't know it's OK to do this, or how to.)
   7754   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
   7755   MachineMemOperand *MMO =
   7756     DAG.getMachineFunction()
   7757     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7758                           MachineMemOperand::MOLoad, 8, 8);
   7759 
   7760   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
   7761   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
   7762   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
   7763                                          MVT::i64, MMO);
   7764 
   7765   APInt FF(32, 0x5F800000ULL);
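           // 0x5F800000 is the IEEE-754 single-precision bit pattern of 2^64.  FILD
           // interprets the i64 buffer as signed, so if the original unsigned value
           // had its top bit set the loaded result is short by exactly 2^64; the
           // fudge factor selected below corrects for that.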
   7766 
   7767   // Check whether the sign bit is set.
   7768   SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
   7769                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
   7770                                  ISD::SETLT);
   7771 
   7772   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
   7773   SDValue FudgePtr = DAG.getConstantPool(
   7774                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
   7775                                          getPointerTy());
   7776 
   7777   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   7778   SDValue Zero = DAG.getIntPtrConstant(0);
   7779   SDValue Four = DAG.getIntPtrConstant(4);
   7780   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
   7781                                Zero, Four);
   7782   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
   7783 
   7784   // Load the value out, extending it from f32 to f80.
   7785   // FIXME: Avoid the extend by constructing the right constant pool?
   7786   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
   7787                                  FudgePtr, MachinePointerInfo::getConstantPool(),
   7788                                  MVT::f32, false, false, 4);
   7789   // Extend everything to 80 bits to force it to be done on x87.
   7790   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   7791   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
   7792 }
   7793 
   7794 std::pair<SDValue,SDValue> X86TargetLowering::
   7795 FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const {
   7796   DebugLoc DL = Op.getDebugLoc();
   7797 
   7798   EVT DstTy = Op.getValueType();
   7799 
   7800   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
   7801     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
   7802     DstTy = MVT::i64;
   7803   }
   7804 
   7805   assert(DstTy.getSimpleVT() <= MVT::i64 &&
   7806          DstTy.getSimpleVT() >= MVT::i16 &&
   7807          "Unknown FP_TO_INT to lower!");
   7808 
   7809   // These are really Legal.
   7810   if (DstTy == MVT::i32 &&
   7811       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   7812     return std::make_pair(SDValue(), SDValue());
   7813   if (Subtarget->is64Bit() &&
   7814       DstTy == MVT::i64 &&
   7815       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   7816     return std::make_pair(SDValue(), SDValue());
   7817 
   7818   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
   7819   // stack slot, or into the FTOL runtime function.
   7820   MachineFunction &MF = DAG.getMachineFunction();
   7821   unsigned MemSize = DstTy.getSizeInBits()/8;
   7822   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   7823   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   7824 
   7825   unsigned Opc;
   7826   if (!IsSigned && isIntegerTypeFTOL(DstTy))
   7827     Opc = X86ISD::WIN_FTOL;
   7828   else
   7829     switch (DstTy.getSimpleVT().SimpleTy) {
   7830     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
   7831     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
   7832     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
   7833     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
   7834     }
   7835 
   7836   SDValue Chain = DAG.getEntryNode();
   7837   SDValue Value = Op.getOperand(0);
   7838   EVT TheVT = Op.getOperand(0).getValueType();
   7839   // FIXME This causes a redundant load/store if the SSE-class value is already
   7840   // in memory, such as if it is on the callstack.
   7841   if (isScalarFPTypeInSSEReg(TheVT)) {
   7842     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
   7843     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
   7844                          MachinePointerInfo::getFixedStack(SSFI),
   7845                          false, false, 0);
   7846     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
   7847     SDValue Ops[] = {
   7848       Chain, StackSlot, DAG.getValueType(TheVT)
   7849     };
   7850 
   7851     MachineMemOperand *MMO =
   7852       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7853                               MachineMemOperand::MOLoad, MemSize, MemSize);
   7854     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
   7855                                     DstTy, MMO);
   7856     Chain = Value.getValue(1);
   7857     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   7858     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   7859   }
   7860 
   7861   MachineMemOperand *MMO =
   7862     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7863                             MachineMemOperand::MOStore, MemSize, MemSize);
   7864 
   7865   if (Opc != X86ISD::WIN_FTOL) {
   7866     // Build the FP_TO_INT*_IN_MEM
   7867     SDValue Ops[] = { Chain, Value, StackSlot };
   7868     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   7869                                            Ops, 3, DstTy, MMO);
   7870     return std::make_pair(FIST, StackSlot);
   7871   } else {
   7872     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
   7873       DAG.getVTList(MVT::Other, MVT::Glue),
   7874       Chain, Value);
   7875     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
   7876       MVT::i32, ftol.getValue(1));
   7877     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
   7878       MVT::i32, eax.getValue(2));
   7879     SDValue Ops[] = { eax, edx };
   7880     SDValue pair = IsReplace
   7881       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
   7882       : DAG.getMergeValues(Ops, 2, DL);
   7883     return std::make_pair(pair, SDValue());
   7884   }
   7885 }
   7886 
   7887 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
   7888                                            SelectionDAG &DAG) const {
   7889   if (Op.getValueType().isVector())
   7890     return SDValue();
   7891 
   7892   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   7893     /*IsSigned=*/ true, /*IsReplace=*/ false);
   7894   SDValue FIST = Vals.first, StackSlot = Vals.second;
   7895   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   7896   if (FIST.getNode() == 0) return Op;
   7897 
   7898   if (StackSlot.getNode())
   7899     // Load the result.
   7900     return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
   7901                        FIST, StackSlot, MachinePointerInfo(),
   7902                        false, false, false, 0);
   7903   else
   7904     // The node is the result.
   7905     return FIST;
   7906 }
   7907 
   7908 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
   7909                                            SelectionDAG &DAG) const {
   7910   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   7911     /*IsSigned=*/ false, /*IsReplace=*/ false);
   7912   SDValue FIST = Vals.first, StackSlot = Vals.second;
   7913   assert(FIST.getNode() && "Unexpected failure");
   7914 
   7915   if (StackSlot.getNode())
   7916     // Load the result.
   7917     return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
   7918                        FIST, StackSlot, MachinePointerInfo(),
   7919                        false, false, false, 0);
   7920   else
   7921     // The node is the result.
   7922     return FIST;
   7923 }
   7924 
   7925 SDValue X86TargetLowering::LowerFABS(SDValue Op,
   7926                                      SelectionDAG &DAG) const {
   7927   LLVMContext *Context = DAG.getContext();
   7928   DebugLoc dl = Op.getDebugLoc();
   7929   EVT VT = Op.getValueType();
   7930   EVT EltVT = VT;
   7931   if (VT.isVector())
   7932     EltVT = VT.getVectorElementType();
   7933   Constant *C;
   7934   if (EltVT == MVT::f64) {
   7935     C = ConstantVector::getSplat(2,
   7936                 ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
   7937   } else {
   7938     C = ConstantVector::getSplat(4,
   7939                ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
   7940   }
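           // The mask is splatted across a full 16-byte vector because the FAND
           // below is normally selected as a packed ANDPS/ANDPD over the whole XMM
           // register, even when the fabs itself is scalar.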
   7941   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   7942   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   7943                              MachinePointerInfo::getConstantPool(),
   7944                              false, false, false, 16);
   7945   return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
   7946 }
   7947 
   7948 SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
   7949   LLVMContext *Context = DAG.getContext();
   7950   DebugLoc dl = Op.getDebugLoc();
   7951   EVT VT = Op.getValueType();
   7952   EVT EltVT = VT;
   7953   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   7954   if (VT.isVector()) {
   7955     EltVT = VT.getVectorElementType();
   7956     NumElts = VT.getVectorNumElements();
   7957   }
   7958   Constant *C;
   7959   if (EltVT == MVT::f64)
   7960     C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
   7961   else
   7962     C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
   7963   C = ConstantVector::getSplat(NumElts, C);
   7964   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   7965   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   7966                              MachinePointerInfo::getConstantPool(),
   7967                              false, false, false, 16);
   7968   if (VT.isVector()) {
   7969     MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
   7970     return DAG.getNode(ISD::BITCAST, dl, VT,
   7971                        DAG.getNode(ISD::XOR, dl, XORVT,
   7972                     DAG.getNode(ISD::BITCAST, dl, XORVT,
   7973                                 Op.getOperand(0)),
   7974                     DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
   7975   } else {
   7976     return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
   7977   }
   7978 }
   7979 
   7980 SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   7981   LLVMContext *Context = DAG.getContext();
   7982   SDValue Op0 = Op.getOperand(0);
   7983   SDValue Op1 = Op.getOperand(1);
   7984   DebugLoc dl = Op.getDebugLoc();
   7985   EVT VT = Op.getValueType();
   7986   EVT SrcVT = Op1.getValueType();
   7987 
   7988   // If second operand is smaller, extend it first.
   7989   if (SrcVT.bitsLT(VT)) {
   7990     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
   7991     SrcVT = VT;
   7992   }
   7993   // And if it is bigger, shrink it first.
   7994   if (SrcVT.bitsGT(VT)) {
   7995     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
   7996     SrcVT = VT;
   7997   }
   7998 
   7999   // At this point the operands and the result should have the same
   8000   // type, and that won't be f80 since that is not custom lowered.
   8001 
   8002   // First get the sign bit of second operand.
   8003   SmallVector<Constant*,4> CV;
   8004   if (SrcVT == MVT::f64) {
   8005     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
   8006     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
   8007   } else {
   8008     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
   8009     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8010     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8011     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8012   }
   8013   Constant *C = ConstantVector::get(CV);
   8014   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   8015   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
   8016                               MachinePointerInfo::getConstantPool(),
   8017                               false, false, false, 16);
   8018   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
   8019 
   8020   // Shift sign bit right or left if the two operands have different types.
   8021   if (SrcVT.bitsGT(VT)) {
   8022     // Op0 is MVT::f32, Op1 is MVT::f64.
   8023     SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
   8024     SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
   8025                           DAG.getConstant(32, MVT::i32));
   8026     SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
   8027     SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
   8028                           DAG.getIntPtrConstant(0));
   8029   }
   8030 
   8031   // Clear first operand sign bit.
   8032   CV.clear();
   8033   if (VT == MVT::f64) {
   8034     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
   8035     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
   8036   } else {
   8037     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
   8038     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8039     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8040     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8041   }
   8042   C = ConstantVector::get(CV);
   8043   CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   8044   SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   8045                               MachinePointerInfo::getConstantPool(),
   8046                               false, false, false, 16);
   8047   SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
   8048 
   8049   // Or the value with the sign bit.
   8050   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
   8051 }
   8052 
   8053 SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
   8054   SDValue N0 = Op.getOperand(0);
   8055   DebugLoc dl = Op.getDebugLoc();
   8056   EVT VT = Op.getValueType();
   8057 
   8058   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
   8059   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
   8060                                   DAG.getConstant(1, VT));
   8061   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
   8062 }
   8063 
   8064 /// Emit nodes that will be selected as "test Op0,Op0", or something
   8065 /// equivalent.
   8066 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
   8067                                     SelectionDAG &DAG) const {
   8068   DebugLoc dl = Op.getDebugLoc();
   8069 
   8070   // CF and OF aren't always set the way we want. Determine which
   8071   // of these we need.
   8072   bool NeedCF = false;
   8073   bool NeedOF = false;
   8074   switch (X86CC) {
   8075   default: break;
   8076   case X86::COND_A: case X86::COND_AE:
   8077   case X86::COND_B: case X86::COND_BE:
   8078     NeedCF = true;
   8079     break;
   8080   case X86::COND_G: case X86::COND_GE:
   8081   case X86::COND_L: case X86::COND_LE:
   8082   case X86::COND_O: case X86::COND_NO:
   8083     NeedOF = true;
   8084     break;
   8085   }
   8086 
   8087   // See if we can use the EFLAGS value from the operand instead of
   8088   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   8089   // we prove that the arithmetic won't overflow, we can't use OF or CF.
   8090   if (Op.getResNo() != 0 || NeedOF || NeedCF)
   8091     // Emit a CMP with 0, which is the TEST pattern.
   8092     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   8093                        DAG.getConstant(0, Op.getValueType()));
   8094 
   8095   unsigned Opcode = 0;
   8096   unsigned NumOperands = 0;
   8097   switch (Op.getNode()->getOpcode()) {
   8098   case ISD::ADD:
   8099     // Due to an isel shortcoming, be conservative if this add is likely to be
   8100     // selected as part of a load-modify-store instruction. When the root node
   8101     // in a match is a store, isel doesn't know how to remap non-chain non-flag
   8102     // uses of other nodes in the match, such as the ADD in this case. This
   8103     // leads to the ADD being left around and reselected, with the result being
   8104     // two adds in the output.  Alas, even if none our users are stores, that
   8105     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
   8106     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
   8107     // climbing the DAG back to the root, and it doesn't seem to be worth the
   8108     // effort.
   8109     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   8110          UE = Op.getNode()->use_end(); UI != UE; ++UI)
   8111       if (UI->getOpcode() != ISD::CopyToReg &&
   8112           UI->getOpcode() != ISD::SETCC &&
   8113           UI->getOpcode() != ISD::STORE)
   8114         goto default_case;
   8115 
   8116     if (ConstantSDNode *C =
   8117         dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
   8118       // An add of one will be selected as an INC.
   8119       if (C->getAPIntValue() == 1) {
   8120         Opcode = X86ISD::INC;
   8121         NumOperands = 1;
   8122         break;
   8123       }
   8124 
   8125       // An add of negative one (subtract of one) will be selected as a DEC.
   8126       if (C->getAPIntValue().isAllOnesValue()) {
   8127         Opcode = X86ISD::DEC;
   8128         NumOperands = 1;
   8129         break;
   8130       }
   8131     }
   8132 
   8133     // Otherwise use a regular EFLAGS-setting add.
   8134     Opcode = X86ISD::ADD;
   8135     NumOperands = 2;
   8136     break;
   8137   case ISD::AND: {
    8138     // If the primary result of the 'and' isn't used, don't bother using
    8139     // X86ISD::AND, because a TEST instruction will be better.
   8140     bool NonFlagUse = false;
   8141     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   8142            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
   8143       SDNode *User = *UI;
   8144       unsigned UOpNo = UI.getOperandNo();
   8145       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
   8146         // Look pass truncate.
   8147         UOpNo = User->use_begin().getOperandNo();
   8148         User = *User->use_begin();
   8149       }
   8150 
   8151       if (User->getOpcode() != ISD::BRCOND &&
   8152           User->getOpcode() != ISD::SETCC &&
   8153           (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
   8154         NonFlagUse = true;
   8155         break;
   8156       }
   8157     }
   8158 
   8159     if (!NonFlagUse)
   8160       break;
   8161   }
   8162     // FALL THROUGH
   8163   case ISD::SUB:
   8164   case ISD::OR:
   8165   case ISD::XOR:
   8166     // Due to the ISEL shortcoming noted above, be conservative if this op is
   8167     // likely to be selected as part of a load-modify-store instruction.
   8168     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   8169            UE = Op.getNode()->use_end(); UI != UE; ++UI)
   8170       if (UI->getOpcode() == ISD::STORE)
   8171         goto default_case;
   8172 
   8173     // Otherwise use a regular EFLAGS-setting instruction.
   8174     switch (Op.getNode()->getOpcode()) {
   8175     default: llvm_unreachable("unexpected operator!");
   8176     case ISD::SUB: Opcode = X86ISD::SUB; break;
   8177     case ISD::OR:  Opcode = X86ISD::OR;  break;
   8178     case ISD::XOR: Opcode = X86ISD::XOR; break;
   8179     case ISD::AND: Opcode = X86ISD::AND; break;
   8180     }
   8181 
   8182     NumOperands = 2;
   8183     break;
   8184   case X86ISD::ADD:
   8185   case X86ISD::SUB:
   8186   case X86ISD::INC:
   8187   case X86ISD::DEC:
   8188   case X86ISD::OR:
   8189   case X86ISD::XOR:
   8190   case X86ISD::AND:
   8191     return SDValue(Op.getNode(), 1);
   8192   default:
   8193   default_case:
   8194     break;
   8195   }
   8196 
   8197   if (Opcode == 0)
   8198     // Emit a CMP with 0, which is the TEST pattern.
   8199     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   8200                        DAG.getConstant(0, Op.getValueType()));
   8201 
   8202   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   8203   SmallVector<SDValue, 4> Ops;
   8204   for (unsigned i = 0; i != NumOperands; ++i)
   8205     Ops.push_back(Op.getOperand(i));
   8206 
   8207   SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
   8208   DAG.ReplaceAllUsesWith(Op, New);
   8209   return SDValue(New.getNode(), 1);
   8210 }
   8211 
   8212 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
   8213 /// equivalent.
   8214 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   8215                                    SelectionDAG &DAG) const {
   8216   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
   8217     if (C->getAPIntValue() == 0)
   8218       return EmitTest(Op0, X86CC, DAG);
   8219 
   8220   DebugLoc dl = Op0.getDebugLoc();
   8221   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
   8222 }
   8223 
   8224 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
   8225 /// if it's possible.
   8226 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
   8227                                      DebugLoc dl, SelectionDAG &DAG) const {
   8228   SDValue Op0 = And.getOperand(0);
   8229   SDValue Op1 = And.getOperand(1);
   8230   if (Op0.getOpcode() == ISD::TRUNCATE)
   8231     Op0 = Op0.getOperand(0);
   8232   if (Op1.getOpcode() == ISD::TRUNCATE)
   8233     Op1 = Op1.getOperand(0);
   8234 
   8235   SDValue LHS, RHS;
   8236   if (Op1.getOpcode() == ISD::SHL)
   8237     std::swap(Op0, Op1);
   8238   if (Op0.getOpcode() == ISD::SHL) {
   8239     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
   8240       if (And00C->getZExtValue() == 1) {
   8241         // If we looked past a truncate, check that it's only truncating away
   8242         // known zeros.
   8243         unsigned BitWidth = Op0.getValueSizeInBits();
   8244         unsigned AndBitWidth = And.getValueSizeInBits();
   8245         if (BitWidth > AndBitWidth) {
   8246           APInt Zeros, Ones;
   8247           DAG.ComputeMaskedBits(Op0, Zeros, Ones);
   8248           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
   8249             return SDValue();
   8250         }
   8251         LHS = Op1;
   8252         RHS = Op0.getOperand(1);
   8253       }
   8254   } else if (Op1.getOpcode() == ISD::Constant) {
   8255     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
   8256     uint64_t AndRHSVal = AndRHS->getZExtValue();
   8257     SDValue AndLHS = Op0;
   8258 
   8259     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
   8260       LHS = AndLHS.getOperand(0);
   8261       RHS = AndLHS.getOperand(1);
   8262     }
   8263 
   8264     // Use BT if the immediate can't be encoded in a TEST instruction.
   8265     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
   8266       LHS = AndLHS;
   8267       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
   8268     }
   8269   }
   8270 
   8271   if (LHS.getNode()) {
   8272     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
   8273     // instruction.  Since the shift amount is in-range-or-undefined, we know
   8274     // that doing a bittest on the i32 value is ok.  We extend to i32 because
   8275     // the encoding for the i16 version is larger than the i32 version.
   8276     // Also promote i16 to i32 for performance / code size reason.
   8277     if (LHS.getValueType() == MVT::i8 ||
   8278         LHS.getValueType() == MVT::i16)
   8279       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
   8280 
   8281     // If the operand types disagree, extend the shift amount to match.  Since
   8282     // BT ignores high bits (like shifts) we can use anyextend.
   8283     if (LHS.getValueType() != RHS.getValueType())
   8284       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
   8285 
   8286     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
   8287     unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
   8288     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   8289                        DAG.getConstant(Cond, MVT::i8), BT);
   8290   }
   8291 
   8292   return SDValue();
   8293 }
   8294 
   8295 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   8296 
   8297   if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
   8298 
   8299   assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
   8300   SDValue Op0 = Op.getOperand(0);
   8301   SDValue Op1 = Op.getOperand(1);
   8302   DebugLoc dl = Op.getDebugLoc();
   8303   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   8304 
   8305   // Optimize to BT if possible.
   8306   // Lower (X & (1 << N)) == 0 to BT(X, N).
   8307   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
   8308   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
   8309   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
   8310       Op1.getOpcode() == ISD::Constant &&
   8311       cast<ConstantSDNode>(Op1)->isNullValue() &&
   8312       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   8313     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
   8314     if (NewSetCC.getNode())
   8315       return NewSetCC;
   8316   }
   8317 
   8318   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
   8319   // these.
   8320   if (Op1.getOpcode() == ISD::Constant &&
   8321       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
   8322        cast<ConstantSDNode>(Op1)->isNullValue()) &&
   8323       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   8324 
   8325     // If the input is a setcc, then reuse the input setcc or use a new one with
   8326     // the inverted condition.
   8327     if (Op0.getOpcode() == X86ISD::SETCC) {
   8328       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
   8329       bool Invert = (CC == ISD::SETNE) ^
   8330         cast<ConstantSDNode>(Op1)->isNullValue();
   8331       if (!Invert) return Op0;
   8332 
   8333       CCode = X86::GetOppositeBranchCondition(CCode);
   8334       return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   8335                          DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
   8336     }
   8337   }
   8338 
   8339   bool isFP = Op1.getValueType().isFloatingPoint();
   8340   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
   8341   if (X86CC == X86::COND_INVALID)
   8342     return SDValue();
   8343 
   8344   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
   8345   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   8346                      DAG.getConstant(X86CC, MVT::i8), EFLAGS);
   8347 }
   8348 
   8349 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
   8350 // ones, and then concatenate the result back.
   8351 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
   8352   EVT VT = Op.getValueType();
   8353 
   8354   assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
   8355          "Unsupported value type for operation");
   8356 
   8357   int NumElems = VT.getVectorNumElements();
   8358   DebugLoc dl = Op.getDebugLoc();
   8359   SDValue CC = Op.getOperand(2);
   8360   SDValue Idx0 = DAG.getConstant(0, MVT::i32);
   8361   SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
   8362 
   8363   // Extract the LHS vectors
   8364   SDValue LHS = Op.getOperand(0);
   8365   SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
   8366   SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
   8367 
   8368   // Extract the RHS vectors
   8369   SDValue RHS = Op.getOperand(1);
   8370   SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
   8371   SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
   8372 
   8373   // Issue the operation on the smaller types and concatenate the result back
   8374   MVT EltVT = VT.getVectorElementType().getSimpleVT();
   8375   EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   8376   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   8377                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
   8378                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
   8379 }
   8380 
   8381 
   8382 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
   8383   SDValue Cond;
   8384   SDValue Op0 = Op.getOperand(0);
   8385   SDValue Op1 = Op.getOperand(1);
   8386   SDValue CC = Op.getOperand(2);
   8387   EVT VT = Op.getValueType();
   8388   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   8389   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
   8390   DebugLoc dl = Op.getDebugLoc();
   8391 
   8392   if (isFP) {
   8393     unsigned SSECC = 8;
   8394     EVT EltVT = Op0.getValueType().getVectorElementType();
   8395     assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
   8396 
   8397     bool Swap = false;
   8398 
   8399     // SSE Condition code mapping:
   8400     //  0 - EQ
   8401     //  1 - LT
   8402     //  2 - LE
   8403     //  3 - UNORD
   8404     //  4 - NEQ
   8405     //  5 - NLT
   8406     //  6 - NLE
   8407     //  7 - ORD
   8408     switch (SetCCOpcode) {
   8409     default: break;
   8410     case ISD::SETOEQ:
   8411     case ISD::SETEQ:  SSECC = 0; break;
   8412     case ISD::SETOGT:
   8413     case ISD::SETGT: Swap = true; // Fallthrough
   8414     case ISD::SETLT:
   8415     case ISD::SETOLT: SSECC = 1; break;
   8416     case ISD::SETOGE:
   8417     case ISD::SETGE: Swap = true; // Fallthrough
   8418     case ISD::SETLE:
   8419     case ISD::SETOLE: SSECC = 2; break;
   8420     case ISD::SETUO:  SSECC = 3; break;
   8421     case ISD::SETUNE:
   8422     case ISD::SETNE:  SSECC = 4; break;
   8423     case ISD::SETULE: Swap = true;
   8424     case ISD::SETUGE: SSECC = 5; break;
   8425     case ISD::SETULT: Swap = true;
   8426     case ISD::SETUGT: SSECC = 6; break;
   8427     case ISD::SETO:   SSECC = 7; break;
   8428     }
   8429     if (Swap)
   8430       std::swap(Op0, Op1);
   8431 
    8432     // For the two cases a single CMPP cannot express, emit two comparisons.
   8433     if (SSECC == 8) {
   8434       if (SetCCOpcode == ISD::SETUEQ) {
   8435         SDValue UNORD, EQ;
   8436         UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8437                             DAG.getConstant(3, MVT::i8));
   8438         EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8439                          DAG.getConstant(0, MVT::i8));
   8440         return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
   8441       } else if (SetCCOpcode == ISD::SETONE) {
   8442         SDValue ORD, NEQ;
   8443         ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8444                           DAG.getConstant(7, MVT::i8));
   8445         NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8446                           DAG.getConstant(4, MVT::i8));
   8447         return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
   8448       }
   8449       llvm_unreachable("Illegal FP comparison");
   8450     }
   8451     // Handle all other FP comparisons here.
   8452     return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8453                        DAG.getConstant(SSECC, MVT::i8));
   8454   }
   8455 
   8456   // Break 256-bit integer vector compare into smaller ones.
   8457   if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
   8458     return Lower256IntVSETCC(Op, DAG);
   8459 
   8460   // We are handling one of the integer comparisons here.  Since SSE only has
    8461   // GT and EQ comparisons for integers, swapping operands and multiple
   8462   // operations may be required for some comparisons.
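           // For example, (setule a, b) becomes NOT(PCMPGT(flip(a), flip(b))):
           // flip the sign bits so unsigned order matches signed order, compare
           // with PCMPGT, and invert the resulting mask.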
   8463   unsigned Opc = 0;
   8464   bool Swap = false, Invert = false, FlipSigns = false;
   8465 
   8466   switch (SetCCOpcode) {
   8467   default: break;
   8468   case ISD::SETNE:  Invert = true;
   8469   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
   8470   case ISD::SETLT:  Swap = true;
   8471   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
   8472   case ISD::SETGE:  Swap = true;
   8473   case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
   8474   case ISD::SETULT: Swap = true;
   8475   case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
   8476   case ISD::SETUGE: Swap = true;
   8477   case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
   8478   }
   8479   if (Swap)
   8480     std::swap(Op0, Op1);
   8481 
   8482   // Check that the operation in question is available (most are plain SSE2,
   8483   // but PCMPGTQ and PCMPEQQ have different requirements).
   8484   if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
   8485     return SDValue();
   8486   if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
   8487     return SDValue();
   8488 
    8489   // Since SSE has no unsigned integer comparisons, we need to flip the sign
   8490   // bits of the inputs before performing those operations.
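           // XOR-ing with the sign bit maps unsigned order onto signed order;
           // e.g. for i32 elements, a <u b iff (a ^ 0x80000000) <s (b ^ 0x80000000).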
   8491   if (FlipSigns) {
   8492     EVT EltVT = VT.getVectorElementType();
   8493     SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
   8494                                       EltVT);
   8495     std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
   8496     SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
   8497                                     SignBits.size());
   8498     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
   8499     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
   8500   }
   8501 
   8502   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
   8503 
   8504   // If the logical-not of the result is required, perform that now.
   8505   if (Invert)
   8506     Result = DAG.getNOT(dl, Result, VT);
   8507 
   8508   return Result;
   8509 }
   8510 
    8511 // isX86LogicalCmp - Return true if the opcode is an X86 logical comparison.
   8512 static bool isX86LogicalCmp(SDValue Op) {
   8513   unsigned Opc = Op.getNode()->getOpcode();
   8514   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
   8515     return true;
   8516   if (Op.getResNo() == 1 &&
   8517       (Opc == X86ISD::ADD ||
   8518        Opc == X86ISD::SUB ||
   8519        Opc == X86ISD::ADC ||
   8520        Opc == X86ISD::SBB ||
   8521        Opc == X86ISD::SMUL ||
   8522        Opc == X86ISD::UMUL ||
   8523        Opc == X86ISD::INC ||
   8524        Opc == X86ISD::DEC ||
   8525        Opc == X86ISD::OR ||
   8526        Opc == X86ISD::XOR ||
   8527        Opc == X86ISD::AND))
   8528     return true;
   8529 
   8530   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
   8531     return true;
   8532 
   8533   return false;
   8534 }
   8535 
   8536 static bool isZero(SDValue V) {
   8537   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   8538   return C && C->isNullValue();
   8539 }
   8540 
   8541 static bool isAllOnes(SDValue V) {
   8542   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   8543   return C && C->isAllOnesValue();
   8544 }
   8545 
   8546 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   8547   bool addTest = true;
   8548   SDValue Cond  = Op.getOperand(0);
   8549   SDValue Op1 = Op.getOperand(1);
   8550   SDValue Op2 = Op.getOperand(2);
   8551   DebugLoc DL = Op.getDebugLoc();
   8552   SDValue CC;
   8553 
   8554   if (Cond.getOpcode() == ISD::SETCC) {
   8555     SDValue NewCond = LowerSETCC(Cond, DAG);
   8556     if (NewCond.getNode())
   8557       Cond = NewCond;
   8558   }
   8559 
   8560   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
   8561   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
   8562   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
   8563   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
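           // For example, for (select (x == 0), -1, y): the CMP against 1 below
           // sets the carry flag exactly when x is 0 (unsigned x < 1), SETCC_CARRY
           // (an SBB) materializes the all-ones/zero mask, and OR-ing with y then
           // yields -1 or y.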
   8564   if (Cond.getOpcode() == X86ISD::SETCC &&
   8565       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
   8566       isZero(Cond.getOperand(1).getOperand(1))) {
   8567     SDValue Cmp = Cond.getOperand(1);
   8568 
   8569     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
   8570 
   8571     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
   8572         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
   8573       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
   8574 
   8575       SDValue CmpOp0 = Cmp.getOperand(0);
   8576       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
   8577                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
   8578 
   8579       SDValue Res =   // Res = 0 or -1.
   8580         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   8581                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
   8582 
   8583       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
   8584         Res = DAG.getNOT(DL, Res, Res.getValueType());
   8585 
   8586       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
   8587       if (N2C == 0 || !N2C->isNullValue())
   8588         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
   8589       return Res;
   8590     }
   8591   }
   8592 
   8593   // Look past (and (setcc_carry (cmp ...)), 1).
   8594   if (Cond.getOpcode() == ISD::AND &&
   8595       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   8596     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   8597     if (C && C->getAPIntValue() == 1)
   8598       Cond = Cond.getOperand(0);
   8599   }
   8600 
   8601   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   8602   // setting operand in place of the X86ISD::SETCC.
   8603   unsigned CondOpcode = Cond.getOpcode();
   8604   if (CondOpcode == X86ISD::SETCC ||
   8605       CondOpcode == X86ISD::SETCC_CARRY) {
   8606     CC = Cond.getOperand(0);
   8607 
   8608     SDValue Cmp = Cond.getOperand(1);
   8609     unsigned Opc = Cmp.getOpcode();
   8610     EVT VT = Op.getValueType();
   8611 
   8612     bool IllegalFPCMov = false;
   8613     if (VT.isFloatingPoint() && !VT.isVector() &&
   8614         !isScalarFPTypeInSSEReg(VT))  // FPStack?
   8615       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
   8616 
   8617     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
   8618         Opc == X86ISD::BT) { // FIXME
   8619       Cond = Cmp;
   8620       addTest = false;
   8621     }
   8622   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   8623              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   8624              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   8625               Cond.getOperand(0).getValueType() != MVT::i8)) {
   8626     SDValue LHS = Cond.getOperand(0);
   8627     SDValue RHS = Cond.getOperand(1);
   8628     unsigned X86Opcode;
   8629     unsigned X86Cond;
   8630     SDVTList VTs;
   8631     switch (CondOpcode) {
   8632     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   8633     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   8634     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   8635     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   8636     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   8637     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   8638     default: llvm_unreachable("unexpected overflowing operator");
   8639     }
   8640     if (CondOpcode == ISD::UMULO)
   8641       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   8642                           MVT::i32);
   8643     else
   8644       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   8645 
   8646     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
   8647 
   8648     if (CondOpcode == ISD::UMULO)
   8649       Cond = X86Op.getValue(2);
   8650     else
   8651       Cond = X86Op.getValue(1);
   8652 
   8653     CC = DAG.getConstant(X86Cond, MVT::i8);
   8654     addTest = false;
   8655   }
   8656 
   8657   if (addTest) {
    8658     // Look past the truncate.
   8659     if (Cond.getOpcode() == ISD::TRUNCATE)
   8660       Cond = Cond.getOperand(0);
   8661 
   8662     // We know the result of AND is compared against zero. Try to match
   8663     // it to BT.
   8664     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   8665       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
   8666       if (NewSetCC.getNode()) {
   8667         CC = NewSetCC.getOperand(0);
   8668         Cond = NewSetCC.getOperand(1);
   8669         addTest = false;
   8670       }
   8671     }
   8672   }
   8673 
   8674   if (addTest) {
   8675     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   8676     Cond = EmitTest(Cond, X86::COND_NE, DAG);
   8677   }
   8678 
    8679   // a <  b ? -1 :  0 -> RES = setcc_carry
    8680   // a <  b ?  0 : -1 -> RES = ~setcc_carry
    8681   // a >= b ? -1 :  0 -> RES = ~setcc_carry
    8682   // a >= b ?  0 : -1 -> RES = setcc_carry
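           // For example, for a <u b ? -1 : 0, CMP a, b sets the carry flag
           // exactly when a <u b, so the SBB-based SETCC_CARRY already yields the
           // desired mask and no NOT is needed.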
   8683   if (Cond.getOpcode() == X86ISD::CMP) {
   8684     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
   8685 
   8686     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
   8687         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
   8688       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   8689                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
   8690       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
   8691         return DAG.getNOT(DL, Res, Res.getValueType());
   8692       return Res;
   8693     }
   8694   }
   8695 
   8696   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
    8697   // the condition is true.
   8698   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   8699   SDValue Ops[] = { Op2, Op1, CC, Cond };
   8700   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
   8701 }
   8702 
    8703 // isAndOrOfSetCCs - Return true if the node is an ISD::AND or
   8704 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
   8705 // from the AND / OR.
   8706 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
   8707   Opc = Op.getOpcode();
   8708   if (Opc != ISD::OR && Opc != ISD::AND)
   8709     return false;
   8710   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   8711           Op.getOperand(0).hasOneUse() &&
   8712           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
   8713           Op.getOperand(1).hasOneUse());
   8714 }
   8715 
    8716 // isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
    8717 // and 1, and the SETCC node has a single use.
   8718 static bool isXor1OfSetCC(SDValue Op) {
   8719   if (Op.getOpcode() != ISD::XOR)
   8720     return false;
   8721   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   8722   if (N1C && N1C->getAPIntValue() == 1) {
   8723     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   8724       Op.getOperand(0).hasOneUse();
   8725   }
   8726   return false;
   8727 }
   8728 
   8729 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   8730   bool addTest = true;
   8731   SDValue Chain = Op.getOperand(0);
   8732   SDValue Cond  = Op.getOperand(1);
   8733   SDValue Dest  = Op.getOperand(2);
   8734   DebugLoc dl = Op.getDebugLoc();
   8735   SDValue CC;
   8736   bool Inverted = false;
   8737 
   8738   if (Cond.getOpcode() == ISD::SETCC) {
   8739     // Check for setcc([su]{add,sub,mul}o == 0).
   8740     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
   8741         isa<ConstantSDNode>(Cond.getOperand(1)) &&
   8742         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
   8743         Cond.getOperand(0).getResNo() == 1 &&
   8744         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
   8745          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
   8746          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
   8747          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
   8748          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
   8749          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
   8750       Inverted = true;
   8751       Cond = Cond.getOperand(0);
   8752     } else {
   8753       SDValue NewCond = LowerSETCC(Cond, DAG);
   8754       if (NewCond.getNode())
   8755         Cond = NewCond;
   8756     }
   8757   }
   8758 #if 0
   8759   // FIXME: LowerXALUO doesn't handle these!!
   8760   else if (Cond.getOpcode() == X86ISD::ADD  ||
   8761            Cond.getOpcode() == X86ISD::SUB  ||
   8762            Cond.getOpcode() == X86ISD::SMUL ||
   8763            Cond.getOpcode() == X86ISD::UMUL)
   8764     Cond = LowerXALUO(Cond, DAG);
   8765 #endif
   8766 
    8767   // Look past (and (setcc_carry (cmp ...)), 1).
   8768   if (Cond.getOpcode() == ISD::AND &&
   8769       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   8770     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   8771     if (C && C->getAPIntValue() == 1)
   8772       Cond = Cond.getOperand(0);
   8773   }
   8774 
   8775   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   8776   // setting operand in place of the X86ISD::SETCC.
   8777   unsigned CondOpcode = Cond.getOpcode();
   8778   if (CondOpcode == X86ISD::SETCC ||
   8779       CondOpcode == X86ISD::SETCC_CARRY) {
   8780     CC = Cond.getOperand(0);
   8781 
   8782     SDValue Cmp = Cond.getOperand(1);
   8783     unsigned Opc = Cmp.getOpcode();
   8784     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
   8785     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
   8786       Cond = Cmp;
   8787       addTest = false;
   8788     } else {
   8789       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
   8790       default: break;
   8791       case X86::COND_O:
   8792       case X86::COND_B:
   8793         // These can only come from an arithmetic instruction with overflow,
   8794         // e.g. SADDO, UADDO.
   8795         Cond = Cond.getNode()->getOperand(1);
   8796         addTest = false;
   8797         break;
   8798       }
   8799     }
   8800   }
   8801   CondOpcode = Cond.getOpcode();
   8802   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   8803       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   8804       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   8805        Cond.getOperand(0).getValueType() != MVT::i8)) {
   8806     SDValue LHS = Cond.getOperand(0);
   8807     SDValue RHS = Cond.getOperand(1);
   8808     unsigned X86Opcode;
   8809     unsigned X86Cond;
   8810     SDVTList VTs;
   8811     switch (CondOpcode) {
   8812     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   8813     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   8814     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   8815     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   8816     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   8817     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   8818     default: llvm_unreachable("unexpected overflowing operator");
   8819     }
   8820     if (Inverted)
   8821       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
   8822     if (CondOpcode == ISD::UMULO)
   8823       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   8824                           MVT::i32);
   8825     else
   8826       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   8827 
   8828     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
   8829 
   8830     if (CondOpcode == ISD::UMULO)
   8831       Cond = X86Op.getValue(2);
   8832     else
   8833       Cond = X86Op.getValue(1);
   8834 
   8835     CC = DAG.getConstant(X86Cond, MVT::i8);
   8836     addTest = false;
   8837   } else {
   8838     unsigned CondOpc;
   8839     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
   8840       SDValue Cmp = Cond.getOperand(0).getOperand(1);
   8841       if (CondOpc == ISD::OR) {
   8842         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
   8843         // two branches instead of an explicit OR instruction with a
   8844         // separate test.
   8845         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   8846             isX86LogicalCmp(Cmp)) {
   8847           CC = Cond.getOperand(0).getOperand(0);
   8848           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8849                               Chain, Dest, CC, Cmp);
   8850           CC = Cond.getOperand(1).getOperand(0);
   8851           Cond = Cmp;
   8852           addTest = false;
   8853         }
   8854       } else { // ISD::AND
   8855         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
   8856         // two branches instead of an explicit AND instruction with a
   8857         // separate test. However, we only do this if this block doesn't
   8858         // have a fall-through edge, because this requires an explicit
   8859         // jmp when the condition is false.
   8860         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   8861             isX86LogicalCmp(Cmp) &&
   8862             Op.getNode()->hasOneUse()) {
   8863           X86::CondCode CCode =
   8864             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   8865           CCode = X86::GetOppositeBranchCondition(CCode);
   8866           CC = DAG.getConstant(CCode, MVT::i8);
   8867           SDNode *User = *Op.getNode()->use_begin();
   8868           // Look for an unconditional branch following this conditional branch.
   8869           // We need this because we need to reverse the successors in order
   8870           // to implement FCMP_OEQ.
   8871           if (User->getOpcode() == ISD::BR) {
   8872             SDValue FalseBB = User->getOperand(1);
   8873             SDNode *NewBR =
   8874               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   8875             assert(NewBR == User);
   8876             (void)NewBR;
   8877             Dest = FalseBB;
   8878 
   8879             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8880                                 Chain, Dest, CC, Cmp);
   8881             X86::CondCode CCode =
   8882               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
   8883             CCode = X86::GetOppositeBranchCondition(CCode);
   8884             CC = DAG.getConstant(CCode, MVT::i8);
   8885             Cond = Cmp;
   8886             addTest = false;
   8887           }
   8888         }
   8889       }
   8890     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
    8891       // Recognize the (xor (setcc), 1) pattern; the xor inverts the condition.
    8892       // It should have been transformed by the DAG combiner, except when the
    8893       // condition is set by an arithmetic-with-overflow node.
   8894       X86::CondCode CCode =
   8895         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   8896       CCode = X86::GetOppositeBranchCondition(CCode);
   8897       CC = DAG.getConstant(CCode, MVT::i8);
   8898       Cond = Cond.getOperand(0).getOperand(1);
   8899       addTest = false;
   8900     } else if (Cond.getOpcode() == ISD::SETCC &&
   8901                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
   8902       // For FCMP_OEQ, we can emit
   8903       // two branches instead of an explicit AND instruction with a
   8904       // separate test. However, we only do this if this block doesn't
   8905       // have a fall-through edge, because this requires an explicit
   8906       // jmp when the condition is false.
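               // Roughly, "br (a oeq b), T, F" becomes: compare a and b, branch
               // to F on NE, branch to F on P (unordered), then take the
               // unconditional branch to T.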
   8907       if (Op.getNode()->hasOneUse()) {
   8908         SDNode *User = *Op.getNode()->use_begin();
   8909         // Look for an unconditional branch following this conditional branch.
   8910         // We need this because we need to reverse the successors in order
   8911         // to implement FCMP_OEQ.
   8912         if (User->getOpcode() == ISD::BR) {
   8913           SDValue FalseBB = User->getOperand(1);
   8914           SDNode *NewBR =
   8915             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   8916           assert(NewBR == User);
   8917           (void)NewBR;
   8918           Dest = FalseBB;
   8919 
   8920           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   8921                                     Cond.getOperand(0), Cond.getOperand(1));
   8922           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   8923           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8924                               Chain, Dest, CC, Cmp);
   8925           CC = DAG.getConstant(X86::COND_P, MVT::i8);
   8926           Cond = Cmp;
   8927           addTest = false;
   8928         }
   8929       }
   8930     } else if (Cond.getOpcode() == ISD::SETCC &&
   8931                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
   8932       // For FCMP_UNE, we can emit
   8933       // two branches instead of an explicit AND instruction with a
   8934       // separate test. However, we only do this if this block doesn't
   8935       // have a fall-through edge, because this requires an explicit
   8936       // jmp when the condition is false.
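               // Roughly, "br (a une b), T, F" becomes: compare a and b, branch
               // to T on NE, branch to F on NP (ordered and equal), and otherwise
               // fall into the unconditional branch to T (the unordered case).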
   8937       if (Op.getNode()->hasOneUse()) {
   8938         SDNode *User = *Op.getNode()->use_begin();
   8939         // Look for an unconditional branch following this conditional branch.
   8940         // We need this because we need to reverse the successors in order
   8941         // to implement FCMP_UNE.
   8942         if (User->getOpcode() == ISD::BR) {
   8943           SDValue FalseBB = User->getOperand(1);
   8944           SDNode *NewBR =
   8945             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   8946           assert(NewBR == User);
   8947           (void)NewBR;
   8948 
   8949           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   8950                                     Cond.getOperand(0), Cond.getOperand(1));
   8951           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   8952           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8953                               Chain, Dest, CC, Cmp);
   8954           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
   8955           Cond = Cmp;
   8956           addTest = false;
   8957           Dest = FalseBB;
   8958         }
   8959       }
   8960     }
   8961   }
   8962 
   8963   if (addTest) {
    8964     // Look past the truncate.
   8965     if (Cond.getOpcode() == ISD::TRUNCATE)
   8966       Cond = Cond.getOperand(0);
   8967 
   8968     // We know the result of AND is compared against zero. Try to match
   8969     // it to BT.
   8970     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   8971       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
   8972       if (NewSetCC.getNode()) {
   8973         CC = NewSetCC.getOperand(0);
   8974         Cond = NewSetCC.getOperand(1);
   8975         addTest = false;
   8976       }
   8977     }
   8978   }
   8979 
   8980   if (addTest) {
   8981     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   8982     Cond = EmitTest(Cond, X86::COND_NE, DAG);
   8983   }
   8984   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8985                      Chain, Dest, CC, Cond);
   8986 }
   8987 
   8988 
    8989 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
    8990 // Calls to _alloca are needed to probe the stack when allocating more than 4K
    8991 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
    8992 // that the guard pages used by the OS virtual memory manager are allocated in
    8993 // the correct sequence.
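         // For example, a 16K allocation should touch the stack at each
         // intervening 4K page (as a chkstk-style probe loop does) so the OS
         // commits the guard pages one at a time instead of faulting on an
         // access far below the committed region.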
   8994 SDValue
   8995 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   8996                                            SelectionDAG &DAG) const {
   8997   assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
   8998           getTargetMachine().Options.EnableSegmentedStacks) &&
   8999          "This should be used only on Windows targets or when segmented stacks "
   9000          "are being used");
   9001   assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
   9002   DebugLoc dl = Op.getDebugLoc();
   9003 
   9004   // Get the inputs.
   9005   SDValue Chain = Op.getOperand(0);
   9006   SDValue Size  = Op.getOperand(1);
   9007   // FIXME: Ensure alignment here
   9008 
   9009   bool Is64Bit = Subtarget->is64Bit();
   9010   EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
   9011 
   9012   if (getTargetMachine().Options.EnableSegmentedStacks) {
   9013     MachineFunction &MF = DAG.getMachineFunction();
   9014     MachineRegisterInfo &MRI = MF.getRegInfo();
   9015 
   9016     if (Is64Bit) {
    9017       // The 64-bit implementation of segmented stacks needs to clobber both r10
    9018       // and r11. This makes it impossible to use it along with nested parameters.
   9019       const Function *F = MF.getFunction();
   9020 
   9021       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
   9022            I != E; I++)
   9023         if (I->hasNestAttr())
   9024           report_fatal_error("Cannot use segmented stacks with functions that "
   9025                              "have nested arguments.");
   9026     }
   9027 
   9028     const TargetRegisterClass *AddrRegClass =
   9029       getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
   9030     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
   9031     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
   9032     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
   9033                                 DAG.getRegister(Vreg, SPTy));
   9034     SDValue Ops1[2] = { Value, Chain };
   9035     return DAG.getMergeValues(Ops1, 2, dl);
   9036   } else {
   9037     SDValue Flag;
   9038     unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
   9039 
   9040     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
   9041     Flag = Chain.getValue(1);
   9042     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   9043 
   9044     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
   9045     Flag = Chain.getValue(1);
   9046 
   9047     Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
   9048 
   9049     SDValue Ops1[2] = { Chain.getValue(0), Chain };
   9050     return DAG.getMergeValues(Ops1, 2, dl);
   9051   }
   9052 }
   9053 
   9054 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   9055   MachineFunction &MF = DAG.getMachineFunction();
   9056   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   9057 
   9058   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   9059   DebugLoc DL = Op.getDebugLoc();
   9060 
   9061   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
   9062     // vastart just stores the address of the VarArgsFrameIndex slot into the
   9063     // memory location argument.
   9064     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
   9065                                    getPointerTy());
   9066     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
   9067                         MachinePointerInfo(SV), false, false, 0);
   9068   }
   9069 
   9070   // __va_list_tag:
    9071   //   gp_offset         (0 .. 6 * 8)
    9072   //   fp_offset         (48 .. 48 + 8 * 16)
    9073   //   overflow_arg_area (points to parameters passed in memory).
   9074   //   reg_save_area
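           // Equivalently, the SysV AMD64 va_list element is laid out as:
           //   struct __va_list_tag {
           //     unsigned gp_offset;          // byte offset 0
           //     unsigned fp_offset;          // byte offset 4
           //     void *overflow_arg_area;     // byte offset 8
           //     void *reg_save_area;         // byte offset 16
           //   };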
   9075   SmallVector<SDValue, 8> MemOps;
   9076   SDValue FIN = Op.getOperand(1);
   9077   // Store gp_offset
   9078   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
   9079                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
   9080                                                MVT::i32),
   9081                                FIN, MachinePointerInfo(SV), false, false, 0);
   9082   MemOps.push_back(Store);
   9083 
   9084   // Store fp_offset
   9085   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   9086                     FIN, DAG.getIntPtrConstant(4));
   9087   Store = DAG.getStore(Op.getOperand(0), DL,
   9088                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
   9089                                        MVT::i32),
   9090                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
   9091   MemOps.push_back(Store);
   9092 
   9093   // Store ptr to overflow_arg_area
   9094   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   9095                     FIN, DAG.getIntPtrConstant(4));
   9096   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
   9097                                     getPointerTy());
   9098   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
   9099                        MachinePointerInfo(SV, 8),
   9100                        false, false, 0);
   9101   MemOps.push_back(Store);
   9102 
   9103   // Store ptr to reg_save_area.
   9104   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   9105                     FIN, DAG.getIntPtrConstant(8));
   9106   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   9107                                     getPointerTy());
   9108   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
   9109                        MachinePointerInfo(SV, 16), false, false, 0);
   9110   MemOps.push_back(Store);
   9111   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   9112                      &MemOps[0], MemOps.size());
   9113 }
   9114 
   9115 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   9116   assert(Subtarget->is64Bit() &&
   9117          "LowerVAARG only handles 64-bit va_arg!");
   9118   assert((Subtarget->isTargetLinux() ||
   9119           Subtarget->isTargetDarwin()) &&
   9120           "Unhandled target in LowerVAARG");
   9121   assert(Op.getNode()->getNumOperands() == 4);
   9122   SDValue Chain = Op.getOperand(0);
   9123   SDValue SrcPtr = Op.getOperand(1);
   9124   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   9125   unsigned Align = Op.getConstantOperandVal(3);
   9126   DebugLoc dl = Op.getDebugLoc();
   9127 
   9128   EVT ArgVT = Op.getNode()->getValueType(0);
   9129   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   9130   uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
   9131   uint8_t ArgMode;
   9132 
   9133   // Decide which area this value should be read from.
   9134   // TODO: Implement the AMD64 ABI in its entirety. This simple
   9135   // selection mechanism works only for the basic types.
   9136   if (ArgVT == MVT::f80) {
   9137     llvm_unreachable("va_arg for f80 not yet implemented");
   9138   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
   9139     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
   9140   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
   9141     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
   9142   } else {
   9143     llvm_unreachable("Unhandled argument type in LowerVAARG");
   9144   }
   9145 
   9146   if (ArgMode == 2) {
   9147     // Sanity Check: Make sure using fp_offset makes sense.
   9148     assert(!getTargetMachine().Options.UseSoftFloat &&
   9149            !(DAG.getMachineFunction()
   9150                 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
   9151            Subtarget->hasSSE1());
   9152   }
   9153 
   9154   // Insert VAARG_64 node into the DAG
   9155   // VAARG_64 returns two values: Variable Argument Address, Chain
   9156   SmallVector<SDValue, 11> InstOps;
   9157   InstOps.push_back(Chain);
   9158   InstOps.push_back(SrcPtr);
   9159   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
   9160   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
   9161   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
   9162   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
   9163   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
   9164                                           VTs, &InstOps[0], InstOps.size(),
   9165                                           MVT::i64,
   9166                                           MachinePointerInfo(SV),
   9167                                           /*Align=*/0,
   9168                                           /*Volatile=*/false,
   9169                                           /*ReadMem=*/true,
   9170                                           /*WriteMem=*/true);
   9171   Chain = VAARG.getValue(1);
   9172 
   9173   // Load the next argument and return it
   9174   return DAG.getLoad(ArgVT, dl,
   9175                      Chain,
   9176                      VAARG,
   9177                      MachinePointerInfo(),
   9178                      false, false, false, 0);
   9179 }
   9180 
   9181 SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
   9182   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
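           // That struct occupies 24 bytes (4 + 4 + 8 + 8), which is why the
           // fixed 24-byte memcpy below copies the whole va_list.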
   9183   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
   9184   SDValue Chain = Op.getOperand(0);
   9185   SDValue DstPtr = Op.getOperand(1);
   9186   SDValue SrcPtr = Op.getOperand(2);
   9187   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   9188   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   9189   DebugLoc DL = Op.getDebugLoc();
   9190 
   9191   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
   9192                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
   9193                        false,
   9194                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
   9195 }
   9196 
    9197 // getTargetVShiftNode - Handle vector element shifts where the shift amount
    9198 // may or may not be a constant. Takes the immediate shift opcode as input.
   9199 static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
   9200                                    SDValue SrcOp, SDValue ShAmt,
   9201                                    SelectionDAG &DAG) {
   9202   assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
   9203 
   9204   if (isa<ConstantSDNode>(ShAmt)) {
   9205     switch (Opc) {
   9206       default: llvm_unreachable("Unknown target vector shift node");
   9207       case X86ISD::VSHLI:
   9208       case X86ISD::VSRLI:
   9209       case X86ISD::VSRAI:
   9210         return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   9211     }
   9212   }
   9213 
   9214   // Change opcode to non-immediate version
   9215   switch (Opc) {
   9216     default: llvm_unreachable("Unknown target vector shift node");
   9217     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
   9218     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
   9219     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   9220   }
   9221 
    9222   // Need to build a vector containing the shift amount.
    9223   // The shift amount is 32 bits, but SSE reads a 64-bit count, so pad with 0.
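           // The non-immediate forms read the count from the low 64 bits of a
           // vector register, so the BUILD_VECTOR below produces
           // <ShAmt, 0, undef, undef> as v4i32, which is then bitcast to the
           // operand type.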
   9224   SDValue ShOps[4];
   9225   ShOps[0] = ShAmt;
   9226   ShOps[1] = DAG.getConstant(0, MVT::i32);
   9227   ShOps[2] = DAG.getUNDEF(MVT::i32);
   9228   ShOps[3] = DAG.getUNDEF(MVT::i32);
   9229   ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
   9230   ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
   9231   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   9232 }
   9233 
   9234 SDValue
   9235 X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
   9236   DebugLoc dl = Op.getDebugLoc();
   9237   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   9238   switch (IntNo) {
   9239   default: return SDValue();    // Don't custom lower most intrinsics.
   9240   // Comparison intrinsics.
   9241   case Intrinsic::x86_sse_comieq_ss:
   9242   case Intrinsic::x86_sse_comilt_ss:
   9243   case Intrinsic::x86_sse_comile_ss:
   9244   case Intrinsic::x86_sse_comigt_ss:
   9245   case Intrinsic::x86_sse_comige_ss:
   9246   case Intrinsic::x86_sse_comineq_ss:
   9247   case Intrinsic::x86_sse_ucomieq_ss:
   9248   case Intrinsic::x86_sse_ucomilt_ss:
   9249   case Intrinsic::x86_sse_ucomile_ss:
   9250   case Intrinsic::x86_sse_ucomigt_ss:
   9251   case Intrinsic::x86_sse_ucomige_ss:
   9252   case Intrinsic::x86_sse_ucomineq_ss:
   9253   case Intrinsic::x86_sse2_comieq_sd:
   9254   case Intrinsic::x86_sse2_comilt_sd:
   9255   case Intrinsic::x86_sse2_comile_sd:
   9256   case Intrinsic::x86_sse2_comigt_sd:
   9257   case Intrinsic::x86_sse2_comige_sd:
   9258   case Intrinsic::x86_sse2_comineq_sd:
   9259   case Intrinsic::x86_sse2_ucomieq_sd:
   9260   case Intrinsic::x86_sse2_ucomilt_sd:
   9261   case Intrinsic::x86_sse2_ucomile_sd:
   9262   case Intrinsic::x86_sse2_ucomigt_sd:
   9263   case Intrinsic::x86_sse2_ucomige_sd:
   9264   case Intrinsic::x86_sse2_ucomineq_sd: {
   9265     unsigned Opc = 0;
   9266     ISD::CondCode CC = ISD::SETCC_INVALID;
   9267     switch (IntNo) {
   9268     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   9269     case Intrinsic::x86_sse_comieq_ss:
   9270     case Intrinsic::x86_sse2_comieq_sd:
   9271       Opc = X86ISD::COMI;
   9272       CC = ISD::SETEQ;
   9273       break;
   9274     case Intrinsic::x86_sse_comilt_ss:
   9275     case Intrinsic::x86_sse2_comilt_sd:
   9276       Opc = X86ISD::COMI;
   9277       CC = ISD::SETLT;
   9278       break;
   9279     case Intrinsic::x86_sse_comile_ss:
   9280     case Intrinsic::x86_sse2_comile_sd:
   9281       Opc = X86ISD::COMI;
   9282       CC = ISD::SETLE;
   9283       break;
   9284     case Intrinsic::x86_sse_comigt_ss:
   9285     case Intrinsic::x86_sse2_comigt_sd:
   9286       Opc = X86ISD::COMI;
   9287       CC = ISD::SETGT;
   9288       break;
   9289     case Intrinsic::x86_sse_comige_ss:
   9290     case Intrinsic::x86_sse2_comige_sd:
   9291       Opc = X86ISD::COMI;
   9292       CC = ISD::SETGE;
   9293       break;
   9294     case Intrinsic::x86_sse_comineq_ss:
   9295     case Intrinsic::x86_sse2_comineq_sd:
   9296       Opc = X86ISD::COMI;
   9297       CC = ISD::SETNE;
   9298       break;
   9299     case Intrinsic::x86_sse_ucomieq_ss:
   9300     case Intrinsic::x86_sse2_ucomieq_sd:
   9301       Opc = X86ISD::UCOMI;
   9302       CC = ISD::SETEQ;
   9303       break;
   9304     case Intrinsic::x86_sse_ucomilt_ss:
   9305     case Intrinsic::x86_sse2_ucomilt_sd:
   9306       Opc = X86ISD::UCOMI;
   9307       CC = ISD::SETLT;
   9308       break;
   9309     case Intrinsic::x86_sse_ucomile_ss:
   9310     case Intrinsic::x86_sse2_ucomile_sd:
   9311       Opc = X86ISD::UCOMI;
   9312       CC = ISD::SETLE;
   9313       break;
   9314     case Intrinsic::x86_sse_ucomigt_ss:
   9315     case Intrinsic::x86_sse2_ucomigt_sd:
   9316       Opc = X86ISD::UCOMI;
   9317       CC = ISD::SETGT;
   9318       break;
   9319     case Intrinsic::x86_sse_ucomige_ss:
   9320     case Intrinsic::x86_sse2_ucomige_sd:
   9321       Opc = X86ISD::UCOMI;
   9322       CC = ISD::SETGE;
   9323       break;
   9324     case Intrinsic::x86_sse_ucomineq_ss:
   9325     case Intrinsic::x86_sse2_ucomineq_sd:
   9326       Opc = X86ISD::UCOMI;
   9327       CC = ISD::SETNE;
   9328       break;
   9329     }
   9330 
   9331     SDValue LHS = Op.getOperand(1);
   9332     SDValue RHS = Op.getOperand(2);
   9333     unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
   9334     assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
   9335     SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
   9336     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   9337                                 DAG.getConstant(X86CC, MVT::i8), Cond);
   9338     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   9339   }
   9340   // XOP comparison intrinsics
   9341   case Intrinsic::x86_xop_vpcomltb:
   9342   case Intrinsic::x86_xop_vpcomltw:
   9343   case Intrinsic::x86_xop_vpcomltd:
   9344   case Intrinsic::x86_xop_vpcomltq:
   9345   case Intrinsic::x86_xop_vpcomltub:
   9346   case Intrinsic::x86_xop_vpcomltuw:
   9347   case Intrinsic::x86_xop_vpcomltud:
   9348   case Intrinsic::x86_xop_vpcomltuq:
   9349   case Intrinsic::x86_xop_vpcomleb:
   9350   case Intrinsic::x86_xop_vpcomlew:
   9351   case Intrinsic::x86_xop_vpcomled:
   9352   case Intrinsic::x86_xop_vpcomleq:
   9353   case Intrinsic::x86_xop_vpcomleub:
   9354   case Intrinsic::x86_xop_vpcomleuw:
   9355   case Intrinsic::x86_xop_vpcomleud:
   9356   case Intrinsic::x86_xop_vpcomleuq:
   9357   case Intrinsic::x86_xop_vpcomgtb:
   9358   case Intrinsic::x86_xop_vpcomgtw:
   9359   case Intrinsic::x86_xop_vpcomgtd:
   9360   case Intrinsic::x86_xop_vpcomgtq:
   9361   case Intrinsic::x86_xop_vpcomgtub:
   9362   case Intrinsic::x86_xop_vpcomgtuw:
   9363   case Intrinsic::x86_xop_vpcomgtud:
   9364   case Intrinsic::x86_xop_vpcomgtuq:
   9365   case Intrinsic::x86_xop_vpcomgeb:
   9366   case Intrinsic::x86_xop_vpcomgew:
   9367   case Intrinsic::x86_xop_vpcomged:
   9368   case Intrinsic::x86_xop_vpcomgeq:
   9369   case Intrinsic::x86_xop_vpcomgeub:
   9370   case Intrinsic::x86_xop_vpcomgeuw:
   9371   case Intrinsic::x86_xop_vpcomgeud:
   9372   case Intrinsic::x86_xop_vpcomgeuq:
   9373   case Intrinsic::x86_xop_vpcomeqb:
   9374   case Intrinsic::x86_xop_vpcomeqw:
   9375   case Intrinsic::x86_xop_vpcomeqd:
   9376   case Intrinsic::x86_xop_vpcomeqq:
   9377   case Intrinsic::x86_xop_vpcomequb:
   9378   case Intrinsic::x86_xop_vpcomequw:
   9379   case Intrinsic::x86_xop_vpcomequd:
   9380   case Intrinsic::x86_xop_vpcomequq:
   9381   case Intrinsic::x86_xop_vpcomneb:
   9382   case Intrinsic::x86_xop_vpcomnew:
   9383   case Intrinsic::x86_xop_vpcomned:
   9384   case Intrinsic::x86_xop_vpcomneq:
   9385   case Intrinsic::x86_xop_vpcomneub:
   9386   case Intrinsic::x86_xop_vpcomneuw:
   9387   case Intrinsic::x86_xop_vpcomneud:
   9388   case Intrinsic::x86_xop_vpcomneuq:
   9389   case Intrinsic::x86_xop_vpcomfalseb:
   9390   case Intrinsic::x86_xop_vpcomfalsew:
   9391   case Intrinsic::x86_xop_vpcomfalsed:
   9392   case Intrinsic::x86_xop_vpcomfalseq:
   9393   case Intrinsic::x86_xop_vpcomfalseub:
   9394   case Intrinsic::x86_xop_vpcomfalseuw:
   9395   case Intrinsic::x86_xop_vpcomfalseud:
   9396   case Intrinsic::x86_xop_vpcomfalseuq:
   9397   case Intrinsic::x86_xop_vpcomtrueb:
   9398   case Intrinsic::x86_xop_vpcomtruew:
   9399   case Intrinsic::x86_xop_vpcomtrued:
   9400   case Intrinsic::x86_xop_vpcomtrueq:
   9401   case Intrinsic::x86_xop_vpcomtrueub:
   9402   case Intrinsic::x86_xop_vpcomtrueuw:
   9403   case Intrinsic::x86_xop_vpcomtrueud:
   9404   case Intrinsic::x86_xop_vpcomtrueuq: {
   9405     unsigned CC = 0;
   9406     unsigned Opc = 0;
   9407 
   9408     switch (IntNo) {
   9409     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   9410     case Intrinsic::x86_xop_vpcomltb:
   9411     case Intrinsic::x86_xop_vpcomltw:
   9412     case Intrinsic::x86_xop_vpcomltd:
   9413     case Intrinsic::x86_xop_vpcomltq:
   9414       CC = 0;
   9415       Opc = X86ISD::VPCOM;
   9416       break;
   9417     case Intrinsic::x86_xop_vpcomltub:
   9418     case Intrinsic::x86_xop_vpcomltuw:
   9419     case Intrinsic::x86_xop_vpcomltud:
   9420     case Intrinsic::x86_xop_vpcomltuq:
   9421       CC = 0;
   9422       Opc = X86ISD::VPCOMU;
   9423       break;
   9424     case Intrinsic::x86_xop_vpcomleb:
   9425     case Intrinsic::x86_xop_vpcomlew:
   9426     case Intrinsic::x86_xop_vpcomled:
   9427     case Intrinsic::x86_xop_vpcomleq:
   9428       CC = 1;
   9429       Opc = X86ISD::VPCOM;
   9430       break;
   9431     case Intrinsic::x86_xop_vpcomleub:
   9432     case Intrinsic::x86_xop_vpcomleuw:
   9433     case Intrinsic::x86_xop_vpcomleud:
   9434     case Intrinsic::x86_xop_vpcomleuq:
   9435       CC = 1;
   9436       Opc = X86ISD::VPCOMU;
   9437       break;
   9438     case Intrinsic::x86_xop_vpcomgtb:
   9439     case Intrinsic::x86_xop_vpcomgtw:
   9440     case Intrinsic::x86_xop_vpcomgtd:
   9441     case Intrinsic::x86_xop_vpcomgtq:
   9442       CC = 2;
   9443       Opc = X86ISD::VPCOM;
   9444       break;
   9445     case Intrinsic::x86_xop_vpcomgtub:
   9446     case Intrinsic::x86_xop_vpcomgtuw:
   9447     case Intrinsic::x86_xop_vpcomgtud:
   9448     case Intrinsic::x86_xop_vpcomgtuq:
   9449       CC = 2;
   9450       Opc = X86ISD::VPCOMU;
   9451       break;
   9452     case Intrinsic::x86_xop_vpcomgeb:
   9453     case Intrinsic::x86_xop_vpcomgew:
   9454     case Intrinsic::x86_xop_vpcomged:
   9455     case Intrinsic::x86_xop_vpcomgeq:
   9456       CC = 3;
   9457       Opc = X86ISD::VPCOM;
   9458       break;
   9459     case Intrinsic::x86_xop_vpcomgeub:
   9460     case Intrinsic::x86_xop_vpcomgeuw:
   9461     case Intrinsic::x86_xop_vpcomgeud:
   9462     case Intrinsic::x86_xop_vpcomgeuq:
   9463       CC = 3;
   9464       Opc = X86ISD::VPCOMU;
   9465       break;
   9466     case Intrinsic::x86_xop_vpcomeqb:
   9467     case Intrinsic::x86_xop_vpcomeqw:
   9468     case Intrinsic::x86_xop_vpcomeqd:
   9469     case Intrinsic::x86_xop_vpcomeqq:
   9470       CC = 4;
   9471       Opc = X86ISD::VPCOM;
   9472       break;
   9473     case Intrinsic::x86_xop_vpcomequb:
   9474     case Intrinsic::x86_xop_vpcomequw:
   9475     case Intrinsic::x86_xop_vpcomequd:
   9476     case Intrinsic::x86_xop_vpcomequq:
   9477       CC = 4;
   9478       Opc = X86ISD::VPCOMU;
   9479       break;
   9480     case Intrinsic::x86_xop_vpcomneb:
   9481     case Intrinsic::x86_xop_vpcomnew:
   9482     case Intrinsic::x86_xop_vpcomned:
   9483     case Intrinsic::x86_xop_vpcomneq:
   9484       CC = 5;
   9485       Opc = X86ISD::VPCOM;
   9486       break;
   9487     case Intrinsic::x86_xop_vpcomneub:
   9488     case Intrinsic::x86_xop_vpcomneuw:
   9489     case Intrinsic::x86_xop_vpcomneud:
   9490     case Intrinsic::x86_xop_vpcomneuq:
   9491       CC = 5;
   9492       Opc = X86ISD::VPCOMU;
   9493       break;
   9494     case Intrinsic::x86_xop_vpcomfalseb:
   9495     case Intrinsic::x86_xop_vpcomfalsew:
   9496     case Intrinsic::x86_xop_vpcomfalsed:
   9497     case Intrinsic::x86_xop_vpcomfalseq:
   9498       CC = 6;
   9499       Opc = X86ISD::VPCOM;
   9500       break;
   9501     case Intrinsic::x86_xop_vpcomfalseub:
   9502     case Intrinsic::x86_xop_vpcomfalseuw:
   9503     case Intrinsic::x86_xop_vpcomfalseud:
   9504     case Intrinsic::x86_xop_vpcomfalseuq:
   9505       CC = 6;
   9506       Opc = X86ISD::VPCOMU;
   9507       break;
   9508     case Intrinsic::x86_xop_vpcomtrueb:
   9509     case Intrinsic::x86_xop_vpcomtruew:
   9510     case Intrinsic::x86_xop_vpcomtrued:
   9511     case Intrinsic::x86_xop_vpcomtrueq:
   9512       CC = 7;
   9513       Opc = X86ISD::VPCOM;
   9514       break;
   9515     case Intrinsic::x86_xop_vpcomtrueub:
   9516     case Intrinsic::x86_xop_vpcomtrueuw:
   9517     case Intrinsic::x86_xop_vpcomtrueud:
   9518     case Intrinsic::x86_xop_vpcomtrueuq:
   9519       CC = 7;
   9520       Opc = X86ISD::VPCOMU;
   9521       break;
   9522     }
   9523 
   9524     SDValue LHS = Op.getOperand(1);
   9525     SDValue RHS = Op.getOperand(2);
   9526     return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
   9527                        DAG.getConstant(CC, MVT::i8));
   9528   }
   9529 
   9530   // Arithmetic intrinsics.
   9531   case Intrinsic::x86_sse2_pmulu_dq:
   9532   case Intrinsic::x86_avx2_pmulu_dq:
   9533     return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
   9534                        Op.getOperand(1), Op.getOperand(2));
   9535   case Intrinsic::x86_sse3_hadd_ps:
   9536   case Intrinsic::x86_sse3_hadd_pd:
   9537   case Intrinsic::x86_avx_hadd_ps_256:
   9538   case Intrinsic::x86_avx_hadd_pd_256:
   9539     return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
   9540                        Op.getOperand(1), Op.getOperand(2));
   9541   case Intrinsic::x86_sse3_hsub_ps:
   9542   case Intrinsic::x86_sse3_hsub_pd:
   9543   case Intrinsic::x86_avx_hsub_ps_256:
   9544   case Intrinsic::x86_avx_hsub_pd_256:
   9545     return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
   9546                        Op.getOperand(1), Op.getOperand(2));
   9547   case Intrinsic::x86_ssse3_phadd_w_128:
   9548   case Intrinsic::x86_ssse3_phadd_d_128:
   9549   case Intrinsic::x86_avx2_phadd_w:
   9550   case Intrinsic::x86_avx2_phadd_d:
   9551     return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(),
   9552                        Op.getOperand(1), Op.getOperand(2));
   9553   case Intrinsic::x86_ssse3_phsub_w_128:
   9554   case Intrinsic::x86_ssse3_phsub_d_128:
   9555   case Intrinsic::x86_avx2_phsub_w:
   9556   case Intrinsic::x86_avx2_phsub_d:
   9557     return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(),
   9558                        Op.getOperand(1), Op.getOperand(2));
   9559   case Intrinsic::x86_avx2_psllv_d:
   9560   case Intrinsic::x86_avx2_psllv_q:
   9561   case Intrinsic::x86_avx2_psllv_d_256:
   9562   case Intrinsic::x86_avx2_psllv_q_256:
   9563     return DAG.getNode(ISD::SHL, dl, Op.getValueType(),
   9564                       Op.getOperand(1), Op.getOperand(2));
   9565   case Intrinsic::x86_avx2_psrlv_d:
   9566   case Intrinsic::x86_avx2_psrlv_q:
   9567   case Intrinsic::x86_avx2_psrlv_d_256:
   9568   case Intrinsic::x86_avx2_psrlv_q_256:
   9569     return DAG.getNode(ISD::SRL, dl, Op.getValueType(),
   9570                       Op.getOperand(1), Op.getOperand(2));
   9571   case Intrinsic::x86_avx2_psrav_d:
   9572   case Intrinsic::x86_avx2_psrav_d_256:
   9573     return DAG.getNode(ISD::SRA, dl, Op.getValueType(),