      1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #define DEBUG_TYPE "x86-isel"
     16 #include "X86ISelLowering.h"
     17 #include "X86.h"
     18 #include "X86InstrBuilder.h"
     19 #include "X86TargetMachine.h"
     20 #include "X86TargetObjectFile.h"
     21 #include "Utils/X86ShuffleDecode.h"
     22 #include "llvm/CallingConv.h"
     23 #include "llvm/Constants.h"
     24 #include "llvm/DerivedTypes.h"
     25 #include "llvm/GlobalAlias.h"
     26 #include "llvm/GlobalVariable.h"
     27 #include "llvm/Function.h"
     28 #include "llvm/Instructions.h"
     29 #include "llvm/Intrinsics.h"
     30 #include "llvm/LLVMContext.h"
     31 #include "llvm/CodeGen/IntrinsicLowering.h"
     32 #include "llvm/CodeGen/MachineFrameInfo.h"
     33 #include "llvm/CodeGen/MachineFunction.h"
     34 #include "llvm/CodeGen/MachineInstrBuilder.h"
     35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     36 #include "llvm/CodeGen/MachineModuleInfo.h"
     37 #include "llvm/CodeGen/MachineRegisterInfo.h"
     38 #include "llvm/MC/MCAsmInfo.h"
     39 #include "llvm/MC/MCContext.h"
     40 #include "llvm/MC/MCExpr.h"
     41 #include "llvm/MC/MCSymbol.h"
     42 #include "llvm/ADT/SmallSet.h"
     43 #include "llvm/ADT/Statistic.h"
     44 #include "llvm/ADT/StringExtras.h"
     45 #include "llvm/ADT/VariadicFunction.h"
     46 #include "llvm/Support/CallSite.h"
     47 #include "llvm/Support/Debug.h"
     48 #include "llvm/Support/ErrorHandling.h"
     49 #include "llvm/Support/MathExtras.h"
     50 #include "llvm/Target/TargetOptions.h"
     51 #include <bitset>
     52 #include <cctype>
     53 using namespace llvm;
     54 
     55 STATISTIC(NumTailCalls, "Number of tail calls");
     56 
     57 // Forward declarations.
     58 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
     59                        SDValue V2);
     60 
     61 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
     62 /// sets things up to match to an AVX VEXTRACTF128 instruction or a
     63 /// simple subregister reference.  Idx is an index in the 128 bits we
     64 /// want.  It need not be aligned to a 128-bit boundary.  That makes
     65 /// lowering EXTRACT_VECTOR_ELT operations easier.
     66 static SDValue Extract128BitVector(SDValue Vec,
     67                                    SDValue Idx,
     68                                    SelectionDAG &DAG,
     69                                    DebugLoc dl) {
     70   EVT VT = Vec.getValueType();
     71   assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
     72   EVT ElVT = VT.getVectorElementType();
     73   int Factor = VT.getSizeInBits()/128;
     74   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
     75                                   VT.getVectorNumElements()/Factor);
     76 
     77   // Extract from UNDEF is UNDEF.
     78   if (Vec.getOpcode() == ISD::UNDEF)
     79     return DAG.getNode(ISD::UNDEF, dl, ResultVT);
     80 
     81   if (isa<ConstantSDNode>(Idx)) {
     82     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
     83 
     84     // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
     85     // that we can match to VEXTRACTF128.
     86     unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
     87 
     88     // This is the index of the first element of the 128-bit chunk
     89     // we want.
     90     unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
     91                                  * ElemsPerChunk);
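            // Worked example (illustrative): for a 256-bit v8i32 source, ElVT is
            // i32 and ElemsPerChunk is 4, so an IdxVal of 5 normalizes to
            // ((5*32)/128)*4 == 4, i.e. the extract starts at element 4, the
            // first element of the upper 128-bit half.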
     92 
     93     SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
     94     SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
     95                                  VecIdx);
     96 
     97     return Result;
     98   }
     99 
    100   return SDValue();
    101 }
    102 
    103 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
    104 /// sets things up to match to an AVX VINSERTF128 instruction or a
    105 /// simple superregister reference.  Idx is an index in the 128 bits
    106 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
    107 /// lowering INSERT_VECTOR_ELT operations easier.
    108 static SDValue Insert128BitVector(SDValue Result,
    109                                   SDValue Vec,
    110                                   SDValue Idx,
    111                                   SelectionDAG &DAG,
    112                                   DebugLoc dl) {
    113   if (isa<ConstantSDNode>(Idx)) {
    114     EVT VT = Vec.getValueType();
    115     assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
    116 
    117     EVT ElVT = VT.getVectorElementType();
    118     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    119     EVT ResultVT = Result.getValueType();
    120 
    121     // Insert the relevant 128 bits.
    122     unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
    123 
    124     // This is the index of the first element of the 128-bit chunk
    125     // we want.
    126     unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
    127                                  * ElemsPerChunk);
    128 
    129     SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    130     Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
    131                          VecIdx);
    132     return Result;
    133   }
    134 
    135   return SDValue();
    136 }
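        // A common AVX lowering pattern (sketched here purely for illustration)
        // splits a 256-bit vector into its two 128-bit halves with
        // Extract128BitVector, operates on each half, and reassembles the result
        // with Insert128BitVector:
        //
        //   unsigned NumElems = VT.getVectorNumElements();
        //   SDValue Lo = Extract128BitVector(V, DAG.getConstant(0, MVT::i32),
        //                                    DAG, dl);
        //   SDValue Hi = Extract128BitVector(V,
        //                                    DAG.getConstant(NumElems/2, MVT::i32),
        //                                    DAG, dl);
        //   // ... operate on Lo and Hi ...
        //   SDValue Res = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lo,
        //                                    DAG.getConstant(0, MVT::i32), DAG, dl);
        //   Res = Insert128BitVector(Res, Hi,
        //                            DAG.getConstant(NumElems/2, MVT::i32),
        //                            DAG, dl);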
    137 
    138 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
    139   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
    140   bool is64Bit = Subtarget->is64Bit();
    141 
    142   if (Subtarget->isTargetEnvMacho()) {
    143     if (is64Bit)
    144       return new X8664_MachoTargetObjectFile();
    145     return new TargetLoweringObjectFileMachO();
    146   }
    147 
    148   if (Subtarget->isTargetELF())
    149     return new TargetLoweringObjectFileELF();
    150   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    151     return new TargetLoweringObjectFileCOFF();
    152   llvm_unreachable("unknown subtarget type");
    153 }
    154 
    155 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    156   : TargetLowering(TM, createTLOF(TM)) {
    157   Subtarget = &TM.getSubtarget<X86Subtarget>();
    158   X86ScalarSSEf64 = Subtarget->hasSSE2();
    159   X86ScalarSSEf32 = Subtarget->hasSSE1();
    160   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
    161 
    162   RegInfo = TM.getRegisterInfo();
    163   TD = getTargetData();
    164 
    165   // Set up the TargetLowering object.
    166   static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
    167 
    168   // X86 is weird: it always uses i8 for shift amounts and setcc results.
    169   setBooleanContents(ZeroOrOneBooleanContent);
    170   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    171   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
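          // For example, a scalar SETCC produces 0 or 1 in an i8 register, while
          // a vector compare such as PCMPEQD produces all-ones (-1) or all-zeros
          // lanes, which is what the mask-based vector selects expect.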
    172 
    173   // For 64-bit, since we have so many registers, use the ILP scheduler; for
    174   // 32-bit code, use the register-pressure-specific scheduling.
    175   // For 32-bit Atom, use Hybrid (register pressure + latency) scheduling.
    176   if (Subtarget->is64Bit())
    177     setSchedulingPreference(Sched::ILP);
    178   else if (Subtarget->isAtom())
    179     setSchedulingPreference(Sched::Hybrid);
    180   else
    181     setSchedulingPreference(Sched::RegPressure);
    182   setStackPointerRegisterToSaveRestore(X86StackPtr);
    183 
    184   if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    185     // Set up Windows compiler runtime calls.
    186     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    187     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    188     setLibcallName(RTLIB::SREM_I64, "_allrem");
    189     setLibcallName(RTLIB::UREM_I64, "_aullrem");
    190     setLibcallName(RTLIB::MUL_I64, "_allmul");
    191     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    192     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    193     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    194     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    195     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
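            // For example, a 64-bit signed divide in 32-bit Windows code becomes
            // a call to _alldiv using the stdcall convention declared above.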
    196 
    197     // The _ftol2 runtime function has an unusual calling conv, which
    198     // is modeled by a special pseudo-instruction.
    199     setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    200     setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    201     setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    202     setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
    203   }
    204 
    205   if (Subtarget->isTargetDarwin()) {
    206     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    207     setUseUnderscoreSetJmp(false);
    208     setUseUnderscoreLongJmp(false);
    209   } else if (Subtarget->isTargetMingw()) {
    210     // The MS runtime is weird: it exports _setjmp, but plain longjmp!
    211     setUseUnderscoreSetJmp(true);
    212     setUseUnderscoreLongJmp(false);
    213   } else {
    214     setUseUnderscoreSetJmp(true);
    215     setUseUnderscoreLongJmp(true);
    216   }
    217 
    218   // Set up the register classes.
    219   addRegisterClass(MVT::i8, X86::GR8RegisterClass);
    220   addRegisterClass(MVT::i16, X86::GR16RegisterClass);
    221   addRegisterClass(MVT::i32, X86::GR32RegisterClass);
    222   if (Subtarget->is64Bit())
    223     addRegisterClass(MVT::i64, X86::GR64RegisterClass);
    224 
    225   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
    226 
    227   // We don't accept any truncstore of integer registers.
    228   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    229   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    230   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    231   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    232   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
    233   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
    234 
    235   // SETOEQ and SETUNE require checking two conditions.
    236   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
    237   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
    238   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
    239   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
    240   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
    241   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
    242 
    243   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    244   // operation.
    245   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
    246   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
    247   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
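          // This promotion is exact: e.g. an i16 operand is zero-extended to i32
          // and converted along the signed path, which cannot go wrong because
          // the value still fits in 31 bits.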
    248 
    249   if (Subtarget->is64Bit()) {
    250     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    251     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    252   } else if (!TM.Options.UseSoftFloat) {
    253     // We have an algorithm for SSE2->double, and we turn this into a
    254     // 64-bit FILD followed by conditional FADD for other targets.
    255     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    256     // We have an algorithm for SSE2, and we turn this into a 64-bit
    257     // FILD for other targets.
    258     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    259   }
    260 
    261   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    262   // this operation.
    263   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    264   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
    265 
    266   if (!TM.Options.UseSoftFloat) {
    267     // SSE has no i16 to fp conversion, only i32
    268     if (X86ScalarSSEf32) {
    269       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    270       // f32 and f64 cases are Legal, f80 case is not
    271       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    272     } else {
    273       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    274       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    275     }
    276   } else {
    277     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    278     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
    279   }
    280 
    281   // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
    282   // are Legal; f80 is custom lowered.
    283   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    284   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    285 
    286   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
    287   // this operation.
    288   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
    289   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
    290 
    291   if (X86ScalarSSEf32) {
    292     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    293     // f32 and f64 cases are Legal, f80 case is not
    294     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    295   } else {
    296     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    297     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    298   }
    299 
    300   // Handle FP_TO_UINT by promoting the destination to a larger signed
    301   // conversion.
    302   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
    303   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
    304   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
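          // For example, FP_TO_UINT producing i16 is lowered as FP_TO_SINT to i32
          // followed by a truncate; the two agree for every value that fits in 16
          // unsigned bits.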
    305 
    306   if (Subtarget->is64Bit()) {
    307     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    308     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
    309   } else if (!TM.Options.UseSoftFloat) {
    310     // Since AVX is a superset of SSE3, only check for SSE here.
    311     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
    312       // Expand FP_TO_UINT into a select.
    313       // FIXME: We would like to use a Custom expander here eventually to do
    314       // the optimal thing for SSE vs. the default expansion in the legalizer.
    315       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    316     else
    317       // With SSE3 we can use fisttpll to convert to a signed i64; without
    318       // SSE3, we're stuck with a fistpll.
    319       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    320   }
    321 
    322   if (isTargetFTOL()) {
    323     // Use the _ftol2 runtime function, which has a pseudo-instruction
    324     // to handle its weird calling convention.
    325     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
    326   }
    327 
    328   // TODO: when we have SSE, these could be more efficient by using movd/movq.
    329   if (!X86ScalarSSEf64) {
    330     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    331     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    332     if (Subtarget->is64Bit()) {
    333       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
    334       // Without SSE, i64->f64 goes through memory.
    335       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    336     }
    337   }
    338 
    339   // Scalar integer divide and remainder are lowered to use operations that
    340   // produce two results, to match the available instructions. This exposes
    341   // the two-result form to trivial CSE, which is able to combine x/y and x%y
    342   // into a single instruction.
    343   //
    344   // Scalar integer multiply-high is also lowered to use two-result
    345   // operations, to match the available instructions. However, plain multiply
    346   // (low) operations are left as Legal, as there are single-result
    347   // instructions for this in x86. Using the two-result multiply instructions
    348   // when both high and low results are needed must be arranged by dagcombine.
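          // For example, an i32 x/y and x%y over the same operands both map onto
          // one IDIV, which leaves the quotient in EAX and the remainder in EDX.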
    349   for (unsigned i = 0, e = 4; i != e; ++i) {
    350     MVT VT = IntVTs[i];
    351     setOperationAction(ISD::MULHS, VT, Expand);
    352     setOperationAction(ISD::MULHU, VT, Expand);
    353     setOperationAction(ISD::SDIV, VT, Expand);
    354     setOperationAction(ISD::UDIV, VT, Expand);
    355     setOperationAction(ISD::SREM, VT, Expand);
    356     setOperationAction(ISD::UREM, VT, Expand);
    357     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    358     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    359     setOperationAction(ISD::ADDC, VT, Custom);
    360     setOperationAction(ISD::ADDE, VT, Custom);
    361     setOperationAction(ISD::SUBC, VT, Custom);
    362     setOperationAction(ISD::SUBE, VT, Custom);
    363   }
    364 
    365   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
    366   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
    367   setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
    368   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
    369   if (Subtarget->is64Bit())
    370     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    371   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
    372   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
    373   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
    374   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
    375   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
    376   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
    377   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    378   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
    379 
    380   // Promote the i8 variants and force them up to i32, which has a shorter
    381   // encoding.
    382   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
    383   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
    384   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
    385   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
    386   if (Subtarget->hasBMI()) {
    387     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    388     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    389     if (Subtarget->is64Bit())
    390       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
    391   } else {
    392     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    393     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    394     if (Subtarget->is64Bit())
    395       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    396   }
    397 
    398   if (Subtarget->hasLZCNT()) {
    399     // When promoting the i8 variants, force them to i32 for a shorter
    400     // encoding.
    401     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    402     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    403     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    404     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    405     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    406     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    407     if (Subtarget->is64Bit())
    408       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
    409   } else {
    410     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    411     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    412     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    413     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    414     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    415     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    416     if (Subtarget->is64Bit()) {
    417       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
    418       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    419     }
    420   }
    421 
    422   if (Subtarget->hasPOPCNT()) {
    423     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    424   } else {
    425     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    426     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    427     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    428     if (Subtarget->is64Bit())
    429       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    430   }
    431 
    432   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    433   setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
    434 
    435   // These should be promoted to a larger select which is supported.
    436   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
    437   // X86 wants to expand cmov itself.
    438   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
    439   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
    440   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
    441   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
    442   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
    443   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
    444   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
    445   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
    446   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
    447   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
    448   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
    449   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
    450   if (Subtarget->is64Bit()) {
    451     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    452     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
    453   }
    454   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
    455 
    456   // Darwin ABI issue.
    457   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
    458   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
    459   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
    460   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
    461   if (Subtarget->is64Bit())
    462     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
    463   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
    464   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
    465   if (Subtarget->is64Bit()) {
    466     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    467     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    468     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    469     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    470     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
    471   }
    472   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
    473   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
    474   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
    475   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
    476   if (Subtarget->is64Bit()) {
    477     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    478     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    479     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
    480   }
    481 
    482   if (Subtarget->hasSSE1())
    483     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
    484 
    485   setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
    486   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
    487 
    488   // On X86 and X86-64, atomic operations are lowered to locked instructions.
    489   // Locked instructions, in turn, have implicit fence semantics (all memory
    490   // operations are flushed before issuing the locked instruction, and they
    491   // are not buffered), so we can fold away the common pattern of
    492   // fence-atomic-fence.
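          // For example, "fence; lock'd atomic op; fence" collapses to just the
          // LOCK'd instruction, since the LOCK prefix already orders the
          // surrounding memory operations.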
    493   setShouldFoldAtomicFences(true);
    494 
    495   // Expand certain atomics
    496   for (unsigned i = 0, e = 4; i != e; ++i) {
    497     MVT VT = IntVTs[i];
    498     setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    499     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    500     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    501   }
    502 
    503   if (!Subtarget->is64Bit()) {
    504     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    505     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    506     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    507     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    508     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    509     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    510     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    511     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    512   }
    513 
    514   if (Subtarget->hasCmpxchg16b()) {
    515     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
    516   }
    517 
    518   // FIXME - use subtarget debug flags
    519   if (!Subtarget->isTargetDarwin() &&
    520       !Subtarget->isTargetELF() &&
    521       !Subtarget->isTargetCygMing()) {
    522     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
    523   }
    524 
    525   setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
    526   setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
    527   setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
    528   setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
    529   if (Subtarget->is64Bit()) {
    530     setExceptionPointerRegister(X86::RAX);
    531     setExceptionSelectorRegister(X86::RDX);
    532   } else {
    533     setExceptionPointerRegister(X86::EAX);
    534     setExceptionSelectorRegister(X86::EDX);
    535   }
    536   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
    537   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
    538 
    539   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
    540   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
    541 
    542   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    543 
    544   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    545   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    546   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    547   if (Subtarget->is64Bit()) {
    548     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    549     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
    550   } else {
    551     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    552     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
    553   }
    554 
    555   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    556   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
    557 
    558   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    559     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    560                        MVT::i64 : MVT::i32, Custom);
    561   else if (TM.Options.EnableSegmentedStacks)
    562     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    563                        MVT::i64 : MVT::i32, Custom);
    564   else
    565     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    566                        MVT::i64 : MVT::i32, Expand);
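          // (Windows targets must probe the stack for large dynamic allocations
          // and segmented stacks need a runtime call, so both cases go through
          // custom lowering; the default expansion is a plain stack-pointer
          // adjustment.)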
    567 
    568   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    569     // f32 and f64 use SSE.
    570     // Set up the FP register classes.
    571     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    572     addRegisterClass(MVT::f64, X86::FR64RegisterClass);
    573 
    574     // Use ANDPD to simulate FABS.
    575     setOperationAction(ISD::FABS , MVT::f64, Custom);
    576     setOperationAction(ISD::FABS , MVT::f32, Custom);
    577 
    578     // Use XORP to simulate FNEG.
    579     setOperationAction(ISD::FNEG , MVT::f64, Custom);
    580     setOperationAction(ISD::FNEG , MVT::f32, Custom);
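            // (Both are bit tricks: FABS clears the sign bit by ANDing with a
            // sign-mask constant, and FNEG flips it by XORing with the sign bit.)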
    581 
    582     // Use ANDPD and ORPD to simulate FCOPYSIGN.
    583     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    584     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    585 
    586     // Lower this to FGETSIGNx86 plus an AND.
    587     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    588     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    589 
    590     // We don't support sin/cos/fmod
    591     setOperationAction(ISD::FSIN , MVT::f64, Expand);
    592     setOperationAction(ISD::FCOS , MVT::f64, Expand);
    593     setOperationAction(ISD::FSIN , MVT::f32, Expand);
    594     setOperationAction(ISD::FCOS , MVT::f32, Expand);
    595 
    596     // Expand FP immediates into loads from the stack, except for the special
    597     // cases we handle.
    598     addLegalFPImmediate(APFloat(+0.0)); // xorpd
    599     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    600   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    601     // Use SSE for f32, x87 for f64.
    602     // Set up the FP register classes.
    603     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    604     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    605 
    606     // Use ANDPS to simulate FABS.
    607     setOperationAction(ISD::FABS , MVT::f32, Custom);
    608 
    609     // Use XORP to simulate FNEG.
    610     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    611 
    612     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    613 
    614     // Use ANDPS and ORPS to simulate FCOPYSIGN.
    615     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    616     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    617 
    618     // We don't support sin/cos/fmod
    619     setOperationAction(ISD::FSIN , MVT::f32, Expand);
    620     setOperationAction(ISD::FCOS , MVT::f32, Expand);
    621 
    622     // Special cases we handle for FP constants.
    623     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    624     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    625     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    626     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    627     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    628 
    629     if (!TM.Options.UnsafeFPMath) {
    630       setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
    631       setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
    632     }
    633   } else if (!TM.Options.UseSoftFloat) {
    634     // f32 and f64 in x87.
    635     // Set up the FP register classes.
    636     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    637     addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
    638 
    639     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    640     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    641     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    642     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    643 
    644     if (!TM.Options.UnsafeFPMath) {
    645       setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
    646       setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
    647     }
    648     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    649     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    650     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    651     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    652     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    653     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    654     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    655     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    656   }
    657 
    658   // We don't support FMA.
    659   setOperationAction(ISD::FMA, MVT::f64, Expand);
    660   setOperationAction(ISD::FMA, MVT::f32, Expand);
    661 
    662   // Long double always uses X87.
    663   if (!TM.Options.UseSoftFloat) {
    664     addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    665     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    666     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    667     {
    668       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    669       addLegalFPImmediate(TmpFlt);  // FLD0
    670       TmpFlt.changeSign();
    671       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
    672 
    673       bool ignored;
    674       APFloat TmpFlt2(+1.0);
    675       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
    676                       &ignored);
    677       addLegalFPImmediate(TmpFlt2);  // FLD1
    678       TmpFlt2.changeSign();
    679       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    680     }
    681 
    682     if (!TM.Options.UnsafeFPMath) {
    683       setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
    684       setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
    685     }
    686 
    687     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    688     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    689     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    690     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    691     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    692     setOperationAction(ISD::FMA, MVT::f80, Expand);
    693   }
    694 
    695   // Always use a library call for pow.
    696   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
    697   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    698   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
    699 
    700   setOperationAction(ISD::FLOG, MVT::f80, Expand);
    701   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
    702   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    703   setOperationAction(ISD::FEXP, MVT::f80, Expand);
    704   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
    705 
    706   // First set operation action for all vector types to either promote
    707   // (for widening) or expand (for scalarization). Then we will selectively
    708   // turn on ones that can be effectively codegen'd.
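          // For example, ADD on v4i32 starts out Expand here and is only flipped
          // to Legal further down once we know SSE2 is available.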
    709   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
    710        VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    711     setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    712     setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    713     setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    714     setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    715     setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    716     setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    717     setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    718     setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    719     setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    720     setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    721     setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    722     setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    723     setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    724     setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    725     setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    726     setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    727     setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    728     setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    729     setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    730     setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    731     setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    732     setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    733     setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    734     setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    735     setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    736     setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    737     setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    738     setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    739     setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    740     setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    741     setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    742     setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    743     setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
    744     setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    745     setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
    746     setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    747     setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    748     setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    749     setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    750     setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    751     setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    752     setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
    753     setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    754     setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    755     setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    756     setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    757     setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    758     setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    759     setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    760     setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    761     setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    762     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
    763     setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
    764     setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    765     setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    766     setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    767     setOperationAction(ISD::VSELECT,  (MVT::SimpleValueType)VT, Expand);
    768     for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
    769          InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
    770       setTruncStoreAction((MVT::SimpleValueType)VT,
    771                           (MVT::SimpleValueType)InnerVT, Expand);
    772     setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    773     setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    774     setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
    775   }
    776 
    777   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
    778   // with -msoft-float, disable use of MMX as well.
    779   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    780     addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
    781     // No operations on x86mmx are supported; everything uses intrinsics.
    782   }
    783 
    784   // MMX-sized vectors (other than x86mmx) are expected to be expanded
    785   // into smaller operations.
    786   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
    787   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
    788   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
    789   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
    790   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
    791   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
    792   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
    793   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
    794   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
    795   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
    796   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
    797   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
    798   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
    799   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
    800   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
    801   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
    802   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
    803   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
    804   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
    805   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
    806   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
    807   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
    808   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
    809   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
    810   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
    811   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
    812   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
    813   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
    814   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
    815 
    816   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    817     addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
    818 
    819     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    820     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    821     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    822     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    823     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    824     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    825     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    826     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    827     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    828     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    829     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    830     setOperationAction(ISD::SETCC,              MVT::v4f32, Custom);
    831   }
    832 
    833   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    834     addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
    835 
    836     // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    837     // registers cannot be used even for integer operations.
    838     addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    839     addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    840     addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    841     addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
    842 
    843     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    844     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    845     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    846     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    847     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    848     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    849     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    850     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    851     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    852     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    853     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    854     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    855     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    856     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    857     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    858     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    859 
    860     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    861     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    862     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    863     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
    864 
    865     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    866     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    867     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    868     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    869     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    870 
    871     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
    872     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
    873     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
    874     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
    875     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
    876 
    877     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    878     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
    879       EVT VT = (MVT::SimpleValueType)i;
    880       // Do not attempt to custom lower non-power-of-2 vectors
    881       if (!isPowerOf2_32(VT.getVectorNumElements()))
    882         continue;
    883       // Do not attempt to custom lower non-128-bit vectors
    884       if (!VT.is128BitVector())
    885         continue;
    886       setOperationAction(ISD::BUILD_VECTOR,
    887                          VT.getSimpleVT().SimpleTy, Custom);
    888       setOperationAction(ISD::VECTOR_SHUFFLE,
    889                          VT.getSimpleVT().SimpleTy, Custom);
    890       setOperationAction(ISD::EXTRACT_VECTOR_ELT,
    891                          VT.getSimpleVT().SimpleTy, Custom);
    892     }
    893 
    894     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    895     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    896     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    897     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    898     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    899     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
    900 
    901     if (Subtarget->is64Bit()) {
    902       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
    903       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    904     }
    905 
    906     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
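            // For example, (and (v4i32 a), b) is bitcast to v2i64, selected there
            // as a single PAND, and bitcast back, so only one pattern is needed.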
    907     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
    908       MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
    909       EVT VT = SVT;
    910 
    911       // Do not attempt to promote non-128-bit vectors
    912       if (!VT.is128BitVector())
    913         continue;
    914 
    915       setOperationAction(ISD::AND,    SVT, Promote);
    916       AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
    917       setOperationAction(ISD::OR,     SVT, Promote);
    918       AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
    919       setOperationAction(ISD::XOR,    SVT, Promote);
    920       AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
    921       setOperationAction(ISD::LOAD,   SVT, Promote);
    922       AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
    923       setOperationAction(ISD::SELECT, SVT, Promote);
    924       AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    925     }
    926 
    927     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    928 
    929     // Custom lower v2i64 and v2f64 selects.
    930     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    931     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    932     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    933     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
    934 
    935     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    936     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    937   }
    938 
    939   if (Subtarget->hasSSE41()) {
    940     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
    941     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
    942     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
    943     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
    944     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
    945     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
    946     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
    947     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
    948     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
    949     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
    950 
    951     // FIXME: Do we need to handle scalar-to-vector here?
    952     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
    953 
    954     setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
    955     setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
    956     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
    957     setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
    958     setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
    959 
    960     // i8 and i16 vectors are custom, because the source register and
    961     // source memory operand types are not the same width.  f32 vectors are
    962     // custom since the immediate controlling the insert encodes additional
    963     // information.
    964     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    965     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    966     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    967     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    968 
    969     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    970     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    971     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    972     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    973 
    974     // FIXME: these should be Legal but that's only for the case where
    975     // the index is constant.  For now custom expand to deal with that.
    976     if (Subtarget->is64Bit()) {
    977       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
    978       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    979     }
    980   }
    981 
    982   if (Subtarget->hasSSE2()) {
    983     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
    984     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
    985 
    986     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
    987     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
    988 
    989     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
    990     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
    991 
    992     if (Subtarget->hasAVX2()) {
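              // AVX2 adds per-element variable shifts (VPSLLV/VPSRLV/VPSRAV), so
              // these become Legal.  There is no variable arithmetic shift for
              // 64-bit elements, which is why v2i64 SRA is not listed here.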
    993       setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
    994       setOperationAction(ISD::SRL,             MVT::v4i32, Legal);
    995 
    996       setOperationAction(ISD::SHL,             MVT::v2i64, Legal);
    997       setOperationAction(ISD::SHL,             MVT::v4i32, Legal);
    998 
    999       setOperationAction(ISD::SRA,             MVT::v4i32, Legal);
   1000     } else {
   1001       setOperationAction(ISD::SRL,             MVT::v2i64, Custom);
   1002       setOperationAction(ISD::SRL,             MVT::v4i32, Custom);
   1003 
   1004       setOperationAction(ISD::SHL,             MVT::v2i64, Custom);
   1005       setOperationAction(ISD::SHL,             MVT::v4i32, Custom);
   1006 
   1007       setOperationAction(ISD::SRA,             MVT::v4i32, Custom);
   1008     }
   1009   }
   1010 
   1011   if (Subtarget->hasSSE42())
   1012     setOperationAction(ISD::SETCC,             MVT::v2i64, Custom);
   1013 
   1014   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
   1015     addRegisterClass(MVT::v32i8,  X86::VR256RegisterClass);
   1016     addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
   1017     addRegisterClass(MVT::v8i32,  X86::VR256RegisterClass);
   1018     addRegisterClass(MVT::v8f32,  X86::VR256RegisterClass);
   1019     addRegisterClass(MVT::v4i64,  X86::VR256RegisterClass);
   1020     addRegisterClass(MVT::v4f64,  X86::VR256RegisterClass);
   1021 
   1022     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
   1023     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
   1024     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
   1025 
   1026     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
   1027     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
   1028     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
   1029     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
   1030     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
   1031     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
   1032 
   1033     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
   1034     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
   1035     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
   1036     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
   1037     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
   1038     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
   1039 
   1040     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
   1041     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1042     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
   1043 
   1044     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4f64,  Custom);
   1045     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i64,  Custom);
   1046     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f32,  Custom);
   1047     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i32,  Custom);
   1048     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i8,  Custom);
   1049     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i16, Custom);
   1050 
   1051     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
   1052     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
   1053 
   1054     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
   1055     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
   1056 
   1057     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
   1058     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
   1059 
   1060     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
   1061     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
   1062     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
   1063     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
   1064 
   1065     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1066     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1067     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1068 
   1069     setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
   1070     setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
   1071     setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
   1072     setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
   1073 
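    // AVX2 provides native 256-bit integer arithmetic and shifts; without it the
    // Custom lowerings below split these operations into two 128-bit halves.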
   1074     if (Subtarget->hasAVX2()) {
   1075       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
   1076       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
   1077       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
   1078       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
   1079 
   1080       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
   1081       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
   1082       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
   1083       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
   1084 
   1085       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1086       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
   1087       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
   1088       // Don't lower v32i8 because there is no 128-bit byte mul
   1089 
   1090       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1091 
   1092       setOperationAction(ISD::SRL,             MVT::v4i64, Legal);
   1093       setOperationAction(ISD::SRL,             MVT::v8i32, Legal);
   1094 
   1095       setOperationAction(ISD::SHL,             MVT::v4i64, Legal);
   1096       setOperationAction(ISD::SHL,             MVT::v8i32, Legal);
   1097 
   1098       setOperationAction(ISD::SRA,             MVT::v8i32, Legal);
   1099     } else {
   1100       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
   1101       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
   1102       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
   1103       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
   1104 
   1105       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
   1106       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
   1107       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
   1108       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
   1109 
   1110       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1111       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
   1112       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
   1113       // Don't lower v32i8 because there is no 128-bit byte mul
   1114 
   1115       setOperationAction(ISD::SRL,             MVT::v4i64, Custom);
   1116       setOperationAction(ISD::SRL,             MVT::v8i32, Custom);
   1117 
   1118       setOperationAction(ISD::SHL,             MVT::v4i64, Custom);
   1119       setOperationAction(ISD::SHL,             MVT::v8i32, Custom);
   1120 
   1121       setOperationAction(ISD::SRA,             MVT::v8i32, Custom);
   1122     }
   1123 
   1124     // Custom lower several nodes for 256-bit types.
   1125     for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
   1126                   i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1127       MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
   1128       EVT VT = SVT;
   1129 
   1130       // Extract subvector is special because the value type
   1131       // (result) is 128-bit but the source is 256-bit wide.
   1132       if (VT.is128BitVector())
   1133         setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
   1134 
   1135       // Do not attempt to custom lower other non-256-bit vectors
   1136       if (!VT.is256BitVector())
   1137         continue;
   1138 
   1139       setOperationAction(ISD::BUILD_VECTOR,       SVT, Custom);
   1140       setOperationAction(ISD::VECTOR_SHUFFLE,     SVT, Custom);
   1141       setOperationAction(ISD::INSERT_VECTOR_ELT,  SVT, Custom);
   1142       setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
   1143       setOperationAction(ISD::SCALAR_TO_VECTOR,   SVT, Custom);
   1144       setOperationAction(ISD::INSERT_SUBVECTOR,   SVT, Custom);
   1145     }
   1146 
   1147     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
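    // (The promotion bitcasts to v4i64, performs the operation there, and
    // bitcasts back, so e.g. a v8i32 AND only needs the v4i64 patterns.)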
   1148     for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
   1149       MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
   1150       EVT VT = SVT;
   1151 
   1152       // Do not attempt to promote non-256-bit vectors
   1153       if (!VT.is256BitVector())
   1154         continue;
   1155 
   1156       setOperationAction(ISD::AND,    SVT, Promote);
   1157       AddPromotedToType (ISD::AND,    SVT, MVT::v4i64);
   1158       setOperationAction(ISD::OR,     SVT, Promote);
   1159       AddPromotedToType (ISD::OR,     SVT, MVT::v4i64);
   1160       setOperationAction(ISD::XOR,    SVT, Promote);
   1161       AddPromotedToType (ISD::XOR,    SVT, MVT::v4i64);
   1162       setOperationAction(ISD::LOAD,   SVT, Promote);
   1163       AddPromotedToType (ISD::LOAD,   SVT, MVT::v4i64);
   1164       setOperationAction(ISD::SELECT, SVT, Promote);
   1165       AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
   1166     }
   1167   }
   1168 
    1169   // SIGN_EXTEND_INREG is legalized based on the type being extended, so
    1170   // handle the expansion for all vector types with custom code.
   1171   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
   1172          VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
   1173     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
   1174                        Custom);
   1175   }
   1176 
   1177   // We want to custom lower some of our intrinsics.
   1178   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1179 
   1180 
   1181   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1182   // handle type legalization for these operations here.
   1183   //
   1184   // FIXME: We really should do custom legalization for addition and
   1185   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1186   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1187   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
   1188     // Add/Sub/Mul with overflow operations are custom lowered.
   1189     MVT VT = IntVTs[i];
   1190     setOperationAction(ISD::SADDO, VT, Custom);
   1191     setOperationAction(ISD::UADDO, VT, Custom);
   1192     setOperationAction(ISD::SSUBO, VT, Custom);
   1193     setOperationAction(ISD::USUBO, VT, Custom);
   1194     setOperationAction(ISD::SMULO, VT, Custom);
   1195     setOperationAction(ISD::UMULO, VT, Custom);
   1196   }
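  // For example, a call to @llvm.sadd.with.overflow.i32 is custom lowered to an
  // X86 add that defines EFLAGS plus a setcc of the overflow condition, rather
  // than going through generic expansion.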
   1197 
   1198   // There are no 8-bit 3-address imul/mul instructions
   1199   setOperationAction(ISD::SMULO, MVT::i8, Expand);
   1200   setOperationAction(ISD::UMULO, MVT::i8, Expand);
   1201 
   1202   if (!Subtarget->is64Bit()) {
    1203     // These libcalls are not available in 32-bit mode.
   1204     setLibcallName(RTLIB::SHL_I128, 0);
   1205     setLibcallName(RTLIB::SRL_I128, 0);
   1206     setLibcallName(RTLIB::SRA_I128, 0);
   1207   }
   1208 
   1209   // We have target-specific dag combine patterns for the following nodes:
   1210   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1211   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1212   setTargetDAGCombine(ISD::VSELECT);
   1213   setTargetDAGCombine(ISD::SELECT);
   1214   setTargetDAGCombine(ISD::SHL);
   1215   setTargetDAGCombine(ISD::SRA);
   1216   setTargetDAGCombine(ISD::SRL);
   1217   setTargetDAGCombine(ISD::OR);
   1218   setTargetDAGCombine(ISD::AND);
   1219   setTargetDAGCombine(ISD::ADD);
   1220   setTargetDAGCombine(ISD::FADD);
   1221   setTargetDAGCombine(ISD::FSUB);
   1222   setTargetDAGCombine(ISD::SUB);
   1223   setTargetDAGCombine(ISD::LOAD);
   1224   setTargetDAGCombine(ISD::STORE);
   1225   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1226   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1227   setTargetDAGCombine(ISD::TRUNCATE);
   1228   setTargetDAGCombine(ISD::SINT_TO_FP);
   1229   if (Subtarget->is64Bit())
   1230     setTargetDAGCombine(ISD::MUL);
   1231   if (Subtarget->hasBMI())
   1232     setTargetDAGCombine(ISD::XOR);
   1233 
   1234   computeRegisterProperties();
   1235 
   1236   // On Darwin, -Os means optimize for size without hurting performance,
    1237   // so do not reduce the limit.
   1238   maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1239   maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
   1240   maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1241   maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1242   maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1243   maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1244   setPrefLoopAlignment(4); // 2^4 bytes.
   1245   benefitFromCodePlacementOpt = true;
   1246 
   1247   setPrefFunctionAlignment(4); // 2^4 bytes.
   1248 }
   1249 
   1250 
   1251 EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
   1252   if (!VT.isVector()) return MVT::i8;
   1253   return VT.changeVectorElementTypeToInteger();
   1254 }
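// For example, a scalar f64 compare yields an i8 result, while a v4f32 vector
// compare yields v4i32: an integer vector with the same number of elements.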
   1255 
   1256 
   1257 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
   1258 /// the desired ByVal argument alignment.
   1259 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1260   if (MaxAlign == 16)
   1261     return;
   1262   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1263     if (VTy->getBitWidth() == 128)
   1264       MaxAlign = 16;
   1265   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1266     unsigned EltAlign = 0;
   1267     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1268     if (EltAlign > MaxAlign)
   1269       MaxAlign = EltAlign;
   1270   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1271     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
   1272       unsigned EltAlign = 0;
   1273       getMaxByValAlign(STy->getElementType(i), EltAlign);
   1274       if (EltAlign > MaxAlign)
   1275         MaxAlign = EltAlign;
   1276       if (MaxAlign == 16)
   1277         break;
   1278     }
   1279   }
   1280   return;
   1281 }
   1282 
   1283 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
   1284 /// function arguments in the caller parameter area. For X86, aggregates
   1285 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1286 /// are at 4-byte boundaries.
   1287 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   1288   if (Subtarget->is64Bit()) {
   1289     // Max of 8 and alignment of type.
   1290     unsigned TyAlign = TD->getABITypeAlignment(Ty);
   1291     if (TyAlign > 8)
   1292       return TyAlign;
   1293     return 8;
   1294   }
   1295 
   1296   unsigned Align = 4;
   1297   if (Subtarget->hasSSE1())
   1298     getMaxByValAlign(Ty, Align);
   1299   return Align;
   1300 }
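// Illustrative example: on 32-bit x86 with SSE, a byval struct containing a
// <4 x float> member is placed at a 16-byte boundary, while a plain {i32, i32}
// struct keeps the default 4-byte alignment; on x86-64 the minimum is 8 bytes.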
   1301 
   1302 /// getOptimalMemOpType - Returns the target specific optimal type for load
   1303 /// and store operations as a result of memset, memcpy, and memmove
    1304 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
    1305 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
    1306 /// against an alignment requirement, probably because the source does not
    1307 /// need to be loaded. If 'IsZeroVal' is true, it is safe to return a
    1308 /// non-scalar-integer type, e.g. an empty string source, a constant, or a
    1309 /// value loaded from memory. 'MemcpyStrSrc' indicates whether the memcpy
    1310 /// source is constant, so it does not need to be loaded.
    1311 ///
   1312 /// It returns EVT::Other if the type should be determined using generic
   1313 /// target-independent logic.
   1314 EVT
   1315 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1316                                        unsigned DstAlign, unsigned SrcAlign,
   1317                                        bool IsZeroVal,
   1318                                        bool MemcpyStrSrc,
   1319                                        MachineFunction &MF) const {
   1320   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
    1321   // Linux.  This is because the stack realignment code can't handle certain
   1322   // cases like PR2962.  This should be removed when PR2962 is fixed.
   1323   const Function *F = MF.getFunction();
   1324   if (IsZeroVal &&
   1325       !F->hasFnAttr(Attribute::NoImplicitFloat)) {
   1326     if (Size >= 16 &&
   1327         (Subtarget->isUnalignedMemAccessFast() ||
   1328          ((DstAlign == 0 || DstAlign >= 16) &&
   1329           (SrcAlign == 0 || SrcAlign >= 16))) &&
   1330         Subtarget->getStackAlignment() >= 16) {
   1331       if (Subtarget->getStackAlignment() >= 32) {
   1332         if (Subtarget->hasAVX2())
   1333           return MVT::v8i32;
   1334         if (Subtarget->hasAVX())
   1335           return MVT::v8f32;
   1336       }
   1337       if (Subtarget->hasSSE2())
   1338         return MVT::v4i32;
   1339       if (Subtarget->hasSSE1())
   1340         return MVT::v4f32;
   1341     } else if (!MemcpyStrSrc && Size >= 8 &&
   1342                !Subtarget->is64Bit() &&
   1343                Subtarget->getStackAlignment() >= 8 &&
   1344                Subtarget->hasSSE2()) {
   1345       // Do not use f64 to lower memcpy if source is string constant. It's
   1346       // better to use i32 to avoid the loads.
   1347       return MVT::f64;
   1348     }
   1349   }
   1350   if (Subtarget->is64Bit() && Size >= 8)
   1351     return MVT::i64;
   1352   return MVT::i32;
   1353 }
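// For instance, a zero-valued memset of 16 bytes or more on an SSE2 target with
// a sufficiently aligned destination comes back as MVT::v4i32 and is expanded
// into 128-bit stores; small or poorly aligned operations fall back to i64/i32.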
   1354 
   1355 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
   1356 /// current function.  The returned value is a member of the
   1357 /// MachineJumpTableInfo::JTEntryKind enum.
   1358 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1359   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1360   // symbol.
   1361   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1362       Subtarget->isPICStyleGOT())
   1363     return MachineJumpTableInfo::EK_Custom32;
   1364 
   1365   // Otherwise, use the normal jump table encoding heuristics.
   1366   return TargetLowering::getJumpTableEncoding();
   1367 }
   1368 
   1369 const MCExpr *
   1370 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   1371                                              const MachineBasicBlock *MBB,
   1372                                              unsigned uid,MCContext &Ctx) const{
   1373   assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1374          Subtarget->isPICStyleGOT());
   1375   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   1376   // entries.
   1377   return MCSymbolRefExpr::Create(MBB->getSymbol(),
   1378                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   1379 }
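// The resulting jump table entries look roughly like ".long .LBB0_7@GOTOFF",
// i.e. 32-bit offsets from the GOT base rather than absolute addresses.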
   1380 
    1381 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
   1382 /// jumptable.
   1383 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1384                                                     SelectionDAG &DAG) const {
   1385   if (!Subtarget->is64Bit())
    1386     // This doesn't have a DebugLoc associated with it, but it is not really
    1387     // the same as a Register.
   1388     return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
   1389   return Table;
   1390 }
   1391 
   1392 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
   1393 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
   1394 /// MCExpr.
   1395 const MCExpr *X86TargetLowering::
   1396 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   1397                              MCContext &Ctx) const {
   1398   // X86-64 uses RIP relative addressing based on the jump table label.
   1399   if (Subtarget->isPICStyleRIPRel())
   1400     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   1401 
   1402   // Otherwise, the reference is relative to the PIC base.
   1403   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
   1404 }
   1405 
    1406 // FIXME: Why is this routine here? Move it to RegInfo!
   1407 std::pair<const TargetRegisterClass*, uint8_t>
   1408 X86TargetLowering::findRepresentativeClass(EVT VT) const{
   1409   const TargetRegisterClass *RRC = 0;
   1410   uint8_t Cost = 1;
   1411   switch (VT.getSimpleVT().SimpleTy) {
   1412   default:
   1413     return TargetLowering::findRepresentativeClass(VT);
   1414   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   1415     RRC = (Subtarget->is64Bit()
   1416            ? X86::GR64RegisterClass : X86::GR32RegisterClass);
   1417     break;
   1418   case MVT::x86mmx:
   1419     RRC = X86::VR64RegisterClass;
   1420     break;
   1421   case MVT::f32: case MVT::f64:
   1422   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   1423   case MVT::v4f32: case MVT::v2f64:
   1424   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   1425   case MVT::v4f64:
   1426     RRC = X86::VR128RegisterClass;
   1427     break;
   1428   }
   1429   return std::make_pair(RRC, Cost);
   1430 }
   1431 
   1432 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   1433                                                unsigned &Offset) const {
   1434   if (!Subtarget->isTargetLinux())
   1435     return false;
   1436 
   1437   if (Subtarget->is64Bit()) {
    1438     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28
   1439     Offset = 0x28;
   1440     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   1441       AddressSpace = 256;
   1442     else
   1443       AddressSpace = 257;
   1444   } else {
   1445     // %gs:0x14 on i386
   1446     Offset = 0x14;
   1447     AddressSpace = 256;
   1448   }
   1449   return true;
   1450 }
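// With these values the stack protector load is emitted as, for example,
// "movq %fs:40, %rax" on x86-64 Linux and "movl %gs:20, %eax" on i386;
// address space 256 selects %gs and 257 selects %fs on x86.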
   1451 
   1452 
   1453 //===----------------------------------------------------------------------===//
   1454 //               Return Value Calling Convention Implementation
   1455 //===----------------------------------------------------------------------===//
   1456 
   1457 #include "X86GenCallingConv.inc"
   1458 
   1459 bool
   1460 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
    1461                                   MachineFunction &MF, bool isVarArg,
    1462                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
    1463                                   LLVMContext &Context) const {
   1464   SmallVector<CCValAssign, 16> RVLocs;
   1465   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1466                  RVLocs, Context);
   1467   return CCInfo.CheckReturn(Outs, RetCC_X86);
   1468 }
   1469 
   1470 SDValue
   1471 X86TargetLowering::LowerReturn(SDValue Chain,
   1472                                CallingConv::ID CallConv, bool isVarArg,
   1473                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1474                                const SmallVectorImpl<SDValue> &OutVals,
   1475                                DebugLoc dl, SelectionDAG &DAG) const {
   1476   MachineFunction &MF = DAG.getMachineFunction();
   1477   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1478 
   1479   SmallVector<CCValAssign, 16> RVLocs;
   1480   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1481                  RVLocs, *DAG.getContext());
   1482   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   1483 
   1484   // Add the regs to the liveout set for the function.
   1485   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
   1486   for (unsigned i = 0; i != RVLocs.size(); ++i)
   1487     if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
   1488       MRI.addLiveOut(RVLocs[i].getLocReg());
   1489 
   1490   SDValue Flag;
   1491 
   1492   SmallVector<SDValue, 6> RetOps;
   1493   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   1494   // Operand #1 = Bytes To Pop
   1495   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
   1496                    MVT::i16));
   1497 
   1498   // Copy the result values into the output registers.
   1499   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1500     CCValAssign &VA = RVLocs[i];
   1501     assert(VA.isRegLoc() && "Can only return in registers!");
   1502     SDValue ValToCopy = OutVals[i];
   1503     EVT ValVT = ValToCopy.getValueType();
   1504 
   1505     // If this is x86-64, and we disabled SSE, we can't return FP values,
    1506     // SSE vectors, or MMX vectors.
   1507     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   1508          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   1509           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
   1510       report_fatal_error("SSE register return with SSE disabled");
   1511     }
   1512     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   1513     // llvm-gcc has never done it right and no one has noticed, so this
   1514     // should be OK for now.
   1515     if (ValVT == MVT::f64 &&
   1516         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
   1517       report_fatal_error("SSE2 register return with SSE2 disabled");
   1518 
   1519     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   1520     // the RET instruction and handled by the FP Stackifier.
   1521     if (VA.getLocReg() == X86::ST0 ||
   1522         VA.getLocReg() == X86::ST1) {
   1523       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   1524       // change the value to the FP stack register class.
   1525       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   1526         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   1527       RetOps.push_back(ValToCopy);
   1528       // Don't emit a copytoreg.
   1529       continue;
   1530     }
   1531 
   1532     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   1533     // which is returned in RAX / RDX.
   1534     if (Subtarget->is64Bit()) {
   1535       if (ValVT == MVT::x86mmx) {
   1536         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   1537           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
   1538           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   1539                                   ValToCopy);
   1540           // If we don't have SSE2 available, convert to v4f32 so the generated
   1541           // register is legal.
   1542           if (!Subtarget->hasSSE2())
   1543             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
   1544         }
   1545       }
   1546     }
   1547 
   1548     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   1549     Flag = Chain.getValue(1);
   1550   }
   1551 
   1552   // The x86-64 ABI for returning structs by value requires that we copy
   1553   // the sret argument into %rax for the return. We saved the argument into
   1554   // a virtual register in the entry block, so now we copy the value out
   1555   // and into %rax.
   1556   if (Subtarget->is64Bit() &&
   1557       DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
   1558     MachineFunction &MF = DAG.getMachineFunction();
   1559     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1560     unsigned Reg = FuncInfo->getSRetReturnReg();
   1561     assert(Reg &&
   1562            "SRetReturnReg should have been set in LowerFormalArguments().");
   1563     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
   1564 
   1565     Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
   1566     Flag = Chain.getValue(1);
   1567 
   1568     // RAX now acts like a return value.
   1569     MRI.addLiveOut(X86::RAX);
   1570   }
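  // For example, for "define void @f(%struct.S* sret %p)" the x86-64 ABI
  // requires the callee to hand the sret pointer back in %rax, which is what
  // the copy above arranges.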
   1571 
   1572   RetOps[0] = Chain;  // Update chain.
   1573 
   1574   // Add the flag if we have it.
   1575   if (Flag.getNode())
   1576     RetOps.push_back(Flag);
   1577 
   1578   return DAG.getNode(X86ISD::RET_FLAG, dl,
   1579                      MVT::Other, &RetOps[0], RetOps.size());
   1580 }
   1581 
   1582 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   1583   if (N->getNumValues() != 1)
   1584     return false;
   1585   if (!N->hasNUsesOfValue(1, 0))
   1586     return false;
   1587 
   1588   SDValue TCChain = Chain;
   1589   SDNode *Copy = *N->use_begin();
   1590   if (Copy->getOpcode() == ISD::CopyToReg) {
   1591     // If the copy has a glue operand, we conservatively assume it isn't safe to
   1592     // perform a tail call.
   1593     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   1594       return false;
   1595     TCChain = Copy->getOperand(0);
   1596   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   1597     return false;
   1598 
   1599   bool HasRet = false;
   1600   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   1601        UI != UE; ++UI) {
   1602     if (UI->getOpcode() != X86ISD::RET_FLAG)
   1603       return false;
   1604     HasRet = true;
   1605   }
   1606 
   1607   if (!HasRet)
   1608     return false;
   1609 
   1610   Chain = TCChain;
   1611   return true;
   1612 }
   1613 
   1614 EVT
   1615 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
   1616                                             ISD::NodeType ExtendKind) const {
   1617   MVT ReturnMVT;
   1618   // TODO: Is this also valid on 32-bit?
   1619   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
   1620     ReturnMVT = MVT::i8;
   1621   else
   1622     ReturnMVT = MVT::i32;
   1623 
   1624   EVT MinVT = getRegisterType(Context, ReturnMVT);
   1625   return VT.bitsLT(MinVT) ? MinVT : VT;
   1626 }
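// For example, an i1 return value that is zero-extended only needs an 8-bit
// register (AL) on x86-64; everything else is widened to at least i32.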
   1627 
   1628 /// LowerCallResult - Lower the result values of a call into the
   1629 /// appropriate copies out of appropriate physical registers.
   1630 ///
   1631 SDValue
   1632 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   1633                                    CallingConv::ID CallConv, bool isVarArg,
   1634                                    const SmallVectorImpl<ISD::InputArg> &Ins,
   1635                                    DebugLoc dl, SelectionDAG &DAG,
   1636                                    SmallVectorImpl<SDValue> &InVals) const {
   1637 
   1638   // Assign locations to each value returned by this call.
   1639   SmallVector<CCValAssign, 16> RVLocs;
   1640   bool Is64Bit = Subtarget->is64Bit();
   1641   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
    1642                  getTargetMachine(), RVLocs, *DAG.getContext());
   1643   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   1644 
   1645   // Copy all of the result registers out of their specified physreg.
   1646   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1647     CCValAssign &VA = RVLocs[i];
   1648     EVT CopyVT = VA.getValVT();
   1649 
   1650     // If this is x86-64, and we disabled SSE, we can't return FP values
   1651     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
   1652         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
   1653       report_fatal_error("SSE register return with SSE disabled");
   1654     }
   1655 
   1656     SDValue Val;
   1657 
   1658     // If this is a call to a function that returns an fp value on the floating
    1659     // point stack, we must guarantee that the value is popped from the stack, so
   1660     // a CopyFromReg is not good enough - the copy instruction may be eliminated
   1661     // if the return value is not used. We use the FpPOP_RETVAL instruction
   1662     // instead.
   1663     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
   1664       // If we prefer to use the value in xmm registers, copy it out as f80 and
   1665       // use a truncate to move it from fp stack reg to xmm reg.
   1666       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
   1667       SDValue Ops[] = { Chain, InFlag };
   1668       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
   1669                                          MVT::Other, MVT::Glue, Ops, 2), 1);
   1670       Val = Chain.getValue(0);
   1671 
   1672       // Round the f80 to the right size, which also moves it to the appropriate
   1673       // xmm register.
   1674       if (CopyVT != VA.getValVT())
   1675         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   1676                           // This truncation won't change the value.
   1677                           DAG.getIntPtrConstant(1));
   1678     } else {
   1679       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   1680                                  CopyVT, InFlag).getValue(1);
   1681       Val = Chain.getValue(0);
   1682     }
   1683     InFlag = Chain.getValue(2);
   1684     InVals.push_back(Val);
   1685   }
   1686 
   1687   return Chain;
   1688 }
   1689 
   1690 
   1691 //===----------------------------------------------------------------------===//
   1692 //                C & StdCall & Fast Calling Convention implementation
   1693 //===----------------------------------------------------------------------===//
    1694 //  The StdCall calling convention is the standard for many Windows API
    1695 //  routines. It differs from the C calling convention only a little: the
    1696 //  callee cleans up the stack rather than the caller, and symbols are
    1697 //  decorated with an argument-byte-count suffix. It doesn't support any
    1698 //  vector arguments. For info on the fast calling convention, see the Fast
    1699 //  Calling Convention (tail call) implementation in LowerX86_32FastCCCallTo.
   1700 
   1701 /// CallIsStructReturn - Determines whether a call uses struct return
   1702 /// semantics.
   1703 static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   1704   if (Outs.empty())
   1705     return false;
   1706 
   1707   return Outs[0].Flags.isSRet();
   1708 }
   1709 
   1710 /// ArgsAreStructReturn - Determines whether a function uses struct
   1711 /// return semantics.
   1712 static bool
   1713 ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   1714   if (Ins.empty())
   1715     return false;
   1716 
   1717   return Ins[0].Flags.isSRet();
   1718 }
   1719 
   1720 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
   1721 /// by "Src" to address "Dst" with size and alignment information specified by
   1722 /// the specific parameter attribute. The copy will be passed as a byval
   1723 /// function parameter.
   1724 static SDValue
   1725 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   1726                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
   1727                           DebugLoc dl) {
   1728   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   1729 
   1730   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   1731                        /*isVolatile*/false, /*AlwaysInline=*/true,
   1732                        MachinePointerInfo(), MachinePointerInfo());
   1733 }
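// Because AlwaysInline is set, the copy is always expanded inline into loads
// and stores, so passing a byval aggregate never introduces a memcpy libcall.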
   1734 
   1735 /// IsTailCallConvention - Return true if the calling convention is one that
   1736 /// supports tail call optimization.
   1737 static bool IsTailCallConvention(CallingConv::ID CC) {
   1738   return (CC == CallingConv::Fast || CC == CallingConv::GHC);
   1739 }
   1740 
   1741 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   1742   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
   1743     return false;
   1744 
   1745   CallSite CS(CI);
   1746   CallingConv::ID CalleeCC = CS.getCallingConv();
   1747   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
   1748     return false;
   1749 
   1750   return true;
   1751 }
   1752 
   1753 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
   1754 /// a tailcall target by changing its ABI.
   1755 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
   1756                                    bool GuaranteedTailCallOpt) {
   1757   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
   1758 }
   1759 
   1760 SDValue
   1761 X86TargetLowering::LowerMemArgument(SDValue Chain,
   1762                                     CallingConv::ID CallConv,
   1763                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   1764                                     DebugLoc dl, SelectionDAG &DAG,
   1765                                     const CCValAssign &VA,
   1766                                     MachineFrameInfo *MFI,
   1767                                     unsigned i) const {
   1768   // Create the nodes corresponding to a load from this parameter slot.
   1769   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   1770   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
   1771                               getTargetMachine().Options.GuaranteedTailCallOpt);
   1772   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   1773   EVT ValVT;
   1774 
    1775   // If the value is passed by pointer, we have the address passed instead of
    1776   // the value itself.
   1777   if (VA.getLocInfo() == CCValAssign::Indirect)
   1778     ValVT = VA.getLocVT();
   1779   else
   1780     ValVT = VA.getValVT();
   1781 
   1782   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   1783   // changed with more analysis.
    1784   // In case of tail call optimization, mark all arguments mutable, since they
    1785   // could be overwritten by the lowering of arguments in case of a tail call.
   1786   if (Flags.isByVal()) {
   1787     unsigned Bytes = Flags.getByValSize();
   1788     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   1789     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   1790     return DAG.getFrameIndex(FI, getPointerTy());
   1791   } else {
   1792     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   1793                                     VA.getLocMemOffset(), isImmutable);
   1794     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
   1795     return DAG.getLoad(ValVT, dl, Chain, FIN,
   1796                        MachinePointerInfo::getFixedStack(FI),
   1797                        false, false, false, 0);
   1798   }
   1799 }
   1800 
   1801 SDValue
   1802 X86TargetLowering::LowerFormalArguments(SDValue Chain,
   1803                                         CallingConv::ID CallConv,
   1804                                         bool isVarArg,
   1805                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   1806                                         DebugLoc dl,
   1807                                         SelectionDAG &DAG,
   1808                                         SmallVectorImpl<SDValue> &InVals)
   1809                                           const {
   1810   MachineFunction &MF = DAG.getMachineFunction();
   1811   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1812 
   1813   const Function* Fn = MF.getFunction();
   1814   if (Fn->hasExternalLinkage() &&
   1815       Subtarget->isTargetCygMing() &&
   1816       Fn->getName() == "main")
   1817     FuncInfo->setForceFramePointer(true);
   1818 
   1819   MachineFrameInfo *MFI = MF.getFrameInfo();
   1820   bool Is64Bit = Subtarget->is64Bit();
   1821   bool IsWindows = Subtarget->isTargetWindows();
   1822   bool IsWin64 = Subtarget->isTargetWin64();
   1823 
   1824   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   1825          "Var args not supported with calling convention fastcc or ghc");
   1826 
   1827   // Assign locations to all of the incoming arguments.
   1828   SmallVector<CCValAssign, 16> ArgLocs;
   1829   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1830                  ArgLocs, *DAG.getContext());
   1831 
   1832   // Allocate shadow area for Win64
   1833   if (IsWin64) {
   1834     CCInfo.AllocateStack(32, 8);
   1835   }
   1836 
   1837   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   1838 
   1839   unsigned LastVal = ~0U;
   1840   SDValue ArgValue;
   1841   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   1842     CCValAssign &VA = ArgLocs[i];
   1843     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   1844     // places.
   1845     assert(VA.getValNo() != LastVal &&
   1846            "Don't support value assigned to multiple locs yet");
   1847     (void)LastVal;
   1848     LastVal = VA.getValNo();
   1849 
   1850     if (VA.isRegLoc()) {
   1851       EVT RegVT = VA.getLocVT();
   1852       const TargetRegisterClass *RC;
   1853       if (RegVT == MVT::i32)
   1854         RC = X86::GR32RegisterClass;
   1855       else if (Is64Bit && RegVT == MVT::i64)
   1856         RC = X86::GR64RegisterClass;
   1857       else if (RegVT == MVT::f32)
   1858         RC = X86::FR32RegisterClass;
   1859       else if (RegVT == MVT::f64)
   1860         RC = X86::FR64RegisterClass;
   1861       else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
   1862         RC = X86::VR256RegisterClass;
   1863       else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
   1864         RC = X86::VR128RegisterClass;
   1865       else if (RegVT == MVT::x86mmx)
   1866         RC = X86::VR64RegisterClass;
   1867       else
   1868         llvm_unreachable("Unknown argument type!");
   1869 
   1870       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   1871       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   1872 
   1873       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   1874       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   1875       // right size.
   1876       if (VA.getLocInfo() == CCValAssign::SExt)
   1877         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   1878                                DAG.getValueType(VA.getValVT()));
   1879       else if (VA.getLocInfo() == CCValAssign::ZExt)
   1880         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   1881                                DAG.getValueType(VA.getValVT()));
   1882       else if (VA.getLocInfo() == CCValAssign::BCvt)
   1883         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
   1884 
   1885       if (VA.isExtInLoc()) {
   1886         // Handle MMX values passed in XMM regs.
   1887         if (RegVT.isVector()) {
   1888           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
   1889                                  ArgValue);
   1890         } else
   1891           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   1892       }
   1893     } else {
   1894       assert(VA.isMemLoc());
   1895       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   1896     }
   1897 
    1898     // If the value is passed via a pointer, do a load.
   1899     if (VA.getLocInfo() == CCValAssign::Indirect)
   1900       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   1901                              MachinePointerInfo(), false, false, false, 0);
   1902 
   1903     InVals.push_back(ArgValue);
   1904   }
   1905 
   1906   // The x86-64 ABI for returning structs by value requires that we copy
   1907   // the sret argument into %rax for the return. Save the argument into
   1908   // a virtual register so that we can access it from the return points.
   1909   if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
   1910     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1911     unsigned Reg = FuncInfo->getSRetReturnReg();
   1912     if (!Reg) {
   1913       Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
   1914       FuncInfo->setSRetReturnReg(Reg);
   1915     }
   1916     SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
   1917     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   1918   }
   1919 
   1920   unsigned StackSize = CCInfo.getNextStackOffset();
   1921   // Align stack specially for tail calls.
   1922   if (FuncIsMadeTailCallSafe(CallConv,
   1923                              MF.getTarget().Options.GuaranteedTailCallOpt))
   1924     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   1925 
    1926   // If the function takes a variable number of arguments, make a frame index
    1927   // for the start of the first vararg value... for expansion of llvm.va_start.
   1928   if (isVarArg) {
   1929     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   1930                     CallConv != CallingConv::X86_ThisCall)) {
   1931       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
   1932     }
   1933     if (Is64Bit) {
   1934       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
   1935 
   1936       // FIXME: We should really autogenerate these arrays
   1937       static const uint16_t GPR64ArgRegsWin64[] = {
   1938         X86::RCX, X86::RDX, X86::R8,  X86::R9
   1939       };
   1940       static const uint16_t GPR64ArgRegs64Bit[] = {
   1941         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   1942       };
   1943       static const uint16_t XMMArgRegs64Bit[] = {
   1944         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   1945         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   1946       };
   1947       const uint16_t *GPR64ArgRegs;
   1948       unsigned NumXMMRegs = 0;
   1949 
   1950       if (IsWin64) {
   1951         // The XMM registers which might contain var arg parameters are shadowed
    1952         // in their paired GPR.  So we only need to save the GPRs to their home
   1953         // slots.
   1954         TotalNumIntRegs = 4;
   1955         GPR64ArgRegs = GPR64ArgRegsWin64;
   1956       } else {
   1957         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
   1958         GPR64ArgRegs = GPR64ArgRegs64Bit;
   1959 
   1960         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
   1961                                                 TotalNumXMMRegs);
   1962       }
   1963       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
   1964                                                        TotalNumIntRegs);
   1965 
   1966       bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
   1967       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
   1968              "SSE register cannot be used when SSE is disabled!");
   1969       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
   1970                NoImplicitFloatOps) &&
   1971              "SSE register cannot be used when SSE is disabled!");
   1972       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
   1973           !Subtarget->hasSSE1())
    1974         // SSE is disabled or unavailable, so don't save the XMM argument
    1975         // registers on the stack.
   1976         TotalNumXMMRegs = 0;
   1977 
   1978       if (IsWin64) {
   1979         const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
   1980         // Get to the caller-allocated home save location.  Add 8 to account
   1981         // for the return address.
   1982         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   1983         FuncInfo->setRegSaveFrameIndex(
   1984           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   1985         // Fixup to set vararg frame on shadow area (4 x i64).
   1986         if (NumIntRegs < 4)
   1987           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   1988       } else {
   1989         // For X86-64, if there are vararg parameters that are passed via
   1990         // registers, then we must store them to their spots on the stack so
    1991         // they may be loaded by dereferencing the result of va_next.
   1992         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   1993         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
   1994         FuncInfo->setRegSaveFrameIndex(
   1995           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
   1996                                false));
   1997       }
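      // For the non-Win64 (SysV AMD64) case the register save area created
      // above is 6*8 + 8*16 = 176 bytes: the six GPR argument registers
      // followed by the eight XMM argument registers.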
   1998 
   1999       // Store the integer parameter registers.
   2000       SmallVector<SDValue, 8> MemOps;
   2001       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2002                                         getPointerTy());
   2003       unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2004       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
   2005         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
   2006                                   DAG.getIntPtrConstant(Offset));
   2007         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
   2008                                      X86::GR64RegisterClass);
   2009         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
   2010         SDValue Store =
   2011           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2012                        MachinePointerInfo::getFixedStack(
   2013                          FuncInfo->getRegSaveFrameIndex(), Offset),
   2014                        false, false, 0);
   2015         MemOps.push_back(Store);
   2016         Offset += 8;
   2017       }
   2018 
   2019       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
   2020         // Now store the XMM (fp + vector) parameter registers.
   2021         SmallVector<SDValue, 11> SaveXMMOps;
   2022         SaveXMMOps.push_back(Chain);
   2023 
   2024         unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
   2025         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
   2026         SaveXMMOps.push_back(ALVal);
   2027 
   2028         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2029                                FuncInfo->getRegSaveFrameIndex()));
   2030         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2031                                FuncInfo->getVarArgsFPOffset()));
   2032 
   2033         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
   2034           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
   2035                                        X86::VR128RegisterClass);
   2036           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
   2037           SaveXMMOps.push_back(Val);
   2038         }
   2039         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2040                                      MVT::Other,
   2041                                      &SaveXMMOps[0], SaveXMMOps.size()));
   2042       }
   2043 
   2044       if (!MemOps.empty())
   2045         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2046                             &MemOps[0], MemOps.size());
   2047     }
   2048   }
   2049 
   2050   // Some CCs need callee pop.
   2051   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2052                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2053     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2054   } else {
   2055     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2056     // If this is an sret function, the return should pop the hidden pointer.
   2057     if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
   2058         ArgsAreStructReturn(Ins))
   2059       FuncInfo->setBytesToPopOnReturn(4);
   2060   }
   2061 
   2062   if (!Is64Bit) {
   2063     // RegSaveFrameIndex is X86-64 only.
   2064     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   2065     if (CallConv == CallingConv::X86_FastCall ||
   2066         CallConv == CallingConv::X86_ThisCall)
   2067       // fastcc functions can't have varargs.
   2068       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2069   }
   2070 
   2071   FuncInfo->setArgumentStackSize(StackSize);
   2072 
   2073   return Chain;
   2074 }
   2075 
   2076 SDValue
   2077 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
   2078                                     SDValue StackPtr, SDValue Arg,
   2079                                     DebugLoc dl, SelectionDAG &DAG,
   2080                                     const CCValAssign &VA,
   2081                                     ISD::ArgFlagsTy Flags) const {
   2082   unsigned LocMemOffset = VA.getLocMemOffset();
   2083   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   2084   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
   2085   if (Flags.isByVal())
   2086     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2087 
   2088   return DAG.getStore(Chain, dl, Arg, PtrOff,
   2089                       MachinePointerInfo::getStack(LocMemOffset),
   2090                       false, false, 0);
   2091 }
   2092 
   2093 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
   2094 /// optimization is performed and it is required.
   2095 SDValue
   2096 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   2097                                            SDValue &OutRetAddr, SDValue Chain,
   2098                                            bool IsTailCall, bool Is64Bit,
   2099                                            int FPDiff, DebugLoc dl) const {
   2100   // Adjust the Return address stack slot.
   2101   EVT VT = getPointerTy();
   2102   OutRetAddr = getReturnAddressFrameIndex(DAG);
   2103 
   2104   // Load the "old" Return address.
   2105   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   2106                            false, false, false, 0);
   2107   return SDValue(OutRetAddr.getNode(), 1);
   2108 }
   2109 
   2110 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
   2111 /// optimization is performed and it is required (FPDiff!=0).
   2112 static SDValue
   2113 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
   2114                          SDValue Chain, SDValue RetAddrFrIdx,
   2115                          bool Is64Bit, int FPDiff, DebugLoc dl) {
   2116   // Store the return address to the appropriate stack slot.
   2117   if (!FPDiff) return Chain;
   2118   // Calculate the new stack slot for the return address.
   2119   int SlotSize = Is64Bit ? 8 : 4;
   2120   int NewReturnAddrFI =
   2121     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
   2122   EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
   2123   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
   2124   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   2125                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
   2126                        false, false, 0);
   2127   return Chain;
   2128 }
   2129 
   2130 SDValue
   2131 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   2132                              CallingConv::ID CallConv, bool isVarArg,
   2133                              bool doesNotRet, bool &isTailCall,
   2134                              const SmallVectorImpl<ISD::OutputArg> &Outs,
   2135                              const SmallVectorImpl<SDValue> &OutVals,
   2136                              const SmallVectorImpl<ISD::InputArg> &Ins,
   2137                              DebugLoc dl, SelectionDAG &DAG,
   2138                              SmallVectorImpl<SDValue> &InVals) const {
   2139   MachineFunction &MF = DAG.getMachineFunction();
   2140   bool Is64Bit        = Subtarget->is64Bit();
   2141   bool IsWin64        = Subtarget->isTargetWin64();
   2142   bool IsWindows      = Subtarget->isTargetWindows();
   2143   bool IsStructRet    = CallIsStructReturn(Outs);
   2144   bool IsSibcall      = false;
   2145 
   2146   if (MF.getTarget().Options.DisableTailCalls)
   2147     isTailCall = false;
   2148 
   2149   if (isTailCall) {
   2150     // Check if it's really possible to do a tail call.
   2151     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   2152                     isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
   2153                                                    Outs, OutVals, Ins, DAG);
   2154 
   2155     // Sibcalls are automatically detected tailcalls which do not require
   2156     // ABI changes.
   2157     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   2158       IsSibcall = true;
   2159 
   2160     if (isTailCall)
   2161       ++NumTailCalls;
   2162   }
   2163 
   2164   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2165          "Var args not supported with calling convention fastcc or ghc");
   2166 
   2167   // Analyze operands of the call, assigning locations to each operand.
   2168   SmallVector<CCValAssign, 16> ArgLocs;
   2169   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   2170                  ArgLocs, *DAG.getContext());
   2171 
   2172   // Allocate shadow area for Win64
   2173   if (IsWin64) {
   2174     CCInfo.AllocateStack(32, 8);
   2175   }
   2176 
   2177   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2178 
   2179   // Get a count of how many bytes are to be pushed on the stack.
   2180   unsigned NumBytes = CCInfo.getNextStackOffset();
   2181   if (IsSibcall)
    2182     // This is a sibcall. The memory operands are already available in the
    2183     // caller's incoming argument area (the caller's own caller's stack).
   2184     NumBytes = 0;
   2185   else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
   2186            IsTailCallConvention(CallConv))
   2187     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   2188 
   2189   int FPDiff = 0;
   2190   if (isTailCall && !IsSibcall) {
   2191     // Lower arguments at fp - stackoffset + fpdiff.
   2192     unsigned NumBytesCallerPushed =
   2193       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
   2194     FPDiff = NumBytesCallerPushed - NumBytes;
   2195 
   2196     // Set the delta of movement of the returnaddr stackslot.
    2197     // But only update it if the new delta is less than the previous delta.
   2198     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
   2199       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
   2200   }
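             // Purely illustrative numbers (not taken from a specific target): if the
             // caller was entered with 8 bytes of incoming stack arguments but the tail
             // callee needs 24 bytes of argument space, then FPDiff = 8 - 24 = -16 and
             // the return address slot has to be moved 16 bytes further down before the
             // tail jump (see the stack-layout comment near GetAlignedArgumentStackSize).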
   2201 
   2202   if (!IsSibcall)
   2203     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
   2204 
   2205   SDValue RetAddrFrIdx;
   2206   // Load return address for tail calls.
   2207   if (isTailCall && FPDiff)
   2208     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   2209                                     Is64Bit, FPDiff, dl);
   2210 
   2211   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2212   SmallVector<SDValue, 8> MemOpChains;
   2213   SDValue StackPtr;
   2214 
   2215   // Walk the register/memloc assignments, inserting copies/loads.  In the case
    2216   // of tail call optimization, arguments are handled later.
   2217   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2218     CCValAssign &VA = ArgLocs[i];
   2219     EVT RegVT = VA.getLocVT();
   2220     SDValue Arg = OutVals[i];
   2221     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2222     bool isByVal = Flags.isByVal();
   2223 
   2224     // Promote the value if needed.
   2225     switch (VA.getLocInfo()) {
   2226     default: llvm_unreachable("Unknown loc info!");
   2227     case CCValAssign::Full: break;
   2228     case CCValAssign::SExt:
   2229       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   2230       break;
   2231     case CCValAssign::ZExt:
   2232       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   2233       break;
   2234     case CCValAssign::AExt:
   2235       if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
   2236         // Special case: passing MMX values in XMM registers.
   2237         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
   2238         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   2239         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   2240       } else
   2241         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   2242       break;
   2243     case CCValAssign::BCvt:
   2244       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
   2245       break;
   2246     case CCValAssign::Indirect: {
   2247       // Store the argument.
   2248       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   2249       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   2250       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
   2251                            MachinePointerInfo::getFixedStack(FI),
   2252                            false, false, 0);
   2253       Arg = SpillSlot;
   2254       break;
   2255     }
   2256     }
   2257 
   2258     if (VA.isRegLoc()) {
   2259       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   2260       if (isVarArg && IsWin64) {
   2261         // Win64 ABI requires argument XMM reg to be copied to the corresponding
   2262         // shadow reg if callee is a varargs function.
   2263         unsigned ShadowReg = 0;
   2264         switch (VA.getLocReg()) {
   2265         case X86::XMM0: ShadowReg = X86::RCX; break;
   2266         case X86::XMM1: ShadowReg = X86::RDX; break;
   2267         case X86::XMM2: ShadowReg = X86::R8; break;
   2268         case X86::XMM3: ShadowReg = X86::R9; break;
   2269         }
   2270         if (ShadowReg)
   2271           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   2272       }
   2273     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   2274       assert(VA.isMemLoc());
   2275       if (StackPtr.getNode() == 0)
   2276         StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
   2277       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   2278                                              dl, DAG, VA, Flags));
   2279     }
   2280   }
   2281 
   2282   if (!MemOpChains.empty())
   2283     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2284                         &MemOpChains[0], MemOpChains.size());
   2285 
   2286   // Build a sequence of copy-to-reg nodes chained together with token chain
   2287   // and flag operands which copy the outgoing args into registers.
   2288   SDValue InFlag;
   2289   // Tail call byval lowering might overwrite argument registers so in case of
   2290   // tail call optimization the copies to registers are lowered later.
   2291   if (!isTailCall)
   2292     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2293       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2294                                RegsToPass[i].second, InFlag);
   2295       InFlag = Chain.getValue(1);
   2296     }
   2297 
   2298   if (Subtarget->isPICStyleGOT()) {
    2299     // ELF / PIC requires the GOT pointer to be in the EBX register before
    2300     // making function calls via the PLT.
   2301     if (!isTailCall) {
   2302       Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
   2303                                DAG.getNode(X86ISD::GlobalBaseReg,
   2304                                            DebugLoc(), getPointerTy()),
   2305                                InFlag);
   2306       InFlag = Chain.getValue(1);
   2307     } else {
   2308       // If we are tail calling and generating PIC/GOT style code load the
   2309       // address of the callee into ECX. The value in ecx is used as target of
   2310       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   2311       // for tail calls on PIC/GOT architectures. Normally we would just put the
   2312       // address of GOT into ebx and then call target@PLT. But for tail calls
   2313       // ebx would be restored (since ebx is callee saved) before jumping to the
   2314       // target@PLT.
   2315 
   2316       // Note: The actual moving to ECX is done further down.
   2317       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   2318       if (G && !G->getGlobal()->hasHiddenVisibility() &&
   2319           !G->getGlobal()->hasProtectedVisibility())
   2320         Callee = LowerGlobalAddress(Callee, DAG);
   2321       else if (isa<ExternalSymbolSDNode>(Callee))
   2322         Callee = LowerExternalSymbol(Callee, DAG);
   2323     }
   2324   }
   2325 
   2326   if (Is64Bit && isVarArg && !IsWin64) {
   2327     // From AMD64 ABI document:
   2328     // For calls that may call functions that use varargs or stdargs
   2329     // (prototype-less calls or calls to functions containing ellipsis (...) in
   2330     // the declaration) %al is used as hidden argument to specify the number
   2331     // of SSE registers used. The contents of %al do not need to match exactly
    2332     // the number of registers, but must be an upper bound on the number of SSE
   2333     // registers used and is in the range 0 - 8 inclusive.
   2334 
   2335     // Count the number of XMM registers allocated.
   2336     static const uint16_t XMMArgRegs[] = {
   2337       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2338       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2339     };
   2340     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
   2341     assert((Subtarget->hasSSE1() || !NumXMMRegs)
   2342            && "SSE registers cannot be used when SSE is disabled");
   2343 
   2344     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
   2345                              DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
   2346     InFlag = Chain.getValue(1);
   2347   }
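             // Illustrative lowering (assuming two XMM argument registers end up being
             // allocated, e.g. for a call like printf("%f %f", a, b)):
             //   movb $2, %al
             //   callq printf
             // A callee that does not actually use varargs simply ignores AL.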
   2348 
   2349 
   2350   // For tail calls lower the arguments to the 'real' stack slot.
   2351   if (isTailCall) {
   2352     // Force all the incoming stack arguments to be loaded from the stack
   2353     // before any new outgoing arguments are stored to the stack, because the
   2354     // outgoing stack slots may alias the incoming argument stack slots, and
   2355     // the alias isn't otherwise explicit. This is slightly more conservative
   2356     // than necessary, because it means that each store effectively depends
   2357     // on every argument instead of just those arguments it would clobber.
   2358     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   2359 
   2360     SmallVector<SDValue, 8> MemOpChains2;
   2361     SDValue FIN;
   2362     int FI = 0;
   2363     // Do not flag preceding copytoreg stuff together with the following stuff.
   2364     InFlag = SDValue();
   2365     if (getTargetMachine().Options.GuaranteedTailCallOpt) {
   2366       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2367         CCValAssign &VA = ArgLocs[i];
   2368         if (VA.isRegLoc())
   2369           continue;
   2370         assert(VA.isMemLoc());
   2371         SDValue Arg = OutVals[i];
   2372         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2373         // Create frame index.
   2374         int32_t Offset = VA.getLocMemOffset()+FPDiff;
   2375         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   2376         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   2377         FIN = DAG.getFrameIndex(FI, getPointerTy());
   2378 
   2379         if (Flags.isByVal()) {
   2380           // Copy relative to framepointer.
   2381           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
   2382           if (StackPtr.getNode() == 0)
   2383             StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
   2384                                           getPointerTy());
   2385           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
   2386 
   2387           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   2388                                                            ArgChain,
   2389                                                            Flags, DAG, dl));
   2390         } else {
   2391           // Store relative to framepointer.
   2392           MemOpChains2.push_back(
   2393             DAG.getStore(ArgChain, dl, Arg, FIN,
   2394                          MachinePointerInfo::getFixedStack(FI),
   2395                          false, false, 0));
   2396         }
   2397       }
   2398     }
   2399 
   2400     if (!MemOpChains2.empty())
   2401       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2402                           &MemOpChains2[0], MemOpChains2.size());
   2403 
   2404     // Copy arguments to their registers.
   2405     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2406       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2407                                RegsToPass[i].second, InFlag);
   2408       InFlag = Chain.getValue(1);
   2409     }
    2410     InFlag = SDValue();
   2411 
   2412     // Store the return address to the appropriate stack slot.
   2413     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
   2414                                      FPDiff, dl);
   2415   }
   2416 
   2417   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
   2418     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   2419     // In the 64-bit large code model, we have to make all calls
   2420     // through a register, since the call instruction's 32-bit
   2421     // pc-relative offset may not be large enough to hold the whole
   2422     // address.
   2423   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   2424     // If the callee is a GlobalAddress node (quite common, every direct call
   2425     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   2426     // it.
   2427 
   2428     // We should use extra load for direct calls to dllimported functions in
   2429     // non-JIT mode.
   2430     const GlobalValue *GV = G->getGlobal();
   2431     if (!GV->hasDLLImportLinkage()) {
   2432       unsigned char OpFlags = 0;
   2433       bool ExtraLoad = false;
   2434       unsigned WrapperKind = ISD::DELETED_NODE;
   2435 
   2436       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
    2437       // external symbols must go through the PLT in PIC mode.  If the symbol
   2438       // has hidden or protected visibility, or if it is static or local, then
   2439       // we don't need to use the PLT - we can directly call it.
   2440       if (Subtarget->isTargetELF() &&
   2441           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   2442           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
   2443         OpFlags = X86II::MO_PLT;
   2444       } else if (Subtarget->isPICStyleStubAny() &&
   2445                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
   2446                  (!Subtarget->getTargetTriple().isMacOSX() ||
   2447                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2448         // PC-relative references to external symbols should go through $stub,
   2449         // unless we're building with the leopard linker or later, which
   2450         // automatically synthesizes these stubs.
   2451         OpFlags = X86II::MO_DARWIN_STUB;
   2452       } else if (Subtarget->isPICStyleRIPRel() &&
   2453                  isa<Function>(GV) &&
   2454                  cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
   2455         // If the function is marked as non-lazy, generate an indirect call
   2456         // which loads from the GOT directly. This avoids runtime overhead
   2457         // at the cost of eager binding (and one extra byte of encoding).
   2458         OpFlags = X86II::MO_GOTPCREL;
   2459         WrapperKind = X86ISD::WrapperRIP;
   2460         ExtraLoad = true;
   2461       }
   2462 
   2463       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
   2464                                           G->getOffset(), OpFlags);
   2465 
   2466       // Add a wrapper if needed.
   2467       if (WrapperKind != ISD::DELETED_NODE)
   2468         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
   2469       // Add extra indirection if needed.
   2470       if (ExtraLoad)
   2471         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
   2472                              MachinePointerInfo::getGOT(),
   2473                              false, false, false, 0);
   2474     }
   2475   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   2476     unsigned char OpFlags = 0;
   2477 
   2478     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
   2479     // external symbols should go through the PLT.
   2480     if (Subtarget->isTargetELF() &&
   2481         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
   2482       OpFlags = X86II::MO_PLT;
   2483     } else if (Subtarget->isPICStyleStubAny() &&
   2484                (!Subtarget->getTargetTriple().isMacOSX() ||
   2485                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2486       // PC-relative references to external symbols should go through $stub,
   2487       // unless we're building with the leopard linker or later, which
   2488       // automatically synthesizes these stubs.
   2489       OpFlags = X86II::MO_DARWIN_STUB;
   2490     }
   2491 
   2492     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
   2493                                          OpFlags);
   2494   }
   2495 
   2496   // Returns a chain & a flag for retval copy to use.
   2497   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   2498   SmallVector<SDValue, 8> Ops;
   2499 
   2500   if (!IsSibcall && isTailCall) {
   2501     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
   2502                            DAG.getIntPtrConstant(0, true), InFlag);
   2503     InFlag = Chain.getValue(1);
   2504   }
   2505 
   2506   Ops.push_back(Chain);
   2507   Ops.push_back(Callee);
   2508 
   2509   if (isTailCall)
   2510     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
   2511 
   2512   // Add argument registers to the end of the list so that they are known live
   2513   // into the call.
   2514   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   2515     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   2516                                   RegsToPass[i].second.getValueType()));
   2517 
   2518   // Add an implicit use GOT pointer in EBX.
   2519   if (!isTailCall && Subtarget->isPICStyleGOT())
   2520     Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
   2521 
   2522   // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
   2523   if (Is64Bit && isVarArg && !IsWin64)
   2524     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
   2525 
   2526   // Add a register mask operand representing the call-preserved registers.
   2527   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   2528   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   2529   assert(Mask && "Missing call preserved mask for calling convention");
   2530   Ops.push_back(DAG.getRegisterMask(Mask));
   2531 
   2532   if (InFlag.getNode())
   2533     Ops.push_back(InFlag);
   2534 
   2535   if (isTailCall) {
   2536     // We used to do:
   2537     //// If this is the first return lowered for this function, add the regs
   2538     //// to the liveout set for the function.
   2539     // This isn't right, although it's probably harmless on x86; liveouts
   2540     // should be computed from returns not tail calls.  Consider a void
   2541     // function making a tail call to a function returning int.
   2542     return DAG.getNode(X86ISD::TC_RETURN, dl,
   2543                        NodeTys, &Ops[0], Ops.size());
   2544   }
   2545 
   2546   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
   2547   InFlag = Chain.getValue(1);
   2548 
   2549   // Create the CALLSEQ_END node.
   2550   unsigned NumBytesForCalleeToPush;
   2551   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2552                        getTargetMachine().Options.GuaranteedTailCallOpt))
   2553     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
   2554   else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
   2555            IsStructRet)
   2556     // If this is a call to a struct-return function, the callee
   2557     // pops the hidden struct pointer, so we have to push it back.
   2558     // This is common for Darwin/X86, Linux & Mingw32 targets.
   2559     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   2560     NumBytesForCalleeToPush = 4;
   2561   else
   2562     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
   2563 
   2564   // Returns a flag for retval copy to use.
   2565   if (!IsSibcall) {
   2566     Chain = DAG.getCALLSEQ_END(Chain,
   2567                                DAG.getIntPtrConstant(NumBytes, true),
   2568                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
   2569                                                      true),
   2570                                InFlag);
   2571     InFlag = Chain.getValue(1);
   2572   }
   2573 
   2574   // Handle result values, copying them out of physregs into vregs that we
   2575   // return.
   2576   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   2577                          Ins, dl, DAG, InVals);
   2578 }
   2579 
   2580 
   2581 //===----------------------------------------------------------------------===//
   2582 //                Fast Calling Convention (tail call) implementation
   2583 //===----------------------------------------------------------------------===//
   2584 
    2585 //  Like stdcall, this is a callee-cleans-arguments convention, except that ECX
    2586 //  is reserved for storing the tail-called function's address. Only 2 registers are
   2587 //  free for argument passing (inreg). Tail call optimization is performed
   2588 //  provided:
   2589 //                * tailcallopt is enabled
   2590 //                * caller/callee are fastcc
   2591 //  On X86_64 architecture with GOT-style position independent code only local
   2592 //  (within module) calls are supported at the moment.
    2593 //  To keep the stack aligned according to the platform ABI, the function
    2594 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
    2595 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.)
    2596 //  If a tail-called function (the callee) has more arguments than the caller, the
    2597 //  caller needs to make sure that there is room to move the RETADDR to. This is
    2598 //  achieved by reserving an area the size of the argument delta right after the
    2599 //  original RETADDR, but before the saved framepointer or the spilled registers
   2600 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
   2601 //  stack layout:
   2602 //    arg1
   2603 //    arg2
   2604 //    RETADDR
   2605 //    [ new RETADDR
   2606 //      move area ]
   2607 //    (possible EBP)
   2608 //    ESI
   2609 //    EDI
   2610 //    local1 ..
   2611 
    2612 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
    2613 /// for a 16-byte alignment requirement.
   2614 unsigned
   2615 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   2616                                                SelectionDAG& DAG) const {
   2617   MachineFunction &MF = DAG.getMachineFunction();
   2618   const TargetMachine &TM = MF.getTarget();
   2619   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   2620   unsigned StackAlignment = TFI.getStackAlignment();
   2621   uint64_t AlignMask = StackAlignment - 1;
   2622   int64_t Offset = StackSize;
   2623   uint64_t SlotSize = TD->getPointerSize();
   2624   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
   2625     // Number smaller than 12 so just add the difference.
   2626     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   2627   } else {
   2628     // Mask out lower bits, add stackalignment once plus the 12 bytes.
   2629     Offset = ((~AlignMask) & Offset) + StackAlignment +
   2630       (StackAlignment-SlotSize);
   2631   }
   2632   return Offset;
   2633 }
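         // A worked example of the computation above (assuming a 16-byte stack
         // alignment and a 4-byte slot, i.e. 32-bit mode):
         //   StackSize = 20: 20 & 15 = 4  <= 12, so the result is 20 + (12 - 4) = 28.
         //   StackSize = 30: 30 & 15 = 14 >  12, so the result is (30 & ~15) + 16 + 12 = 44.
         // Both results have the form 16n + 12, which leaves room for the 4-byte RETADDR
         // so the stack is 16-byte aligned again after the call pushes it.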
   2634 
   2635 /// MatchingStackOffset - Return true if the given stack call argument is
   2636 /// already available in the same position (relatively) of the caller's
   2637 /// incoming argument stack.
   2638 static
   2639 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   2640                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   2641                          const X86InstrInfo *TII) {
   2642   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   2643   int FI = INT_MAX;
   2644   if (Arg.getOpcode() == ISD::CopyFromReg) {
   2645     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   2646     if (!TargetRegisterInfo::isVirtualRegister(VR))
   2647       return false;
   2648     MachineInstr *Def = MRI->getVRegDef(VR);
   2649     if (!Def)
   2650       return false;
   2651     if (!Flags.isByVal()) {
   2652       if (!TII->isLoadFromStackSlot(Def, FI))
   2653         return false;
   2654     } else {
   2655       unsigned Opcode = Def->getOpcode();
   2656       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
   2657           Def->getOperand(1).isFI()) {
   2658         FI = Def->getOperand(1).getIndex();
   2659         Bytes = Flags.getByValSize();
   2660       } else
   2661         return false;
   2662     }
   2663   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   2664     if (Flags.isByVal())
   2665       // ByVal argument is passed in as a pointer but it's now being
   2666       // dereferenced. e.g.
   2667       // define @foo(%struct.X* %A) {
   2668       //   tail call @bar(%struct.X* byval %A)
   2669       // }
   2670       return false;
   2671     SDValue Ptr = Ld->getBasePtr();
   2672     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   2673     if (!FINode)
   2674       return false;
   2675     FI = FINode->getIndex();
   2676   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   2677     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   2678     FI = FINode->getIndex();
   2679     Bytes = Flags.getByValSize();
   2680   } else
   2681     return false;
   2682 
   2683   assert(FI != INT_MAX);
   2684   if (!MFI->isFixedObjectIndex(FI))
   2685     return false;
   2686   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
   2687 }
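         // For illustration, the common case this matches is a fixed stack argument
         // that is forwarded unchanged to the tail callee, e.g. (IR sketch, 32-bit
         // mode where the arguments are passed on the stack; names are hypothetical):
         //   define i32 @foo(i32 %a, i32 %b) {
         //     %r = tail call i32 @bar(i32 %a, i32 %b)
         //     ret i32 %r
         //   }
         // Here %a and %b are reloaded from the same fixed frame indexes they already
         // occupy, so no stores into the outgoing argument area are needed.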
   2688 
   2689 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
   2690 /// for tail call optimization. Targets which want to do tail call
   2691 /// optimization should implement this function.
   2692 bool
   2693 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   2694                                                      CallingConv::ID CalleeCC,
   2695                                                      bool isVarArg,
   2696                                                      bool isCalleeStructRet,
   2697                                                      bool isCallerStructRet,
   2698                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
   2699                                     const SmallVectorImpl<SDValue> &OutVals,
   2700                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2701                                                      SelectionDAG& DAG) const {
   2702   if (!IsTailCallConvention(CalleeCC) &&
   2703       CalleeCC != CallingConv::C)
   2704     return false;
   2705 
   2706   // If -tailcallopt is specified, make fastcc functions tail-callable.
   2707   const MachineFunction &MF = DAG.getMachineFunction();
   2708   const Function *CallerF = DAG.getMachineFunction().getFunction();
   2709   CallingConv::ID CallerCC = CallerF->getCallingConv();
   2710   bool CCMatch = CallerCC == CalleeCC;
   2711 
   2712   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
   2713     if (IsTailCallConvention(CalleeCC) && CCMatch)
   2714       return true;
   2715     return false;
   2716   }
   2717 
   2718   // Look for obvious safe cases to perform tail call optimization that do not
   2719   // require ABI changes. This is what gcc calls sibcall.
   2720 
   2721   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   2722   // emit a special epilogue.
   2723   if (RegInfo->needsStackRealignment(MF))
   2724     return false;
   2725 
   2726   // Also avoid sibcall optimization if either caller or callee uses struct
   2727   // return semantics.
   2728   if (isCalleeStructRet || isCallerStructRet)
   2729     return false;
   2730 
   2731   // An stdcall caller is expected to clean up its arguments; the callee
   2732   // isn't going to do that.
   2733   if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
   2734     return false;
   2735 
   2736   // Do not sibcall optimize vararg calls unless all arguments are passed via
   2737   // registers.
   2738   if (isVarArg && !Outs.empty()) {
   2739 
   2740     // Optimizing for varargs on Win64 is unlikely to be safe without
   2741     // additional testing.
   2742     if (Subtarget->isTargetWin64())
   2743       return false;
   2744 
   2745     SmallVector<CCValAssign, 16> ArgLocs;
   2746     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   2747 		   getTargetMachine(), ArgLocs, *DAG.getContext());
   2748 
   2749     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2750     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   2751       if (!ArgLocs[i].isRegLoc())
   2752         return false;
   2753   }
   2754 
   2755   // If the call result is in ST0 / ST1, it needs to be popped off the x87
   2756   // stack.  Therefore, if it's not used by the call it is not safe to optimize
   2757   // this into a sibcall.
   2758   bool Unused = false;
   2759   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   2760     if (!Ins[i].Used) {
   2761       Unused = true;
   2762       break;
   2763     }
   2764   }
   2765   if (Unused) {
   2766     SmallVector<CCValAssign, 16> RVLocs;
   2767     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
   2768 		   getTargetMachine(), RVLocs, *DAG.getContext());
   2769     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2770     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2771       CCValAssign &VA = RVLocs[i];
   2772       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
   2773         return false;
   2774     }
   2775   }
   2776 
   2777   // If the calling conventions do not match, then we'd better make sure the
   2778   // results are returned in the same way as what the caller expects.
   2779   if (!CCMatch) {
   2780     SmallVector<CCValAssign, 16> RVLocs1;
   2781     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
   2782 		    getTargetMachine(), RVLocs1, *DAG.getContext());
   2783     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
   2784 
   2785     SmallVector<CCValAssign, 16> RVLocs2;
   2786     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
   2787 		    getTargetMachine(), RVLocs2, *DAG.getContext());
   2788     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
   2789 
   2790     if (RVLocs1.size() != RVLocs2.size())
   2791       return false;
   2792     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   2793       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   2794         return false;
   2795       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   2796         return false;
   2797       if (RVLocs1[i].isRegLoc()) {
   2798         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   2799           return false;
   2800       } else {
   2801         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   2802           return false;
   2803       }
   2804     }
   2805   }
   2806 
   2807   // If the callee takes no arguments then go on to check the results of the
   2808   // call.
   2809   if (!Outs.empty()) {
   2810     // Check if stack adjustment is needed. For now, do not do this if any
   2811     // argument is passed on the stack.
   2812     SmallVector<CCValAssign, 16> ArgLocs;
   2813     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   2814 		   getTargetMachine(), ArgLocs, *DAG.getContext());
   2815 
   2816     // Allocate shadow area for Win64
   2817     if (Subtarget->isTargetWin64()) {
   2818       CCInfo.AllocateStack(32, 8);
   2819     }
   2820 
   2821     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2822     if (CCInfo.getNextStackOffset()) {
   2823       MachineFunction &MF = DAG.getMachineFunction();
   2824       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
   2825         return false;
   2826 
    2827       // Check whether the arguments are already laid out in the same way as
    2828       // the caller's fixed stack objects.
   2829       MachineFrameInfo *MFI = MF.getFrameInfo();
   2830       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   2831       const X86InstrInfo *TII =
   2832         ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
   2833       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2834         CCValAssign &VA = ArgLocs[i];
   2835         SDValue Arg = OutVals[i];
   2836         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2837         if (VA.getLocInfo() == CCValAssign::Indirect)
   2838           return false;
   2839         if (!VA.isRegLoc()) {
   2840           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   2841                                    MFI, MRI, TII))
   2842             return false;
   2843         }
   2844       }
   2845     }
   2846 
   2847     // If the tailcall address may be in a register, then make sure it's
   2848     // possible to register allocate for it. In 32-bit, the call address can
   2849     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   2850     // callee-saved registers are restored. These happen to be the same
   2851     // registers used to pass 'inreg' arguments so watch out for those.
   2852     if (!Subtarget->is64Bit() &&
   2853         !isa<GlobalAddressSDNode>(Callee) &&
   2854         !isa<ExternalSymbolSDNode>(Callee)) {
   2855       unsigned NumInRegs = 0;
   2856       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2857         CCValAssign &VA = ArgLocs[i];
   2858         if (!VA.isRegLoc())
   2859           continue;
   2860         unsigned Reg = VA.getLocReg();
   2861         switch (Reg) {
   2862         default: break;
   2863         case X86::EAX: case X86::EDX: case X86::ECX:
   2864           if (++NumInRegs == 3)
   2865             return false;
   2866           break;
   2867         }
   2868       }
   2869     }
   2870   }
   2871 
   2872   return true;
   2873 }
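         // A rough illustration of the checks above (C sketch, hypothetical functions):
         //   int caller(int x) { return callee(x); }
         // is usually sibcall-eligible in 32-bit mode: the calling conventions match,
         // neither side uses struct return, and the single argument already sits in a
         // matching fixed stack slot. The same call returning a large struct through a
         // hidden sret pointer would be rejected by the struct-return check above.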
   2874 
   2875 FastISel *
   2876 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
   2877   return X86::createFastISel(funcInfo);
   2878 }
   2879 
   2880 
   2881 //===----------------------------------------------------------------------===//
   2882 //                           Other Lowering Hooks
   2883 //===----------------------------------------------------------------------===//
   2884 
   2885 static bool MayFoldLoad(SDValue Op) {
   2886   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   2887 }
   2888 
   2889 static bool MayFoldIntoStore(SDValue Op) {
   2890   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   2891 }
   2892 
   2893 static bool isTargetShuffle(unsigned Opcode) {
   2894   switch(Opcode) {
   2895   default: return false;
   2896   case X86ISD::PSHUFD:
   2897   case X86ISD::PSHUFHW:
   2898   case X86ISD::PSHUFLW:
   2899   case X86ISD::SHUFP:
   2900   case X86ISD::PALIGN:
   2901   case X86ISD::MOVLHPS:
   2902   case X86ISD::MOVLHPD:
   2903   case X86ISD::MOVHLPS:
   2904   case X86ISD::MOVLPS:
   2905   case X86ISD::MOVLPD:
   2906   case X86ISD::MOVSHDUP:
   2907   case X86ISD::MOVSLDUP:
   2908   case X86ISD::MOVDDUP:
   2909   case X86ISD::MOVSS:
   2910   case X86ISD::MOVSD:
   2911   case X86ISD::UNPCKL:
   2912   case X86ISD::UNPCKH:
   2913   case X86ISD::VPERMILP:
   2914   case X86ISD::VPERM2X128:
   2915     return true;
   2916   }
   2917 }
   2918 
   2919 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   2920                                     SDValue V1, SelectionDAG &DAG) {
   2921   switch(Opc) {
   2922   default: llvm_unreachable("Unknown x86 shuffle node");
   2923   case X86ISD::MOVSHDUP:
   2924   case X86ISD::MOVSLDUP:
   2925   case X86ISD::MOVDDUP:
   2926     return DAG.getNode(Opc, dl, VT, V1);
   2927   }
   2928 }
   2929 
   2930 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   2931                                     SDValue V1, unsigned TargetMask,
   2932                                     SelectionDAG &DAG) {
   2933   switch(Opc) {
   2934   default: llvm_unreachable("Unknown x86 shuffle node");
   2935   case X86ISD::PSHUFD:
   2936   case X86ISD::PSHUFHW:
   2937   case X86ISD::PSHUFLW:
   2938   case X86ISD::VPERMILP:
   2939   case X86ISD::VPERMI:
   2940     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   2941   }
   2942 }
   2943 
   2944 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   2945                                     SDValue V1, SDValue V2, unsigned TargetMask,
   2946                                     SelectionDAG &DAG) {
   2947   switch(Opc) {
   2948   default: llvm_unreachable("Unknown x86 shuffle node");
   2949   case X86ISD::PALIGN:
   2950   case X86ISD::SHUFP:
   2951   case X86ISD::VPERM2X128:
   2952     return DAG.getNode(Opc, dl, VT, V1, V2,
   2953                        DAG.getConstant(TargetMask, MVT::i8));
   2954   }
   2955 }
   2956 
   2957 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   2958                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   2959   switch(Opc) {
   2960   default: llvm_unreachable("Unknown x86 shuffle node");
   2961   case X86ISD::MOVLHPS:
   2962   case X86ISD::MOVLHPD:
   2963   case X86ISD::MOVHLPS:
   2964   case X86ISD::MOVLPS:
   2965   case X86ISD::MOVLPD:
   2966   case X86ISD::MOVSS:
   2967   case X86ISD::MOVSD:
   2968   case X86ISD::UNPCKL:
   2969   case X86ISD::UNPCKH:
   2970     return DAG.getNode(Opc, dl, VT, V1, V2);
   2971   }
   2972 }
   2973 
   2974 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   2975   MachineFunction &MF = DAG.getMachineFunction();
   2976   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2977   int ReturnAddrIndex = FuncInfo->getRAIndex();
   2978 
   2979   if (ReturnAddrIndex == 0) {
   2980     // Set up a frame object for the return address.
   2981     uint64_t SlotSize = TD->getPointerSize();
   2982     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
   2983                                                            false);
   2984     FuncInfo->setRAIndex(ReturnAddrIndex);
   2985   }
   2986 
   2987   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
   2988 }
   2989 
   2990 
   2991 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   2992                                        bool hasSymbolicDisplacement) {
   2993   // Offset should fit into 32 bit immediate field.
   2994   if (!isInt<32>(Offset))
   2995     return false;
   2996 
   2997   // If we don't have a symbolic displacement - we don't have any extra
   2998   // restrictions.
   2999   if (!hasSymbolicDisplacement)
   3000     return true;
   3001 
   3002   // FIXME: Some tweaks might be needed for medium code model.
   3003   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3004     return false;
   3005 
    3006   // For the small code model we assume that the last object ends at least 16MB
    3007   // before the 31-bit boundary. We can also accept fairly large negative
    3008   // constants, knowing that all objects are in the positive half of the address space.
   3009   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3010     return true;
   3011 
    3012   // For the kernel code model we know that all objects reside in the negative
    3013   // half of the 32-bit address space. We must not accept negative offsets, since
    3014   // they could fall outside that range, but we can accept fairly large positive ones.
   3015   if (M == CodeModel::Kernel && Offset > 0)
   3016     return true;
   3017 
   3018   return false;
   3019 }
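         // Illustrative applications of the rule above (offsets assumed for the example):
         //   Small code model:  GV + 4MB   -> acceptable (well inside the 16MB margin)
         //   Small code model:  GV + 100MB -> rejected
         //   Kernel code model: GV + 64    -> acceptable (positive offset)
         //   Kernel code model: GV - 8     -> rejected (could fall outside the kernel range)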
   3020 
   3021 /// isCalleePop - Determines whether the callee is required to pop its
   3022 /// own arguments. Callee pop is necessary to support tail calls.
   3023 bool X86::isCalleePop(CallingConv::ID CallingConv,
   3024                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
   3025   if (IsVarArg)
   3026     return false;
   3027 
   3028   switch (CallingConv) {
   3029   default:
   3030     return false;
   3031   case CallingConv::X86_StdCall:
   3032     return !is64Bit;
   3033   case CallingConv::X86_FastCall:
   3034     return !is64Bit;
   3035   case CallingConv::X86_ThisCall:
   3036     return !is64Bit;
   3037   case CallingConv::Fast:
   3038     return TailCallOpt;
   3039   case CallingConv::GHC:
   3040     return TailCallOpt;
   3041   }
   3042 }
   3043 
    3044 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
    3045 /// specific condition code, returning the condition code and the LHS/RHS of the
   3046 /// comparison to make.
   3047 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
   3048                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
   3049   if (!isFP) {
   3050     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   3051       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
   3052         // X > -1   -> X == 0, jump !sign.
   3053         RHS = DAG.getConstant(0, RHS.getValueType());
   3054         return X86::COND_NS;
   3055       } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
   3056         // X < 0   -> X == 0, jump on sign.
   3057         return X86::COND_S;
   3058       } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   3059         // X < 1   -> X <= 0
   3060         RHS = DAG.getConstant(0, RHS.getValueType());
   3061         return X86::COND_LE;
   3062       }
   3063     }
   3064 
   3065     switch (SetCCOpcode) {
   3066     default: llvm_unreachable("Invalid integer condition!");
   3067     case ISD::SETEQ:  return X86::COND_E;
   3068     case ISD::SETGT:  return X86::COND_G;
   3069     case ISD::SETGE:  return X86::COND_GE;
   3070     case ISD::SETLT:  return X86::COND_L;
   3071     case ISD::SETLE:  return X86::COND_LE;
   3072     case ISD::SETNE:  return X86::COND_NE;
   3073     case ISD::SETULT: return X86::COND_B;
   3074     case ISD::SETUGT: return X86::COND_A;
   3075     case ISD::SETULE: return X86::COND_BE;
   3076     case ISD::SETUGE: return X86::COND_AE;
   3077     }
   3078   }
   3079 
   3080   // First determine if it is required or is profitable to flip the operands.
   3081 
   3082   // If LHS is a foldable load, but RHS is not, flip the condition.
   3083   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   3084       !ISD::isNON_EXTLoad(RHS.getNode())) {
   3085     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   3086     std::swap(LHS, RHS);
   3087   }
   3088 
   3089   switch (SetCCOpcode) {
   3090   default: break;
   3091   case ISD::SETOLT:
   3092   case ISD::SETOLE:
   3093   case ISD::SETUGT:
   3094   case ISD::SETUGE:
   3095     std::swap(LHS, RHS);
   3096     break;
   3097   }
   3098 
   3099   // On a floating point condition, the flags are set as follows:
   3100   // ZF  PF  CF   op
   3101   //  0 | 0 | 0 | X > Y
   3102   //  0 | 0 | 1 | X < Y
   3103   //  1 | 0 | 0 | X == Y
   3104   //  1 | 1 | 1 | unordered
   3105   switch (SetCCOpcode) {
   3106   default: llvm_unreachable("Condcode should be pre-legalized away");
   3107   case ISD::SETUEQ:
   3108   case ISD::SETEQ:   return X86::COND_E;
   3109   case ISD::SETOLT:              // flipped
   3110   case ISD::SETOGT:
   3111   case ISD::SETGT:   return X86::COND_A;
   3112   case ISD::SETOLE:              // flipped
   3113   case ISD::SETOGE:
   3114   case ISD::SETGE:   return X86::COND_AE;
   3115   case ISD::SETUGT:              // flipped
   3116   case ISD::SETULT:
   3117   case ISD::SETLT:   return X86::COND_B;
   3118   case ISD::SETUGE:              // flipped
   3119   case ISD::SETULE:
   3120   case ISD::SETLE:   return X86::COND_BE;
   3121   case ISD::SETONE:
   3122   case ISD::SETNE:   return X86::COND_NE;
   3123   case ISD::SETUO:   return X86::COND_P;
   3124   case ISD::SETO:    return X86::COND_NP;
   3125   case ISD::SETOEQ:
   3126   case ISD::SETUNE:  return X86::COND_INVALID;
   3127   }
   3128 }
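         // For example (illustrative, assuming neither operand is a foldable load that
         // would trigger the earlier flip): lowering 'setolt x, y' swaps the operands
         // per the table above, so the compare is emitted as (y, x) and the condition
         // returned is COND_A, i.e. a 'ja' that is taken exactly when the original
         // x < y held and the operands were ordered.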
   3129 
    3130 /// hasFPCMov - Is there a floating point cmov for the specific X86 condition
    3131 /// code? The current x86 ISA includes the following FP cmov instructions:
    3132 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   3133 static bool hasFPCMov(unsigned X86CC) {
   3134   switch (X86CC) {
   3135   default:
   3136     return false;
   3137   case X86::COND_B:
   3138   case X86::COND_BE:
   3139   case X86::COND_E:
   3140   case X86::COND_P:
   3141   case X86::COND_A:
   3142   case X86::COND_AE:
   3143   case X86::COND_NE:
   3144   case X86::COND_NP:
   3145     return true;
   3146   }
   3147 }
   3148 
   3149 /// isFPImmLegal - Returns true if the target can instruction select the
   3150 /// specified FP immediate natively. If false, the legalizer will
   3151 /// materialize the FP immediate as a load from a constant pool.
   3152 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   3153   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   3154     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   3155       return true;
   3156   }
   3157   return false;
   3158 }
   3159 
   3160 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
    3161 /// the specified half-open range [Low, Hi).
   3162 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   3163   return (Val < 0) || (Val >= Low && Val < Hi);
   3164 }
   3165 
   3166 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
   3167 /// specified value.
   3168 static bool isUndefOrEqual(int Val, int CmpVal) {
   3169   if (Val < 0 || Val == CmpVal)
   3170     return true;
   3171   return false;
   3172 }
   3173 
    3174 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
    3175 /// at position Pos and ending at Pos+Size, falls within the specified
    3176 /// sequential range [Low, Low+Size), or is undef.
   3177 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   3178                                        int Pos, int Size, int Low) {
   3179   for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   3180     if (!isUndefOrEqual(Mask[i], Low))
   3181       return false;
   3182   return true;
   3183 }
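         // For example, with Mask = <4, -1, 6, 7, 0, 1, 2, 3>, the call
         // isSequentialOrUndefInRange(Mask, 0, 4, 4) returns true: elements 0..3 are
         // 4, undef, 6, 7, which match the sequence 4, 5, 6, 7 up to undefs.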
   3184 
   3185 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
   3186 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
   3187 /// the second operand.
   3188 static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
   3189   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
   3190     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   3191   if (VT == MVT::v2f64 || VT == MVT::v2i64)
   3192     return (Mask[0] < 2 && Mask[1] < 2);
   3193   return false;
   3194 }
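         // For example, for v4i32 the mask <2, 3, 0, 1> is a valid PSHUFD mask (every
         // index refers to the first operand), while <0, 5, 2, 3> is not, because
         // index 5 would pull an element from the second operand.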
   3195 
   3196 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
   3197 /// is suitable for input to PSHUFHW.
   3198 static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
   3199   if (VT != MVT::v8i16)
   3200     return false;
   3201 
   3202   // Lower quadword copied in order or undef.
   3203   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
   3204     return false;
   3205 
   3206   // Upper quadword shuffled.
   3207   for (unsigned i = 4; i != 8; ++i)
   3208     if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
   3209       return false;
   3210 
   3211   return true;
   3212 }
   3213 
   3214 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
   3215 /// is suitable for input to PSHUFLW.
   3216 static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
   3217   if (VT != MVT::v8i16)
   3218     return false;
   3219 
   3220   // Upper quadword copied in order.
   3221   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
   3222     return false;
   3223 
   3224   // Lower quadword shuffled.
   3225   for (unsigned i = 0; i != 4; ++i)
   3226     if (Mask[i] >= 4)
   3227       return false;
   3228 
   3229   return true;
   3230 }
   3231 
   3232 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
   3233 /// is suitable for input to PALIGNR.
   3234 static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
   3235                           const X86Subtarget *Subtarget) {
   3236   if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
   3237       (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
   3238     return false;
   3239 
   3240   unsigned NumElts = VT.getVectorNumElements();
   3241   unsigned NumLanes = VT.getSizeInBits()/128;
   3242   unsigned NumLaneElts = NumElts/NumLanes;
   3243 
   3244   // Do not handle 64-bit element shuffles with palignr.
   3245   if (NumLaneElts == 2)
   3246     return false;
   3247 
   3248   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
   3249     unsigned i;
   3250     for (i = 0; i != NumLaneElts; ++i) {
   3251       if (Mask[i+l] >= 0)
   3252         break;
   3253     }
   3254 
   3255     // Lane is all undef, go to next lane
   3256     if (i == NumLaneElts)
   3257       continue;
   3258 
   3259     int Start = Mask[i+l];
   3260 
    3261     // Make sure it's in this lane in one of the sources.
   3262     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
   3263         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
   3264       return false;
   3265 
   3266     // If not lane 0, then we must match lane 0
   3267     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
   3268       return false;
   3269 
   3270     // Correct second source to be contiguous with first source
   3271     if (Start >= (int)NumElts)
   3272       Start -= NumElts - NumLaneElts;
   3273 
   3274     // Make sure we're shifting in the right direction.
   3275     if (Start <= (int)(i+l))
   3276       return false;
   3277 
   3278     Start -= i;
   3279 
   3280     // Check the rest of the elements to see if they are consecutive.
   3281     for (++i; i != NumLaneElts; ++i) {
   3282       int Idx = Mask[i+l];
   3283 
    3284       // Make sure it's in this lane.
   3285       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
   3286           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
   3287         return false;
   3288 
   3289       // If not lane 0, then we must match lane 0
   3290       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
   3291         return false;
   3292 
   3293       if (Idx >= (int)NumElts)
   3294         Idx -= NumElts - NumLaneElts;
   3295 
   3296       if (!isUndefOrEqual(Idx, Start+i))
   3297         return false;
   3298 
   3299     }
   3300   }
   3301 
   3302   return true;
   3303 }
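         // For example, for v8i16 the mask <1, 2, 3, 4, 5, 6, 7, 8> is accepted: it
         // takes elements 1..7 of the first source followed by element 0 of the second,
         // i.e. a one-element (2-byte) shift across the concatenated sources, which is
         // what PALIGNR with a byte immediate of 2 produces (modulo the operand
         // ordering that is fixed up elsewhere during lowering).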
   3304 
   3305 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
   3306 /// the two vector operands have swapped position.
   3307 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
   3308                                      unsigned NumElems) {
   3309   for (unsigned i = 0; i != NumElems; ++i) {
   3310     int idx = Mask[i];
   3311     if (idx < 0)
   3312       continue;
   3313     else if (idx < (int)NumElems)
   3314       Mask[i] = idx + NumElems;
   3315     else
   3316       Mask[i] = idx - NumElems;
   3317   }
   3318 }
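         // For example, with NumElems = 4 the mask <0, 5, 2, 7> becomes <4, 1, 6, 3>:
         // indexes below NumElems move into the second operand's range and vice versa,
         // while undef (negative) entries are left untouched.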
   3319 
   3320 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
   3321 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
   3322 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
   3323 /// reverse of what x86 shuffles want.
   3324 static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
   3325                         bool Commuted = false) {
   3326   if (!HasAVX && VT.getSizeInBits() == 256)
   3327     return false;
   3328 
   3329   unsigned NumElems = VT.getVectorNumElements();
   3330   unsigned NumLanes = VT.getSizeInBits()/128;
   3331   unsigned NumLaneElems = NumElems/NumLanes;
   3332 
   3333   if (NumLaneElems != 2 && NumLaneElems != 4)
   3334     return false;
   3335 
   3336   // VSHUFPSY divides the resulting vector into 4 chunks.
    3337   // The sources are also split into 4 chunks, and each destination
   3338   // chunk must come from a different source chunk.
   3339   //
   3340   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
    3341   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
   3342   //
   3343   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
   3344   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
   3345   //
   3346   // VSHUFPDY divides the resulting vector into 4 chunks.
    3347   // The sources are also split into 4 chunks, and each destination
   3348   // chunk must come from a different source chunk.
   3349   //
   3350   //  SRC1 =>      X3       X2       X1       X0
   3351   //  SRC2 =>      Y3       Y2       Y1       Y0
   3352   //
   3353   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   3354   //
   3355   unsigned HalfLaneElems = NumLaneElems/2;
   3356   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
   3357     for (unsigned i = 0; i != NumLaneElems; ++i) {
   3358       int Idx = Mask[i+l];
   3359       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
   3360       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
   3361         return false;
   3362       // For VSHUFPSY, the mask of the second half must be the same as the
   3363       // first but with the appropriate offsets. This works in the same way as
   3364       // VPERMILPS works with masks.
   3365       if (NumElems != 8 || l == 0 || Mask[i] < 0)
   3366         continue;
   3367       if (!isUndefOrEqual(Idx, Mask[i]+l))
   3368         return false;
   3369     }
   3370   }
   3371 
   3372   return true;
   3373 }
   3374 
   3375 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3376 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
   3377 static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
   3378   unsigned NumElems = VT.getVectorNumElements();
   3379 
   3380   if (VT.getSizeInBits() != 128)
   3381     return false;
   3382 
   3383   if (NumElems != 4)
   3384     return false;
   3385 
   3386   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
   3387   return isUndefOrEqual(Mask[0], 6) &&
   3388          isUndefOrEqual(Mask[1], 7) &&
   3389          isUndefOrEqual(Mask[2], 2) &&
   3390          isUndefOrEqual(Mask[3], 3);
   3391 }
   3392 
   3393 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
   3394 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
   3395 /// <2, 3, 2, 3>
   3396 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
   3397   unsigned NumElems = VT.getVectorNumElements();
   3398 
   3399   if (VT.getSizeInBits() != 128)
   3400     return false;
   3401 
   3402   if (NumElems != 4)
   3403     return false;
   3404 
   3405   return isUndefOrEqual(Mask[0], 2) &&
   3406          isUndefOrEqual(Mask[1], 3) &&
   3407          isUndefOrEqual(Mask[2], 2) &&
   3408          isUndefOrEqual(Mask[3], 3);
   3409 }
   3410 
   3411 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
   3412 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
   3413 static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
   3414   if (VT.getSizeInBits() != 128)
   3415     return false;
   3416 
   3417   unsigned NumElems = VT.getVectorNumElements();
   3418 
   3419   if (NumElems != 2 && NumElems != 4)
   3420     return false;
   3421 
   3422   for (unsigned i = 0; i != NumElems/2; ++i)
   3423     if (!isUndefOrEqual(Mask[i], i + NumElems))
   3424       return false;
   3425 
   3426   for (unsigned i = NumElems/2; i != NumElems; ++i)
   3427     if (!isUndefOrEqual(Mask[i], i))
   3428       return false;
   3429 
   3430   return true;
   3431 }
   3432 
   3433 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3434 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
   3435 static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
   3436   unsigned NumElems = VT.getVectorNumElements();
   3437 
   3438   if ((NumElems != 2 && NumElems != 4)
   3439       || VT.getSizeInBits() > 128)
   3440     return false;
   3441 
   3442   for (unsigned i = 0; i != NumElems/2; ++i)
   3443     if (!isUndefOrEqual(Mask[i], i))
   3444       return false;
   3445 
   3446   for (unsigned i = 0; i != NumElems/2; ++i)
   3447     if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems))
   3448       return false;
   3449 
   3450   return true;
   3451 }
   3452 
   3453 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
   3454 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
   3455 static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
   3456                          bool HasAVX2, bool V2IsSplat = false) {
   3457   unsigned NumElts = VT.getVectorNumElements();
   3458 
   3459   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3460          "Unsupported vector type for unpckh");
   3461 
   3462   if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
   3463       (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
   3464     return false;
   3465 
   3466   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3467   // independently on 128-bit lanes.
   3468   unsigned NumLanes = VT.getSizeInBits()/128;
   3469   unsigned NumLaneElts = NumElts/NumLanes;
   3470 
   3471   for (unsigned l = 0; l != NumLanes; ++l) {
   3472     for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
   3473          i != (l+1)*NumLaneElts;
   3474          i += 2, ++j) {
   3475       int BitI  = Mask[i];
   3476       int BitI1 = Mask[i+1];
   3477       if (!isUndefOrEqual(BitI, j))
   3478         return false;
   3479       if (V2IsSplat) {
   3480         if (!isUndefOrEqual(BitI1, NumElts))
   3481           return false;
   3482       } else {
   3483         if (!isUndefOrEqual(BitI1, j + NumElts))
   3484           return false;
   3485       }
   3486     }
   3487   }
   3488 
   3489   return true;
   3490 }
   3491 
   3492 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
   3493 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
   3494 static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
   3495                          bool HasAVX2, bool V2IsSplat = false) {
   3496   unsigned NumElts = VT.getVectorNumElements();
   3497 
   3498   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3499          "Unsupported vector type for unpckh");
   3500 
   3501   if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
   3502       (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
   3503     return false;
   3504 
   3505   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3506   // independently on 128-bit lanes.
   3507   unsigned NumLanes = VT.getSizeInBits()/128;
   3508   unsigned NumLaneElts = NumElts/NumLanes;
   3509 
   3510   for (unsigned l = 0; l != NumLanes; ++l) {
   3511     for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
   3512          i != (l+1)*NumLaneElts; i += 2, ++j) {
   3513       int BitI  = Mask[i];
   3514       int BitI1 = Mask[i+1];
   3515       if (!isUndefOrEqual(BitI, j))
   3516         return false;
   3517       if (V2IsSplat) {
    3518         if (!isUndefOrEqual(BitI1, NumElts))
   3519           return false;
   3520       } else {
   3521         if (!isUndefOrEqual(BitI1, j+NumElts))
   3522           return false;
   3523       }
   3524     }
   3525   }
   3526   return true;
   3527 }
   3528 
   3529 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
   3530 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
   3531 /// <0, 0, 1, 1>
   3532 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
   3533                                   bool HasAVX2) {
   3534   unsigned NumElts = VT.getVectorNumElements();
   3535 
   3536   assert((VT.is128BitVector() || VT.is256BitVector()) &&
    3537          "Unsupported vector type for unpckl");
   3538 
   3539   if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
   3540       (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
   3541     return false;
   3542 
   3543   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
   3544   // FIXME: Need a better way to get rid of this, there's no latency difference
    3545   // between UNPCKLPD and MOVDDUP; the latter should always be checked first
    3546   // and the former later. We should also remove the "_undef" special mask.
   3547   if (NumElts == 4 && VT.getSizeInBits() == 256)
   3548     return false;
   3549 
   3550   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3551   // independently on 128-bit lanes.
   3552   unsigned NumLanes = VT.getSizeInBits()/128;
   3553   unsigned NumLaneElts = NumElts/NumLanes;
   3554 
   3555   for (unsigned l = 0; l != NumLanes; ++l) {
   3556     for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
   3557          i != (l+1)*NumLaneElts;
   3558          i += 2, ++j) {
   3559       int BitI  = Mask[i];
   3560       int BitI1 = Mask[i+1];
   3561 
   3562       if (!isUndefOrEqual(BitI, j))
   3563         return false;
   3564       if (!isUndefOrEqual(BitI1, j))
   3565         return false;
   3566     }
   3567   }
   3568 
   3569   return true;
   3570 }
   3571 
   3572 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
   3573 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
   3574 /// <2, 2, 3, 3>
   3575 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
   3576   unsigned NumElts = VT.getVectorNumElements();
   3577 
   3578   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3579          "Unsupported vector type for unpckh");
   3580 
   3581   if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
   3582       (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
   3583     return false;
   3584 
   3585   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3586   // independently on 128-bit lanes.
   3587   unsigned NumLanes = VT.getSizeInBits()/128;
   3588   unsigned NumLaneElts = NumElts/NumLanes;
   3589 
   3590   for (unsigned l = 0; l != NumLanes; ++l) {
   3591     for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
   3592          i != (l+1)*NumLaneElts; i += 2, ++j) {
   3593       int BitI  = Mask[i];
   3594       int BitI1 = Mask[i+1];
   3595       if (!isUndefOrEqual(BitI, j))
   3596         return false;
   3597       if (!isUndefOrEqual(BitI1, j))
   3598         return false;
   3599     }
   3600   }
   3601   return true;
   3602 }
   3603 
   3604 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
   3605 /// specifies a shuffle of elements that is suitable for input to MOVSS,
   3606 /// MOVSD, and MOVD, i.e. setting the lowest element.
   3607 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
   3608   if (VT.getVectorElementType().getSizeInBits() < 32)
   3609     return false;
   3610   if (VT.getSizeInBits() == 256)
   3611     return false;
   3612 
   3613   unsigned NumElts = VT.getVectorNumElements();
   3614 
   3615   if (!isUndefOrEqual(Mask[0], NumElts))
   3616     return false;
   3617 
   3618   for (unsigned i = 1; i != NumElts; ++i)
   3619     if (!isUndefOrEqual(Mask[i], i))
   3620       return false;
   3621 
   3622   return true;
   3623 }
   3624 
   3625 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
   3626 /// as permutations between 128-bit chunks or halves. As an example: this
    3627 /// shuffle below:
   3628 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
    3629 /// The first half comes from the second half of V1 and the second half from
    3630 /// the second half of V2.
   3631 static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
   3632   if (!HasAVX || VT.getSizeInBits() != 256)
   3633     return false;
   3634 
   3635   // The shuffle result is divided into half A and half B. In total the two
   3636   // sources have 4 halves, namely: C, D, E, F. The final values of A and
   3637   // B must come from C, D, E or F.
   3638   unsigned HalfSize = VT.getVectorNumElements()/2;
   3639   bool MatchA = false, MatchB = false;
   3640 
   3641   // Check if A comes from one of C, D, E, F.
   3642   for (unsigned Half = 0; Half != 4; ++Half) {
   3643     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
   3644       MatchA = true;
   3645       break;
   3646     }
   3647   }
   3648 
   3649   // Check if B comes from one of C, D, E, F.
   3650   for (unsigned Half = 0; Half != 4; ++Half) {
   3651     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
   3652       MatchB = true;
   3653       break;
   3654     }
   3655   }
   3656 
   3657   return MatchA && MatchB;
   3658 }
   3659 
   3660 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
    3661 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
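         /// For example, the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> takes its first half
         /// from half 1 (the high half of V1) and its second half from half 3 (the high
         /// half of V2), giving an immediate of 1 | (3 << 4) = 0x31.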
   3662 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   3663   EVT VT = SVOp->getValueType(0);
   3664 
   3665   unsigned HalfSize = VT.getVectorNumElements()/2;
   3666 
   3667   unsigned FstHalf = 0, SndHalf = 0;
   3668   for (unsigned i = 0; i < HalfSize; ++i) {
   3669     if (SVOp->getMaskElt(i) > 0) {
   3670       FstHalf = SVOp->getMaskElt(i)/HalfSize;
   3671       break;
   3672     }
   3673   }
   3674   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
   3675     if (SVOp->getMaskElt(i) > 0) {
   3676       SndHalf = SVOp->getMaskElt(i)/HalfSize;
   3677       break;
   3678     }
   3679   }
   3680 
   3681   return (FstHalf | (SndHalf << 4));
   3682 }
   3683 
   3684 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
   3685 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
    3686 /// Note that VPERMIL mask matching differs depending on whether the underlying
    3687 /// element type is 32 or 64 bits. For VPERMILPS, the high half of the mask should
    3688 /// point to the same elements as the low half, but within the high half of the source.
   3689 /// In VPERMILPD the two lanes could be shuffled independently of each other
   3690 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
   3691 static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
   3692   if (!HasAVX)
   3693     return false;
   3694 
   3695   unsigned NumElts = VT.getVectorNumElements();
   3696   // Only match 256-bit with 32/64-bit types
   3697   if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
   3698     return false;
   3699 
   3700   unsigned NumLanes = VT.getSizeInBits()/128;
   3701   unsigned LaneSize = NumElts/NumLanes;
   3702   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   3703     for (unsigned i = 0; i != LaneSize; ++i) {
   3704       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   3705         return false;
   3706       if (NumElts != 8 || l == 0)
   3707         continue;
   3708       // VPERMILPS handling
   3709       if (Mask[i] < 0)
   3710         continue;
   3711       if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
   3712         return false;
   3713     }
   3714   }
   3715 
   3716   return true;
   3717 }
   3718 
    3719 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
    3720 /// x86 MOVSS wants. MOVSS requires the lowest element to be the lowest element
    3721 /// of vector 2 and the other elements to come from vector 1 in order.
   3722 static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
   3723                                bool V2IsSplat = false, bool V2IsUndef = false) {
   3724   unsigned NumOps = VT.getVectorNumElements();
   3725   if (VT.getSizeInBits() == 256)
   3726     return false;
   3727   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
   3728     return false;
   3729 
   3730   if (!isUndefOrEqual(Mask[0], 0))
   3731     return false;
   3732 
   3733   for (unsigned i = 1; i != NumOps; ++i)
   3734     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
   3735           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
   3736           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
   3737       return false;
   3738 
   3739   return true;
   3740 }
   3741 
   3742 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   3743 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
   3744 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
   3745 static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
   3746                            const X86Subtarget *Subtarget) {
   3747   if (!Subtarget->hasSSE3())
   3748     return false;
   3749 
   3750   unsigned NumElems = VT.getVectorNumElements();
   3751 
   3752   if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
   3753       (VT.getSizeInBits() == 256 && NumElems != 8))
   3754     return false;
   3755 
   3756   // "i+1" is the value the indexed mask element must have
   3757   for (unsigned i = 0; i != NumElems; i += 2)
   3758     if (!isUndefOrEqual(Mask[i], i+1) ||
   3759         !isUndefOrEqual(Mask[i+1], i+1))
   3760       return false;
   3761 
   3762   return true;
   3763 }
   3764 
   3765 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   3766 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
   3767 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
   3768 static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
   3769                            const X86Subtarget *Subtarget) {
   3770   if (!Subtarget->hasSSE3())
   3771     return false;
   3772 
   3773   unsigned NumElems = VT.getVectorNumElements();
   3774 
   3775   if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
   3776       (VT.getSizeInBits() == 256 && NumElems != 8))
   3777     return false;
   3778 
   3779   // "i" is the value the indexed mask element must have
   3780   for (unsigned i = 0; i != NumElems; i += 2)
   3781     if (!isUndefOrEqual(Mask[i], i) ||
   3782         !isUndefOrEqual(Mask[i+1], i))
   3783       return false;
   3784 
   3785   return true;
   3786 }
   3787 
   3788 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
   3789 /// specifies a shuffle of elements that is suitable for input to 256-bit
   3790 /// version of MOVDDUP.
   3791 static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
   3792   unsigned NumElts = VT.getVectorNumElements();
   3793 
   3794   if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
   3795     return false;
   3796 
   3797   for (unsigned i = 0; i != NumElts/2; ++i)
   3798     if (!isUndefOrEqual(Mask[i], 0))
   3799       return false;
   3800   for (unsigned i = NumElts/2; i != NumElts; ++i)
   3801     if (!isUndefOrEqual(Mask[i], NumElts/2))
   3802       return false;
   3803   return true;
   3804 }
   3805 
   3806 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   3807 /// specifies a shuffle of elements that is suitable for input to 128-bit
   3808 /// version of MOVDDUP.
   3809 static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
   3810   if (VT.getSizeInBits() != 128)
   3811     return false;
   3812 
   3813   unsigned e = VT.getVectorNumElements() / 2;
   3814   for (unsigned i = 0; i != e; ++i)
   3815     if (!isUndefOrEqual(Mask[i], i))
   3816       return false;
   3817   for (unsigned i = 0; i != e; ++i)
   3818     if (!isUndefOrEqual(Mask[e+i], i))
   3819       return false;
   3820   return true;
   3821 }
   3822 
   3823 /// isVEXTRACTF128Index - Return true if the specified
   3824 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
   3825 /// suitable for input to VEXTRACTF128.
   3826 bool X86::isVEXTRACTF128Index(SDNode *N) {
   3827   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   3828     return false;
   3829 
   3830   // The index should be aligned on a 128-bit boundary.
   3831   uint64_t Index =
   3832     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   3833 
   3834   unsigned VL = N->getValueType(0).getVectorNumElements();
   3835   unsigned VBits = N->getValueType(0).getSizeInBits();
   3836   unsigned ElSize = VBits / VL;
   3837   bool Result = (Index * ElSize) % 128 == 0;
   3838 
   3839   return Result;
   3840 }
   3841 
   3842 /// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
   3843 /// operand specifies a subvector insert that is suitable for input to
   3844 /// VINSERTF128.
   3845 bool X86::isVINSERTF128Index(SDNode *N) {
   3846   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   3847     return false;
   3848 
   3849   // The index should be aligned on a 128-bit boundary.
   3850   uint64_t Index =
   3851     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   3852 
   3853   unsigned VL = N->getValueType(0).getVectorNumElements();
   3854   unsigned VBits = N->getValueType(0).getSizeInBits();
   3855   unsigned ElSize = VBits / VL;
   3856   bool Result = (Index * ElSize) % 128 == 0;
   3857 
   3858   return Result;
   3859 }
   3860 
   3861 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
   3862 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
   3863 /// Handles 128-bit and 256-bit.
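         /// For example, the v4f32 mask <3, 2, 1, 0> encodes as
         /// 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.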
   3864 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   3865   EVT VT = N->getValueType(0);
   3866 
   3867   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3868          "Unsupported vector type for PSHUF/SHUFP");
   3869 
   3870   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
   3871   // independently on 128-bit lanes.
   3872   unsigned NumElts = VT.getVectorNumElements();
   3873   unsigned NumLanes = VT.getSizeInBits()/128;
   3874   unsigned NumLaneElts = NumElts/NumLanes;
   3875 
   3876   assert((NumLaneElts == 2 || NumLaneElts == 4) &&
   3877          "Only supports 2 or 4 elements per lane");
   3878 
   3879   unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
   3880   unsigned Mask = 0;
   3881   for (unsigned i = 0; i != NumElts; ++i) {
   3882     int Elt = N->getMaskElt(i);
   3883     if (Elt < 0) continue;
   3884     Elt %= NumLaneElts;
   3885     unsigned ShAmt = i << Shift;
   3886     if (ShAmt >= 8) ShAmt -= 8;
   3887     Mask |= Elt << ShAmt;
   3888   }
   3889 
   3890   return Mask;
   3891 }
   3892 
   3893 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
   3894 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
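         /// For example, the mask <0, 1, 2, 3, 7, 6, 5, 4>, which reverses the high four
         /// words, yields the immediate 0x1B.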
   3895 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
   3896   unsigned Mask = 0;
   3897   // 8 nodes, but we only care about the last 4.
   3898   for (unsigned i = 7; i >= 4; --i) {
   3899     int Val = N->getMaskElt(i);
   3900     if (Val >= 0)
   3901       Mask |= (Val - 4);
   3902     if (i != 4)
   3903       Mask <<= 2;
   3904   }
   3905   return Mask;
   3906 }
   3907 
   3908 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
   3909 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
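         /// For example, the mask <3, 2, 1, 0, 4, 5, 6, 7>, which reverses the low four
         /// words, yields the immediate 0x1B.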
   3910 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
   3911   unsigned Mask = 0;
   3912   // 8 nodes, but we only care about the first 4.
   3913   for (int i = 3; i >= 0; --i) {
   3914     int Val = N->getMaskElt(i);
   3915     if (Val >= 0)
   3916       Mask |= Val;
   3917     if (i != 0)
   3918       Mask <<= 2;
   3919   }
   3920   return Mask;
   3921 }
   3922 
   3923 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
   3924 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
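         /// For example, for a v8i16 shuffle whose first defined mask element is 4 (at
         /// position 0), the returned byte immediate is (4 - 0) * 2 = 8.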
   3925 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   3926   EVT VT = SVOp->getValueType(0);
   3927   unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
   3928 
   3929   unsigned NumElts = VT.getVectorNumElements();
   3930   unsigned NumLanes = VT.getSizeInBits()/128;
   3931   unsigned NumLaneElts = NumElts/NumLanes;
   3932 
   3933   int Val = 0;
   3934   unsigned i;
   3935   for (i = 0; i != NumElts; ++i) {
   3936     Val = SVOp->getMaskElt(i);
   3937     if (Val >= 0)
   3938       break;
   3939   }
   3940   if (Val >= (int)NumElts)
   3941     Val -= NumElts - NumLaneElts;
   3942 
   3943   assert(Val - i > 0 && "PALIGNR imm should be positive");
   3944   return (Val - i) * EltSize;
   3945 }
   3946 
   3947 /// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
   3948 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
   3949 /// instructions.
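         /// For example, extracting the 128-bit subvector starting at element 4 of a
         /// v8f32 gives 128/32 = 4 elements per chunk, so the immediate is 4 / 4 = 1.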
   3950 unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
   3951   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   3952     llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
   3953 
   3954   uint64_t Index =
   3955     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   3956 
   3957   EVT VecVT = N->getOperand(0).getValueType();
   3958   EVT ElVT = VecVT.getVectorElementType();
   3959 
   3960   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
   3961   return Index / NumElemsPerChunk;
   3962 }
   3963 
   3964 /// getInsertVINSERTF128Immediate - Return the appropriate immediate
   3965 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
   3966 /// instructions.
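         /// For example, inserting a 128-bit subvector at element 4 of a v8f32 result
         /// yields an immediate of 4 / 4 = 1 (the upper half).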
   3967 unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
   3968   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   3969     llvm_unreachable("Illegal insert subvector for VINSERTF128");
   3970 
   3971   uint64_t Index =
   3972     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   3973 
   3974   EVT VecVT = N->getValueType(0);
   3975   EVT ElVT = VecVT.getVectorElementType();
   3976 
   3977   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
   3978   return Index / NumElemsPerChunk;
   3979 }
   3980 
   3981 /// getShuffleCLImmediate - Return the appropriate immediate to shuffle
   3982 /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
   3983 /// Handles 256-bit.
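         /// For example, the v4i64 mask <3, 2, 1, 0> encodes as
         /// 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.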
   3984 static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
   3985   EVT VT = N->getValueType(0);
   3986 
   3987   unsigned NumElts = VT.getVectorNumElements();
   3988 
   3989   assert((VT.is256BitVector() && NumElts == 4) &&
   3990          "Unsupported vector type for VPERMQ/VPERMPD");
   3991 
   3992   unsigned Mask = 0;
   3993   for (unsigned i = 0; i != NumElts; ++i) {
   3994     int Elt = N->getMaskElt(i);
   3995     if (Elt < 0)
   3996       continue;
   3997     Mask |= Elt << (i*2);
   3998   }
   3999 
   4000   return Mask;
   4001 }
   4002 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
   4003 /// constant +0.0.
   4004 bool X86::isZeroNode(SDValue Elt) {
   4005   return ((isa<ConstantSDNode>(Elt) &&
   4006            cast<ConstantSDNode>(Elt)->isNullValue()) ||
   4007           (isa<ConstantFPSDNode>(Elt) &&
   4008            cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
   4009 }
   4010 
   4011 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
   4012 /// their permute mask.
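         /// For example, the v4i32 shuffle (V1, V2, <0, 1, 4, 5>) becomes
         /// (V2, V1, <4, 5, 0, 1>); undef (negative) mask elements are left unchanged.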
   4013 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
   4014                                     SelectionDAG &DAG) {
   4015   EVT VT = SVOp->getValueType(0);
   4016   unsigned NumElems = VT.getVectorNumElements();
   4017   SmallVector<int, 8> MaskVec;
   4018 
   4019   for (unsigned i = 0; i != NumElems; ++i) {
   4020     int idx = SVOp->getMaskElt(i);
   4021     if (idx < 0)
   4022       MaskVec.push_back(idx);
   4023     else if (idx < (int)NumElems)
   4024       MaskVec.push_back(idx + NumElems);
   4025     else
   4026       MaskVec.push_back(idx - NumElems);
   4027   }
   4028   return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
   4029                               SVOp->getOperand(0), &MaskVec[0]);
   4030 }
   4031 
   4032 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
   4033 /// match movhlps. The lower half elements should come from upper half of
   4034 /// V1 (and in order), and the upper half elements should come from the upper
   4035 /// half of V2 (and in order).
   4036 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
   4037   if (VT.getSizeInBits() != 128)
   4038     return false;
   4039   if (VT.getVectorNumElements() != 4)
   4040     return false;
   4041   for (unsigned i = 0, e = 2; i != e; ++i)
   4042     if (!isUndefOrEqual(Mask[i], i+2))
   4043       return false;
   4044   for (unsigned i = 2; i != 4; ++i)
   4045     if (!isUndefOrEqual(Mask[i], i+4))
   4046       return false;
   4047   return true;
   4048 }
   4049 
   4050 /// isScalarLoadToVector - Returns true if the node is a scalar load that
   4051 /// is promoted to a vector. It also returns the LoadSDNode by reference if
   4052 /// required.
   4053 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
   4054   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
   4055     return false;
   4056   N = N->getOperand(0).getNode();
   4057   if (!ISD::isNON_EXTLoad(N))
   4058     return false;
   4059   if (LD)
   4060     *LD = cast<LoadSDNode>(N);
   4061   return true;
   4062 }
   4063 
    4064 // Test whether the given value is a constant BUILD_VECTOR which will be
    4065 // legalized into a constant-pool load.
   4066 static bool WillBeConstantPoolLoad(SDNode *N) {
   4067   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4068     return false;
   4069 
   4070   // Check for any non-constant elements.
   4071   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
   4072     switch (N->getOperand(i).getNode()->getOpcode()) {
   4073     case ISD::UNDEF:
   4074     case ISD::ConstantFP:
   4075     case ISD::Constant:
   4076       break;
   4077     default:
   4078       return false;
   4079     }
   4080 
   4081   // Vectors of all-zeros and all-ones are materialized with special
   4082   // instructions rather than being loaded.
   4083   return !ISD::isBuildVectorAllZeros(N) &&
   4084          !ISD::isBuildVectorAllOnes(N);
   4085 }
   4086 
   4087 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
   4088 /// match movlp{s|d}. The lower half elements should come from lower half of
   4089 /// V1 (and in order), and the upper half elements should come from the upper
   4090 /// half of V2 (and in order). And since V1 will become the source of the
   4091 /// MOVLP, it must be either a vector load or a scalar load to vector.
   4092 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
   4093                                ArrayRef<int> Mask, EVT VT) {
   4094   if (VT.getSizeInBits() != 128)
   4095     return false;
   4096 
   4097   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
   4098     return false;
    4099   // If V2 is a vector load, don't do this transformation. We will try to use
    4100   // a load-folding shufps op instead.
   4101   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
   4102     return false;
   4103 
   4104   unsigned NumElems = VT.getVectorNumElements();
   4105 
   4106   if (NumElems != 2 && NumElems != 4)
   4107     return false;
   4108   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   4109     if (!isUndefOrEqual(Mask[i], i))
   4110       return false;
   4111   for (unsigned i = NumElems/2; i != NumElems; ++i)
   4112     if (!isUndefOrEqual(Mask[i], i+NumElems))
   4113       return false;
   4114   return true;
   4115 }
   4116 
   4117 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
   4118 /// all the same.
   4119 static bool isSplatVector(SDNode *N) {
   4120   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4121     return false;
   4122 
   4123   SDValue SplatValue = N->getOperand(0);
   4124   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
   4125     if (N->getOperand(i) != SplatValue)
   4126       return false;
   4127   return true;
   4128 }
   4129 
   4130 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
    4131 /// to a zero vector.
   4132 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
   4133 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
   4134   SDValue V1 = N->getOperand(0);
   4135   SDValue V2 = N->getOperand(1);
   4136   unsigned NumElems = N->getValueType(0).getVectorNumElements();
   4137   for (unsigned i = 0; i != NumElems; ++i) {
   4138     int Idx = N->getMaskElt(i);
   4139     if (Idx >= (int)NumElems) {
   4140       unsigned Opc = V2.getOpcode();
   4141       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
   4142         continue;
   4143       if (Opc != ISD::BUILD_VECTOR ||
   4144           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
   4145         return false;
   4146     } else if (Idx >= 0) {
   4147       unsigned Opc = V1.getOpcode();
   4148       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
   4149         continue;
   4150       if (Opc != ISD::BUILD_VECTOR ||
   4151           !X86::isZeroNode(V1.getOperand(Idx)))
   4152         return false;
   4153     }
   4154   }
   4155   return true;
   4156 }
   4157 
   4158 /// getZeroVector - Returns a vector of specified type with all zero elements.
   4159 ///
   4160 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
   4161                              SelectionDAG &DAG, DebugLoc dl) {
   4162   assert(VT.isVector() && "Expected a vector type");
   4163 
   4164   // Always build SSE zero vectors as <4 x i32> bitcasted
   4165   // to their dest type. This ensures they get CSE'd.
   4166   SDValue Vec;
   4167   if (VT.getSizeInBits() == 128) {  // SSE
   4168     if (Subtarget->hasSSE2()) {  // SSE2
   4169       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4170       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4171     } else { // SSE1
   4172       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
   4173       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
   4174     }
   4175   } else if (VT.getSizeInBits() == 256) { // AVX
   4176     if (Subtarget->hasAVX2()) { // AVX2
   4177       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4178       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4179       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
   4180     } else {
   4181       // 256-bit logic and arithmetic instructions in AVX are all
   4182       // floating-point, no support for integer ops. Emit fp zeroed vectors.
   4183       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
   4184       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4185       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
   4186     }
   4187   }
   4188   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4189 }
   4190 
   4191 /// getOnesVector - Returns a vector of specified type with all bits set.
   4192 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
    4193 /// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32> appropriately.
   4194 /// Then bitcast to their original type, ensuring they get CSE'd.
   4195 static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
   4196                              DebugLoc dl) {
   4197   assert(VT.isVector() && "Expected a vector type");
   4198   assert((VT.is128BitVector() || VT.is256BitVector())
   4199          && "Expected a 128-bit or 256-bit vector type");
   4200 
   4201   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
   4202   SDValue Vec;
   4203   if (VT.getSizeInBits() == 256) {
   4204     if (HasAVX2) { // AVX2
   4205       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4206       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
   4207     } else { // AVX
   4208       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4209       SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
   4210                                 Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
   4211       Vec = Insert128BitVector(InsV, Vec,
   4212                     DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
   4213     }
   4214   } else {
   4215     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4216   }
   4217 
   4218   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4219 }
   4220 
    4221 /// NormalizeMask - V2 is a splat; modify the mask (if needed) so all elements
    4222 /// that point to V2 point to its first element.
   4223 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
   4224   for (unsigned i = 0; i != NumElems; ++i) {
   4225     if (Mask[i] > (int)NumElems) {
   4226       Mask[i] = NumElems;
   4227     }
   4228   }
   4229 }
   4230 
    4231 /// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
    4232 /// operation of the specified width.
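         /// For a 4-element type this produces the mask <4, 1, 2, 3>: element 0 comes
         /// from V2 and the remaining elements come from V1.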
   4233 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4234                        SDValue V2) {
   4235   unsigned NumElems = VT.getVectorNumElements();
   4236   SmallVector<int, 8> Mask;
   4237   Mask.push_back(NumElems);
   4238   for (unsigned i = 1; i != NumElems; ++i)
   4239     Mask.push_back(i);
   4240   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4241 }
   4242 
   4243 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
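         /// For a 4-element type this produces the mask <0, 4, 1, 5>, interleaving the
         /// low halves of V1 and V2.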
   4244 static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4245                           SDValue V2) {
   4246   unsigned NumElems = VT.getVectorNumElements();
   4247   SmallVector<int, 8> Mask;
   4248   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
   4249     Mask.push_back(i);
   4250     Mask.push_back(i + NumElems);
   4251   }
   4252   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4253 }
   4254 
   4255 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
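         /// For a 4-element type this produces the mask <2, 6, 3, 7>, interleaving the
         /// high halves of V1 and V2.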
   4256 static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4257                           SDValue V2) {
   4258   unsigned NumElems = VT.getVectorNumElements();
   4259   unsigned Half = NumElems/2;
   4260   SmallVector<int, 8> Mask;
   4261   for (unsigned i = 0; i != Half; ++i) {
   4262     Mask.push_back(i + Half);
   4263     Mask.push_back(i + NumElems + Half);
   4264   }
   4265   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4266 }
   4267 
    4268 // PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
    4269 // a generic shuffle instruction because the target has no such instructions.
    4270 // Generate shuffles which repeat the i16 and i8 elements until they can be
    4271 // represented by v4f32 and then be manipulated by target supported shuffles.
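         // For example, to splat element 5 of a v8i16: the element is in the high half,
         // so one unpackh of V with itself leaves two copies of it in 32-bit chunk 1,
         // and EltNo is updated to 1 for the later v4f32 splat.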
   4272 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
   4273   EVT VT = V.getValueType();
   4274   int NumElems = VT.getVectorNumElements();
   4275   DebugLoc dl = V.getDebugLoc();
   4276 
   4277   while (NumElems > 4) {
   4278     if (EltNo < NumElems/2) {
   4279       V = getUnpackl(DAG, dl, VT, V, V);
   4280     } else {
   4281       V = getUnpackh(DAG, dl, VT, V, V);
   4282       EltNo -= NumElems/2;
   4283     }
   4284     NumElems >>= 1;
   4285   }
   4286   return V;
   4287 }
   4288 
   4289 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
   4290 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
   4291   EVT VT = V.getValueType();
   4292   DebugLoc dl = V.getDebugLoc();
   4293   assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
   4294          && "Vector size not supported");
   4295 
   4296   if (VT.getSizeInBits() == 128) {
   4297     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
   4298     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
   4299     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
   4300                              &SplatMask[0]);
   4301   } else {
    4302     // To use VPERMILPS to splat scalars, the second half of the indices must
   4303     // refer to the higher part, which is a duplication of the lower one,
   4304     // because VPERMILPS can only handle in-lane permutations.
   4305     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
   4306                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
   4307 
   4308     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
   4309     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
   4310                              &SplatMask[0]);
   4311   }
   4312 
   4313   return DAG.getNode(ISD::BITCAST, dl, VT, V);
   4314 }
   4315 
   4316 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
   4317 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   4318   EVT SrcVT = SV->getValueType(0);
   4319   SDValue V1 = SV->getOperand(0);
   4320   DebugLoc dl = SV->getDebugLoc();
   4321 
   4322   int EltNo = SV->getSplatIndex();
   4323   int NumElems = SrcVT.getVectorNumElements();
   4324   unsigned Size = SrcVT.getSizeInBits();
   4325 
   4326   assert(((Size == 128 && NumElems > 4) || Size == 256) &&
   4327           "Unknown how to promote splat for type");
   4328 
   4329   // Extract the 128-bit part containing the splat element and update
   4330   // the splat element index when it refers to the higher register.
   4331   if (Size == 256) {
   4332     unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
   4333     V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
   4334     if (Idx > 0)
   4335       EltNo -= NumElems/2;
   4336   }
   4337 
    4338   // i16 and i8 vector types can't be used directly by a generic shuffle
    4339   // instruction because the target has no such instruction. Generate shuffles
    4340   // which repeat the i16 and i8 elements until they fit in i32, and then can
    4341   // be manipulated by target supported shuffles.
   4342   EVT EltVT = SrcVT.getVectorElementType();
   4343   if (EltVT == MVT::i8 || EltVT == MVT::i16)
   4344     V1 = PromoteSplati8i16(V1, DAG, EltNo);
   4345 
   4346   // Recreate the 256-bit vector and place the same 128-bit vector
   4347   // into the low and high part. This is necessary because we want
   4348   // to use VPERM* to shuffle the vectors
   4349   if (Size == 256) {
   4350     SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
   4351                          DAG.getConstant(0, MVT::i32), DAG, dl);
   4352     V1 = Insert128BitVector(InsV, V1,
   4353                DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
   4354   }
   4355 
   4356   return getLegalSplat(DAG, V1, EltNo);
   4357 }
   4358 
   4359 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
   4360 /// vector of zero or undef vector.  This produces a shuffle where the low
   4361 /// element of V2 is swizzled into the zero/undef vector, landing at element
   4362 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
   4363 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
   4364                                            bool IsZero,
   4365                                            const X86Subtarget *Subtarget,
   4366                                            SelectionDAG &DAG) {
   4367   EVT VT = V2.getValueType();
   4368   SDValue V1 = IsZero
   4369     ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
   4370   unsigned NumElems = VT.getVectorNumElements();
   4371   SmallVector<int, 16> MaskVec;
   4372   for (unsigned i = 0; i != NumElems; ++i)
   4373     // If this is the insertion idx, put the low elt of V2 here.
   4374     MaskVec.push_back(i == Idx ? NumElems : i);
   4375   return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
   4376 }
   4377 
   4378 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
   4379 /// target specific opcode. Returns true if the Mask could be calculated.
    4380 /// Sets IsUnary to true if the node only uses one source.
   4381 static bool getTargetShuffleMask(SDNode *N, EVT VT,
   4382                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   4383   unsigned NumElems = VT.getVectorNumElements();
   4384   SDValue ImmN;
   4385 
   4386   IsUnary = false;
   4387   switch(N->getOpcode()) {
   4388   case X86ISD::SHUFP:
   4389     ImmN = N->getOperand(N->getNumOperands()-1);
   4390     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4391     break;
   4392   case X86ISD::UNPCKH:
   4393     DecodeUNPCKHMask(VT, Mask);
   4394     break;
   4395   case X86ISD::UNPCKL:
   4396     DecodeUNPCKLMask(VT, Mask);
   4397     break;
   4398   case X86ISD::MOVHLPS:
   4399     DecodeMOVHLPSMask(NumElems, Mask);
   4400     break;
   4401   case X86ISD::MOVLHPS:
   4402     DecodeMOVLHPSMask(NumElems, Mask);
   4403     break;
   4404   case X86ISD::PSHUFD:
   4405   case X86ISD::VPERMILP:
   4406     ImmN = N->getOperand(N->getNumOperands()-1);
   4407     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4408     IsUnary = true;
   4409     break;
   4410   case X86ISD::PSHUFHW:
   4411     ImmN = N->getOperand(N->getNumOperands()-1);
   4412     DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4413     IsUnary = true;
   4414     break;
   4415   case X86ISD::PSHUFLW:
   4416     ImmN = N->getOperand(N->getNumOperands()-1);
   4417     DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4418     IsUnary = true;
   4419     break;
   4420   case X86ISD::MOVSS:
   4421   case X86ISD::MOVSD: {
    4422     // Index 0 always comes from the first element of the second source;
    4423     // this is why MOVSS and MOVSD are used in the first place. The other
    4424     // elements come from the other positions of the first source vector.
   4425     Mask.push_back(NumElems);
   4426     for (unsigned i = 1; i != NumElems; ++i) {
   4427       Mask.push_back(i);
   4428     }
   4429     break;
   4430   }
   4431   case X86ISD::VPERM2X128:
   4432     ImmN = N->getOperand(N->getNumOperands()-1);
   4433     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4434     if (Mask.empty()) return false;
   4435     break;
   4436   case X86ISD::MOVDDUP:
   4437   case X86ISD::MOVLHPD:
   4438   case X86ISD::MOVLPD:
   4439   case X86ISD::MOVLPS:
   4440   case X86ISD::MOVSHDUP:
   4441   case X86ISD::MOVSLDUP:
   4442   case X86ISD::PALIGN:
   4443     // Not yet implemented
   4444     return false;
   4445   default: llvm_unreachable("unknown target shuffle node");
   4446   }
   4447 
   4448   return true;
   4449 }
   4450 
   4451 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
   4452 /// element of the result of the vector shuffle.
   4453 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   4454                                    unsigned Depth) {
   4455   if (Depth == 6)
   4456     return SDValue();  // Limit search depth.
   4457 
   4458   SDValue V = SDValue(N, 0);
   4459   EVT VT = V.getValueType();
   4460   unsigned Opcode = V.getOpcode();
   4461 
   4462   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
   4463   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
   4464     int Elt = SV->getMaskElt(Index);
   4465 
   4466     if (Elt < 0)
   4467       return DAG.getUNDEF(VT.getVectorElementType());
   4468 
   4469     unsigned NumElems = VT.getVectorNumElements();
   4470     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
   4471                                          : SV->getOperand(1);
   4472     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   4473   }
   4474 
   4475   // Recurse into target specific vector shuffles to find scalars.
   4476   if (isTargetShuffle(Opcode)) {
   4477     unsigned NumElems = VT.getVectorNumElements();
   4478     SmallVector<int, 16> ShuffleMask;
   4479     SDValue ImmN;
   4480     bool IsUnary;
   4481 
   4482     if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary))
   4483       return SDValue();
   4484 
   4485     int Elt = ShuffleMask[Index];
   4486     if (Elt < 0)
   4487       return DAG.getUNDEF(VT.getVectorElementType());
   4488 
   4489     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
   4490                                            : N->getOperand(1);
   4491     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
   4492                                Depth+1);
   4493   }
   4494 
   4495   // Actual nodes that may contain scalar elements
   4496   if (Opcode == ISD::BITCAST) {
   4497     V = V.getOperand(0);
   4498     EVT SrcVT = V.getValueType();
   4499     unsigned NumElems = VT.getVectorNumElements();
   4500 
   4501     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
   4502       return SDValue();
   4503   }
   4504 
   4505   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   4506     return (Index == 0) ? V.getOperand(0)
   4507                         : DAG.getUNDEF(VT.getVectorElementType());
   4508 
   4509   if (V.getOpcode() == ISD::BUILD_VECTOR)
   4510     return V.getOperand(Index);
   4511 
   4512   return SDValue();
   4513 }
   4514 
    4515 /// getNumOfConsecutiveZeros - Return the number of consecutive zero elements
    4516 /// at one end of a vector shuffle operation's result. The search can start
    4517 /// from either direction, left or right.
   4518 static
   4519 unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
   4520                                   bool ZerosFromLeft, SelectionDAG &DAG) {
   4521   unsigned i;
   4522   for (i = 0; i != NumElems; ++i) {
   4523     unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
   4524     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
   4525     if (!(Elt.getNode() &&
   4526          (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
   4527       break;
   4528   }
   4529 
   4530   return i;
   4531 }
   4532 
    4533 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
   4534 /// correspond consecutively to elements from one of the vector operands,
   4535 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
   4536 static
   4537 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
   4538                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
   4539                               unsigned NumElems, unsigned &OpNum) {
   4540   bool SeenV1 = false;
   4541   bool SeenV2 = false;
   4542 
   4543   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
   4544     int Idx = SVOp->getMaskElt(i);
    4545     // Ignore undef indices
   4546     if (Idx < 0)
   4547       continue;
   4548 
   4549     if (Idx < (int)NumElems)
   4550       SeenV1 = true;
   4551     else
   4552       SeenV2 = true;
   4553 
   4554     // Only accept consecutive elements from the same vector
   4555     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
   4556       return false;
   4557   }
   4558 
   4559   OpNum = SeenV1 ? 0 : 1;
   4560   return true;
   4561 }
   4562 
   4563 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
    4564 /// logical right shift of a vector.
   4565 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4566                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   4567   unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
   4568   unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
   4569               false /* check zeros from right */, DAG);
   4570   unsigned OpSrc;
   4571 
   4572   if (!NumZeros)
   4573     return false;
   4574 
   4575   // Considering the elements in the mask that are not consecutive zeros,
   4576   // check if they consecutively come from only one of the source vectors.
   4577   //
   4578   //               V1 = {X, A, B, C}     0
   4579   //                         \  \  \    /
   4580   //   vector_shuffle V1, V2 <1, 2, 3, X>
   4581   //
   4582   if (!isShuffleMaskConsecutive(SVOp,
   4583             0,                   // Mask Start Index
   4584             NumElems-NumZeros,   // Mask End Index(exclusive)
   4585             NumZeros,            // Where to start looking in the src vector
   4586             NumElems,            // Number of elements in vector
   4587             OpSrc))              // Which source operand ?
   4588     return false;
   4589 
   4590   isLeft = false;
   4591   ShAmt = NumZeros;
   4592   ShVal = SVOp->getOperand(OpSrc);
   4593   return true;
   4594 }
   4595 
   4596 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
   4597 /// logical left shift of a vector.
   4598 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4599                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   4600   unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
   4601   unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
   4602               true /* check zeros from left */, DAG);
   4603   unsigned OpSrc;
   4604 
   4605   if (!NumZeros)
   4606     return false;
   4607 
   4608   // Considering the elements in the mask that are not consecutive zeros,
   4609   // check if they consecutively come from only one of the source vectors.
   4610   //
   4611   //                           0    { A, B, X, X } = V2
   4612   //                          / \    /  /
   4613   //   vector_shuffle V1, V2 <X, X, 4, 5>
   4614   //
   4615   if (!isShuffleMaskConsecutive(SVOp,
   4616             NumZeros,     // Mask Start Index
   4617             NumElems,     // Mask End Index(exclusive)
   4618             0,            // Where to start looking in the src vector
   4619             NumElems,     // Number of elements in vector
   4620             OpSrc))       // Which source operand ?
   4621     return false;
   4622 
   4623   isLeft = true;
   4624   ShAmt = NumZeros;
   4625   ShVal = SVOp->getOperand(OpSrc);
   4626   return true;
   4627 }
   4628 
   4629 /// isVectorShift - Returns true if the shuffle can be implemented as a
   4630 /// logical left or right shift of a vector.
   4631 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4632                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
    4633   // Although the logic below supports any bit width, there are no
   4634   // shift instructions which handle more than 128-bit vectors.
   4635   if (SVOp->getValueType(0).getSizeInBits() > 128)
   4636     return false;
   4637 
   4638   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
   4639       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
   4640     return true;
   4641 
   4642   return false;
   4643 }
   4644 
   4645 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
   4646 ///
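         /// Adjacent byte pairs are combined into an i16 (the even-indexed byte in the
         /// low 8 bits, the odd-indexed byte shifted left by 8), inserted into a v8i16,
         /// and the result is bitcast back to v16i8.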
   4647 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   4648                                        unsigned NumNonZero, unsigned NumZero,
   4649                                        SelectionDAG &DAG,
   4650                                        const X86Subtarget* Subtarget,
   4651                                        const TargetLowering &TLI) {
   4652   if (NumNonZero > 8)
   4653     return SDValue();
   4654 
   4655   DebugLoc dl = Op.getDebugLoc();
   4656   SDValue V(0, 0);
   4657   bool First = true;
   4658   for (unsigned i = 0; i < 16; ++i) {
   4659     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
   4660     if (ThisIsNonZero && First) {
   4661       if (NumZero)
   4662         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   4663       else
   4664         V = DAG.getUNDEF(MVT::v8i16);
   4665       First = false;
   4666     }
   4667 
   4668     if ((i & 1) != 0) {
   4669       SDValue ThisElt(0, 0), LastElt(0, 0);
   4670       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
   4671       if (LastIsNonZero) {
   4672         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
   4673                               MVT::i16, Op.getOperand(i-1));
   4674       }
   4675       if (ThisIsNonZero) {
   4676         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
   4677         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
   4678                               ThisElt, DAG.getConstant(8, MVT::i8));
   4679         if (LastIsNonZero)
   4680           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
   4681       } else
   4682         ThisElt = LastElt;
   4683 
   4684       if (ThisElt.getNode())
   4685         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
   4686                         DAG.getIntPtrConstant(i/2));
   4687     }
   4688   }
   4689 
   4690   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
   4691 }
   4692 
   4693 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
   4694 ///
   4695 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   4696                                      unsigned NumNonZero, unsigned NumZero,
   4697                                      SelectionDAG &DAG,
   4698                                      const X86Subtarget* Subtarget,
   4699                                      const TargetLowering &TLI) {
   4700   if (NumNonZero > 4)
   4701     return SDValue();
   4702 
   4703   DebugLoc dl = Op.getDebugLoc();
   4704   SDValue V(0, 0);
   4705   bool First = true;
   4706   for (unsigned i = 0; i < 8; ++i) {
   4707     bool isNonZero = (NonZeros & (1 << i)) != 0;
   4708     if (isNonZero) {
   4709       if (First) {
   4710         if (NumZero)
   4711           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   4712         else
   4713           V = DAG.getUNDEF(MVT::v8i16);
   4714         First = false;
   4715       }
   4716       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   4717                       MVT::v8i16, V, Op.getOperand(i),
   4718                       DAG.getIntPtrConstant(i));
   4719     }
   4720   }
   4721 
   4722   return V;
   4723 }
   4724 
   4725 /// getVShift - Return a vector logical shift node.
   4726 ///
   4727 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   4728                          unsigned NumBits, SelectionDAG &DAG,
   4729                          const TargetLowering &TLI, DebugLoc dl) {
   4730   assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
   4731   EVT ShVT = MVT::v2i64;
   4732   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   4733   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
   4734   return DAG.getNode(ISD::BITCAST, dl, VT,
   4735                      DAG.getNode(Opc, dl, ShVT, SrcOp,
   4736                              DAG.getConstant(NumBits,
   4737                                   TLI.getShiftAmountTy(SrcOp.getValueType()))));
   4738 }
   4739 
   4740 SDValue
   4741 X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
   4742                                           SelectionDAG &DAG) const {
   4743 
   4744   // Check if the scalar load can be widened into a vector load. And if
   4745   // the address is "base + cst" see if the cst can be "absorbed" into
   4746   // the shuffle mask.
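           // For example, with a 128-bit VT and an i32 load from "base + 8":
           // RequiredAlign = 16, StartOffset = 8 & ~15 = 0, so the widened 16-byte load
           // starts at the base and the splatted element index is (8 - 0) >> 2 = 2.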
   4747   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
   4748     SDValue Ptr = LD->getBasePtr();
   4749     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
   4750       return SDValue();
   4751     EVT PVT = LD->getValueType(0);
   4752     if (PVT != MVT::i32 && PVT != MVT::f32)
   4753       return SDValue();
   4754 
   4755     int FI = -1;
   4756     int64_t Offset = 0;
   4757     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
   4758       FI = FINode->getIndex();
   4759       Offset = 0;
   4760     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
   4761                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
   4762       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
   4763       Offset = Ptr.getConstantOperandVal(1);
   4764       Ptr = Ptr.getOperand(0);
   4765     } else {
   4766       return SDValue();
   4767     }
   4768 
   4769     // FIXME: 256-bit vector instructions don't require a strict alignment,
   4770     // improve this code to support it better.
   4771     unsigned RequiredAlign = VT.getSizeInBits()/8;
   4772     SDValue Chain = LD->getChain();
   4773     // Make sure the stack object alignment is at least 16 or 32.
   4774     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   4775     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
   4776       if (MFI->isFixedObjectIndex(FI)) {
   4777         // Can't change the alignment. FIXME: It's possible to compute
   4778         // the exact stack offset and reference FI + adjust offset instead.
   4779         // If someone *really* cares about this, that's the way to implement it.
   4780         return SDValue();
   4781       } else {
   4782         MFI->setObjectAlignment(FI, RequiredAlign);
   4783       }
   4784     }
   4785 
   4786     // (Offset % RequiredAlign) must be a multiple of 4. The address is then
   4787     // Ptr + (Offset & ~(RequiredAlign-1)).
   4788     if (Offset < 0)
   4789       return SDValue();
   4790     if ((Offset % RequiredAlign) & 3)
   4791       return SDValue();
   4792     int64_t StartOffset = Offset & ~(RequiredAlign-1);
   4793     if (StartOffset)
   4794       Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
   4795                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
   4796 
   4797     int EltNo = (Offset - StartOffset) >> 2;
   4798     int NumElems = VT.getVectorNumElements();
   4799 
   4800     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
   4801     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
   4802                              LD->getPointerInfo().getWithOffset(StartOffset),
   4803                              false, false, false, 0);
   4804 
   4805     SmallVector<int, 8> Mask;
   4806     for (int i = 0; i < NumElems; ++i)
   4807       Mask.push_back(EltNo);
   4808 
   4809     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   4810   }
   4811 
   4812   return SDValue();
   4813 }
   4814 
   4815 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
   4816 /// vector of type 'VT', see if the elements can be replaced by a single large
   4817 /// load which has the same value as a build_vector whose operands are 'elts'.
   4818 ///
   4819 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
   4820 ///
   4821 /// FIXME: we'd also like to handle the case where the last elements are zero
   4822 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
   4823 /// There's even a handy isZeroNode for that purpose.
   4824 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   4825                                         DebugLoc &DL, SelectionDAG &DAG) {
   4826   EVT EltVT = VT.getVectorElementType();
   4827   unsigned NumElems = Elts.size();
   4828 
   4829   LoadSDNode *LDBase = NULL;
   4830   unsigned LastLoadedElt = -1U;
   4831 
   4832   // For each element in the initializer, see if we've found a load or an undef.
   4833   // If we don't find an initial load element, or later load elements are
   4834   // non-consecutive, bail out.
   4835   for (unsigned i = 0; i < NumElems; ++i) {
   4836     SDValue Elt = Elts[i];
   4837 
   4838     if (!Elt.getNode() ||
   4839         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
   4840       return SDValue();
   4841     if (!LDBase) {
   4842       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
   4843         return SDValue();
   4844       LDBase = cast<LoadSDNode>(Elt.getNode());
   4845       LastLoadedElt = i;
   4846       continue;
   4847     }
   4848     if (Elt.getOpcode() == ISD::UNDEF)
   4849       continue;
   4850 
   4851     LoadSDNode *LD = cast<LoadSDNode>(Elt);
   4852     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
   4853       return SDValue();
   4854     LastLoadedElt = i;
   4855   }
   4856 
   4857   // If we have found an entire vector of loads and undefs, then return a large
   4858   // load of the entire vector width starting at the base pointer.  If we found
   4859   // consecutive loads for the low half, generate a vzext_load node.
   4860   if (LastLoadedElt == NumElems - 1) {
   4861     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
   4862       return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   4863                          LDBase->getPointerInfo(),
   4864                          LDBase->isVolatile(), LDBase->isNonTemporal(),
   4865                          LDBase->isInvariant(), 0);
   4866     return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   4867                        LDBase->getPointerInfo(),
   4868                        LDBase->isVolatile(), LDBase->isNonTemporal(),
   4869                        LDBase->isInvariant(), LDBase->getAlignment());
   4870   } else if (NumElems == 4 && LastLoadedElt == 1 &&
   4871              DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
   4872     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
   4873     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
   4874     SDValue ResNode =
   4875         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
   4876                                 LDBase->getPointerInfo(),
   4877                                 LDBase->getAlignment(),
   4878                                 false/*isVolatile*/, true/*ReadMem*/,
   4879                                 false/*WriteMem*/);
   4880     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
   4881   }
   4882   return SDValue();
   4883 }
   4884 
   4885 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
   4886 /// to generate a splat value for the following cases:
   4887 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
   4888 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
   4889 /// a scalar load, or a constant.
   4890 /// The VBROADCAST node is returned when a pattern is found,
   4891 /// or SDValue() otherwise.
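        /// For example, a v8f32 BUILD_VECTOR whose eight operands are all the same
        /// scalar load is turned into (v8f32 (VBROADCAST load)), i.e. a single
        /// vbroadcastss.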
   4892 SDValue
   4893 X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
   4894   if (!Subtarget->hasAVX())
   4895     return SDValue();
   4896 
   4897   EVT VT = Op.getValueType();
   4898   DebugLoc dl = Op.getDebugLoc();
   4899 
   4900   SDValue Ld;
   4901   bool ConstSplatVal;
   4902 
   4903   switch (Op.getOpcode()) {
   4904     default:
   4905       // Unknown pattern found.
   4906       return SDValue();
   4907 
   4908     case ISD::BUILD_VECTOR: {
   4909       // The BUILD_VECTOR node must be a splat.
   4910       if (!isSplatVector(Op.getNode()))
   4911         return SDValue();
   4912 
   4913       Ld = Op.getOperand(0);
   4914       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   4915                      Ld.getOpcode() == ISD::ConstantFP);
   4916 
   4917       // The suspected load node has several users. Make sure that all
   4918       // of its users are from the BUILD_VECTOR node.
   4919       // Constants may have multiple users.
   4920       if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
   4921         return SDValue();
   4922       break;
   4923     }
   4924 
   4925     case ISD::VECTOR_SHUFFLE: {
   4926       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   4927 
   4928       // Shuffles must have a splat mask where the first element is
   4929       // broadcasted.
   4930       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
   4931         return SDValue();
   4932 
   4933       SDValue Sc = Op.getOperand(0);
   4934       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
   4935         return SDValue();
   4936 
   4937       Ld = Sc.getOperand(0);
   4938       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   4939                        Ld.getOpcode() == ISD::ConstantFP);
   4940 
   4941       // The scalar_to_vector node and the suspected
   4942       // load node must have exactly one user.
   4943       // Constants may have multiple users.
   4944       if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
   4945         return SDValue();
   4946       break;
   4947     }
   4948   }
   4949 
   4950   bool Is256 = VT.getSizeInBits() == 256;
   4951   bool Is128 = VT.getSizeInBits() == 128;
   4952 
   4953   // Handle broadcasting a single constant scalar from the constant pool
   4954   // into a vector. On Sandybridge it is still better to load a constant vector
   4955   // from the constant pool and not to broadcast it from a scalar.
   4956   if (ConstSplatVal && Subtarget->hasAVX2()) {
   4957     EVT CVT = Ld.getValueType();
   4958     assert(!CVT.isVector() && "Must not broadcast a vector type");
   4959     unsigned ScalarSize = CVT.getSizeInBits();
   4960 
   4961     if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) ||
   4962         (Is128 && (ScalarSize == 32))) {
   4963 
   4964       const Constant *C = 0;
   4965       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
   4966         C = CI->getConstantIntValue();
   4967       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
   4968         C = CF->getConstantFPValue();
   4969 
   4970       assert(C && "Invalid constant type");
   4971 
   4972       SDValue CP = DAG.getConstantPool(C, getPointerTy());
   4973       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   4974       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
   4975                          MachinePointerInfo::getConstantPool(),
   4976                          false, false, false, Alignment);
   4977 
   4978       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   4979     }
   4980   }
   4981 
   4982   // The scalar source must be a normal load.
   4983   if (!ISD::isNormalLoad(Ld.getNode()))
   4984     return SDValue();
   4985 
   4986   // Reject loads that have uses of the chain result
   4987   if (Ld->hasAnyUseOfValue(1))
   4988     return SDValue();
   4989 
   4990   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   4991 
   4992   // VBroadcast to YMM
   4993   if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
   4994     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   4995 
   4996   // VBroadcast to XMM
   4997   if (Is128 && (ScalarSize == 32))
   4998     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   4999 
   5000   // The integer check is needed for the 64-bit into 128-bit case, so that it
   5001   // does not match double: there is no vbroadcastsd with an xmm destination.
   5002   if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
   5003     // VBroadcast to YMM
   5004     if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
   5005       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5006 
   5007     // VBroadcast to XMM
   5008     if (Is128 && (ScalarSize ==  8 || ScalarSize == 16 || ScalarSize == 64))
   5009       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5010   }
   5011 
   5012   // Unsupported broadcast.
   5013   return SDValue();
   5014 }
   5015 
   5016 SDValue
   5017 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   5018   DebugLoc dl = Op.getDebugLoc();
   5019 
   5020   EVT VT = Op.getValueType();
   5021   EVT ExtVT = VT.getVectorElementType();
   5022   unsigned NumElems = Op.getNumOperands();
   5023 
   5024   // Vectors containing all zeros can be matched by pxor and xorps later
   5025   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   5026     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
   5027     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
   5028     if (VT == MVT::v4i32 || VT == MVT::v8i32)
   5029       return Op;
   5030 
   5031     return getZeroVector(VT, Subtarget, DAG, dl);
   5032   }
   5033 
   5034   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   5035   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   5036   // vpcmpeqd on 256-bit vectors.
   5037   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
   5038     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
   5039       return Op;
   5040 
   5041     return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
   5042   }
   5043 
   5044   SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
   5045   if (Broadcast.getNode())
   5046     return Broadcast;
   5047 
   5048   unsigned EVTBits = ExtVT.getSizeInBits();
   5049 
   5050   unsigned NumZero  = 0;
   5051   unsigned NumNonZero = 0;
   5052   unsigned NonZeros = 0;
   5053   bool IsAllConstants = true;
   5054   SmallSet<SDValue, 8> Values;
   5055   for (unsigned i = 0; i < NumElems; ++i) {
   5056     SDValue Elt = Op.getOperand(i);
   5057     if (Elt.getOpcode() == ISD::UNDEF)
   5058       continue;
   5059     Values.insert(Elt);
   5060     if (Elt.getOpcode() != ISD::Constant &&
   5061         Elt.getOpcode() != ISD::ConstantFP)
   5062       IsAllConstants = false;
   5063     if (X86::isZeroNode(Elt))
   5064       NumZero++;
   5065     else {
   5066       NonZeros |= (1 << i);
   5067       NumNonZero++;
   5068     }
   5069   }
   5070 
   5071   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
   5072   if (NumNonZero == 0)
   5073     return DAG.getUNDEF(VT);
   5074 
   5075   // Special case for a single non-zero, non-undef element.
   5076   if (NumNonZero == 1) {
   5077     unsigned Idx = CountTrailingZeros_32(NonZeros);
   5078     SDValue Item = Op.getOperand(Idx);
   5079 
   5080     // If this is an insertion of an i64 value on x86-32, and if the top bits of
   5081     // the value are obviously zero, truncate the value to i32 and do the
   5082     // insertion that way.  Only do this if the value is non-constant or if the
   5083     // value is a constant being inserted into element 0.  It is cheaper to do
   5084     // a constant pool load than it is to do a movd + shuffle.
   5085     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
   5086         (!IsAllConstants || Idx == 0)) {
   5087       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
   5088         // Handle SSE only.
   5089         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
   5090         EVT VecVT = MVT::v4i32;
   5091         unsigned VecElts = 4;
   5092 
   5093         // Truncate the value (which may itself be a constant) to i32, and
   5094         // convert it to a vector with movd (S2V+shuffle to zero extend).
   5095         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
   5096         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
   5097         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5098 
   5099         // Now we have our 32-bit value zero extended in the low element of
   5100         // a vector.  If Idx != 0, swizzle it into place.
   5101         if (Idx != 0) {
   5102           SmallVector<int, 4> Mask;
   5103           Mask.push_back(Idx);
   5104           for (unsigned i = 1; i != VecElts; ++i)
   5105             Mask.push_back(i);
   5106           Item = DAG.getVectorShuffle(VecVT, dl, Item,
   5107                                       DAG.getUNDEF(Item.getValueType()),
   5108                                       &Mask[0]);
   5109         }
   5110         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   5111       }
   5112     }
   5113 
   5114     // If we have a constant or non-constant insertion into the low element of
   5115     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
   5116     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
   5117     // depending on what the source datatype is.
   5118     if (Idx == 0) {
   5119       if (NumZero == 0)
   5120         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5121 
   5122       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
   5123           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
   5124         if (VT.getSizeInBits() == 256) {
   5125           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
   5126           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
   5127                              Item, DAG.getIntPtrConstant(0));
   5128         }
   5129         assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
   5130         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5131         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
   5132         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5133       }
   5134 
   5135       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
   5136         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
   5137         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   5138         if (VT.getSizeInBits() == 256) {
   5139           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
   5140           Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
   5141                                     DAG, dl);
   5142         } else {
   5143           assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
   5144           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5145         }
   5146         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   5147       }
   5148     }
   5149 
   5150     // Is it a vector logical left shift?
   5151     if (NumElems == 2 && Idx == 1 &&
   5152         X86::isZeroNode(Op.getOperand(0)) &&
   5153         !X86::isZeroNode(Op.getOperand(1))) {
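              // For example, a v2i64 <0, x> is built by placing x in element 0 with
              // scalar_to_vector and shifting the whole register left by 64 bits
              // (pslldq by 8 bytes), which moves x into element 1 and zeroes element 0.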
   5154       unsigned NumBits = VT.getSizeInBits();
   5155       return getVShift(true, VT,
   5156                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   5157                                    VT, Op.getOperand(1)),
   5158                        NumBits/2, DAG, *this, dl);
   5159     }
   5160 
   5161     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
   5162       return SDValue();
   5163 
   5164     // Otherwise, if this is a vector with i32 or f32 elements, and the element
   5165     // is a non-constant being inserted into an element other than the low one,
   5166     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
   5167     // movd/movss) to move this into the low element, then shuffle it into
   5168     // place.
   5169     if (EVTBits == 32) {
   5170       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5171 
   5172       // Turn it into a shuffle of zero and zero-extended scalar to vector.
   5173       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
   5174       SmallVector<int, 8> MaskVec;
   5175       for (unsigned i = 0; i < NumElems; i++)
   5176         MaskVec.push_back(i == Idx ? 0 : 1);
   5177       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
   5178     }
   5179   }
   5180 
   5181   // Splat is obviously ok. Let legalizer expand it to a shuffle.
   5182   if (Values.size() == 1) {
   5183     if (EVTBits == 32) {
   5184       // Instead of a shuffle like this:
   5185       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
   5186       // check if it's possible to issue this instead:
   5187       // shuffle (vload ptr), undef, <1, 1, 1, 1>
   5188       unsigned Idx = CountTrailingZeros_32(NonZeros);
   5189       SDValue Item = Op.getOperand(Idx);
   5190       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
   5191         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
   5192     }
   5193     return SDValue();
   5194   }
   5195 
   5196   // A vector full of immediates; various special cases are already
   5197   // handled, so this is best done with a single constant-pool load.
   5198   if (IsAllConstants)
   5199     return SDValue();
   5200 
   5201   // For AVX-length vectors, build the individual 128-bit pieces and use
   5202   // shuffles to put them in place.
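          // For example, a v8i32 BUILD_VECTOR is split into two v4i32 BUILD_VECTORs
          // that are then recombined with vinsertf128 (via Insert128BitVector).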
   5203   if (VT.getSizeInBits() == 256) {
   5204     SmallVector<SDValue, 32> V;
   5205     for (unsigned i = 0; i != NumElems; ++i)
   5206       V.push_back(Op.getOperand(i));
   5207 
   5208     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
   5209 
   5210     // Build both the lower and upper subvector.
   5211     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
   5212     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
   5213                                 NumElems/2);
   5214 
   5215     // Recreate the wider vector with the lower and upper part.
   5216     SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
   5217                                 DAG.getConstant(0, MVT::i32), DAG, dl);
   5218     return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
   5219                               DAG, dl);
   5220   }
   5221 
   5222   // Let legalizer expand 2-wide build_vectors.
   5223   if (EVTBits == 64) {
   5224     if (NumNonZero == 1) {
   5225       // One half is zero or undef.
   5226       unsigned Idx = CountTrailingZeros_32(NonZeros);
   5227       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
   5228                                  Op.getOperand(Idx));
   5229       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
   5230     }
   5231     return SDValue();
   5232   }
   5233 
   5234   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   5235   if (EVTBits == 8 && NumElems == 16) {
   5236     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
   5237                                         Subtarget, *this);
   5238     if (V.getNode()) return V;
   5239   }
   5240 
   5241   if (EVTBits == 16 && NumElems == 8) {
   5242     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
   5243                                       Subtarget, *this);
   5244     if (V.getNode()) return V;
   5245   }
   5246 
   5247   // If element VT is == 32 bits, turn it into a number of shuffles.
   5248   SmallVector<SDValue, 8> V(NumElems);
   5249   if (NumElems == 4 && NumZero > 0) {
   5250     for (unsigned i = 0; i < 4; ++i) {
   5251       bool isZero = !(NonZeros & (1 << i));
   5252       if (isZero)
   5253         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
   5254       else
   5255         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   5256     }
   5257 
   5258     for (unsigned i = 0; i < 2; ++i) {
   5259       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
   5260         default: break;
   5261         case 0:
   5262           V[i] = V[i*2];  // Must be a zero vector.
   5263           break;
   5264         case 1:
   5265           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
   5266           break;
   5267         case 2:
   5268           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
   5269           break;
   5270         case 3:
   5271           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
   5272           break;
   5273       }
   5274     }
   5275 
   5276     bool Reverse1 = (NonZeros & 0x3) == 2;
   5277     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
   5278     int MaskVec[] = {
   5279       Reverse1 ? 1 : 0,
   5280       Reverse1 ? 0 : 1,
   5281       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
   5282       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
   5283     };
   5284     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
   5285   }
   5286 
   5287   if (Values.size() > 1 && VT.getSizeInBits() == 128) {
   5288     // Check for a build vector of consecutive loads.
   5289     for (unsigned i = 0; i < NumElems; ++i)
   5290       V[i] = Op.getOperand(i);
   5291 
   5292     // Check for elements which are consecutive loads.
   5293     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
   5294     if (LD.getNode())
   5295       return LD;
   5296 
   5297     // For SSE 4.1, use insertps to put the high elements into the low element.
   5298     if (getSubtarget()->hasSSE41()) {
   5299       SDValue Result;
   5300       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
   5301         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
   5302       else
   5303         Result = DAG.getUNDEF(VT);
   5304 
   5305       for (unsigned i = 1; i < NumElems; ++i) {
   5306         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
   5307         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
   5308                              Op.getOperand(i), DAG.getIntPtrConstant(i));
   5309       }
   5310       return Result;
   5311     }
   5312 
   5313     // Otherwise, expand into a number of unpckl*, start by extending each of
   5314     // our (non-undef) elements to the full vector width with the element in the
   5315     // bottom slot of the vector (which generates no code for SSE).
   5316     for (unsigned i = 0; i < NumElems; ++i) {
   5317       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
   5318         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   5319       else
   5320         V[i] = DAG.getUNDEF(VT);
   5321     }
   5322 
   5323     // Next, we iteratively mix elements, e.g. for v4f32:
   5324     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
   5325     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
   5326     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
   5327     unsigned EltStride = NumElems >> 1;
   5328     while (EltStride != 0) {
   5329       for (unsigned i = 0; i < EltStride; ++i) {
   5330         // If V[i+EltStride] is undef and this is the first round of mixing,
   5331         // then it is safe to just drop this shuffle: V[i] is already in the
   5332         // right place, the one element (since it's the first round) being
   5333         // inserted as undef can be dropped.  This isn't safe for successive
   5334         // rounds because they will permute elements within both vectors.
   5335         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
   5336             EltStride == NumElems/2)
   5337           continue;
   5338 
   5339         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
   5340       }
   5341       EltStride >>= 1;
   5342     }
   5343     return V[0];
   5344   }
   5345   return SDValue();
   5346 }
   5347 
   5348 // LowerMMXCONCAT_VECTORS - We support concatenating two MMX registers and
   5349 // placing the result in an XMM register.  This is better than a stack conversion.
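        // Operand 0 is moved from an MMX register into the low half of an XMM
        // register with movq2dq; operand 1 is then either merged in with a shuffle
        // of the two low quadwords (after its own movq2dq) or, when it is a
        // scalar_to_vector, inserted directly as an element.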
   5350 static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   5351   DebugLoc dl = Op.getDebugLoc();
   5352   EVT ResVT = Op.getValueType();
   5353 
   5354   assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
   5355          ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
   5356   int Mask[2];
   5357   SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
   5358   SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
   5359   InVec = Op.getOperand(1);
   5360   if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
   5361     unsigned NumElts = ResVT.getVectorNumElements();
   5362     VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
   5363     VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
   5364                        InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
   5365   } else {
   5366     InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
   5367     SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
   5368     Mask[0] = 0; Mask[1] = 2;
   5369     VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
   5370   }
   5371   return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
   5372 }
   5373 
   5374 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
   5375 // to create 256-bit vectors from two other 128-bit ones.
   5376 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   5377   DebugLoc dl = Op.getDebugLoc();
   5378   EVT ResVT = Op.getValueType();
   5379 
   5380   assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
   5381 
   5382   SDValue V1 = Op.getOperand(0);
   5383   SDValue V2 = Op.getOperand(1);
   5384   unsigned NumElems = ResVT.getVectorNumElements();
   5385 
   5386   SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
   5387                                  DAG.getConstant(0, MVT::i32), DAG, dl);
   5388   return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
   5389                             DAG, dl);
   5390 }
   5391 
   5392 SDValue
   5393 X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   5394   EVT ResVT = Op.getValueType();
   5395 
   5396   assert(Op.getNumOperands() == 2);
   5397   assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
   5398          "Unsupported CONCAT_VECTORS for value type");
   5399 
   5400   // We support concatenating two MMX registers and placing the result in an
   5401   // XMM register.  This is better than going through the stack.
   5402   if (ResVT.is128BitVector())
   5403     return LowerMMXCONCAT_VECTORS(Op, DAG);
   5404 
   5405   // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
   5406   // from two other 128-bit ones.
   5407   return LowerAVXCONCAT_VECTORS(Op, DAG);
   5408 }
   5409 
   5410 // Try to lower a shuffle node into a simple blend instruction.
   5411 static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op,
   5412                                           const X86Subtarget *Subtarget,
   5413                                           SelectionDAG &DAG) {
   5414   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5415   SDValue V1 = SVOp->getOperand(0);
   5416   SDValue V2 = SVOp->getOperand(1);
   5417   DebugLoc dl = SVOp->getDebugLoc();
   5418   EVT VT = Op.getValueType();
   5419   EVT InVT = V1.getValueType();
   5420   int MaskSize = VT.getVectorNumElements();
   5421   int InSize = InVT.getVectorNumElements();
   5422 
   5423   if (!Subtarget->hasSSE41())
   5424     return SDValue();
   5425 
   5426   if (MaskSize != InSize)
   5427     return SDValue();
   5428 
   5429   int ISDNo = 0;
   5430   MVT OpTy;
   5431 
   5432   switch (VT.getSimpleVT().SimpleTy) {
   5433   default: return SDValue();
   5434   case MVT::v8i16:
   5435            ISDNo = X86ISD::BLENDPW;
   5436            OpTy = MVT::v8i16;
   5437            break;
   5438   case MVT::v4i32:
   5439   case MVT::v4f32:
   5440            ISDNo = X86ISD::BLENDPS;
   5441            OpTy = MVT::v4f32;
   5442            break;
   5443   case MVT::v2i64:
   5444   case MVT::v2f64:
   5445            ISDNo = X86ISD::BLENDPD;
   5446            OpTy = MVT::v2f64;
   5447            break;
   5448   case MVT::v8i32:
   5449   case MVT::v8f32:
   5450            if (!Subtarget->hasAVX())
   5451              return SDValue();
   5452            ISDNo = X86ISD::BLENDPS;
   5453            OpTy = MVT::v8f32;
   5454            break;
   5455   case MVT::v4i64:
   5456   case MVT::v4f64:
   5457            if (!Subtarget->hasAVX())
   5458              return SDValue();
   5459            ISDNo = X86ISD::BLENDPD;
   5460            OpTy = MVT::v4f64;
   5461            break;
   5462   case MVT::v16i16:
   5463            if (!Subtarget->hasAVX2())
   5464              return SDValue();
   5465            ISDNo = X86ISD::BLENDPW;
   5466            OpTy = MVT::v16i16;
   5467            break;
   5468   }
   5469   assert(ISDNo && "Invalid Op Number");
   5470 
   5471   unsigned MaskVals = 0;
   5472 
   5473   for (int i = 0; i < MaskSize; ++i) {
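          // Build the blend mask: bit i is set when result element i comes from V1
          // (or is undef) and left clear when it comes from V2; any other mask
          // element means the shuffle cannot be expressed as a blend.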
   5474     int EltIdx = SVOp->getMaskElt(i);
   5475     if (EltIdx == i || EltIdx == -1)
   5476       MaskVals |= (1<<i);
   5477     else if (EltIdx == (i + MaskSize))
   5478       continue; // Bit is set to zero;
   5479     else return SDValue();
   5480   }
   5481 
   5482   V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
   5483   V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
   5484   SDValue Ret =  DAG.getNode(ISDNo, dl, OpTy, V1, V2,
   5485                              DAG.getConstant(MaskVals, MVT::i32));
   5486   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
   5487 }
   5488 
   5489 // v8i16 shuffles - Prefer shuffles in the following order:
   5490 // 1. [all]   pshuflw, pshufhw, optional move
   5491 // 2. [ssse3] 1 x pshufb
   5492 // 3. [ssse3] 2 x pshufb + 1 x por
   5493 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
   5494 SDValue
   5495 X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
   5496                                             SelectionDAG &DAG) const {
   5497   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5498   SDValue V1 = SVOp->getOperand(0);
   5499   SDValue V2 = SVOp->getOperand(1);
   5500   DebugLoc dl = SVOp->getDebugLoc();
   5501   SmallVector<int, 8> MaskVals;
   5502 
   5503   // Determine if more than 1 of the words in each of the low and high quadwords
   5504   // of the result come from the same quadword of one of the two inputs.  Undef
   5505   // mask values count as coming from any quadword, for better codegen.
   5506   unsigned LoQuad[] = { 0, 0, 0, 0 };
   5507   unsigned HiQuad[] = { 0, 0, 0, 0 };
   5508   std::bitset<4> InputQuads;
   5509   for (unsigned i = 0; i < 8; ++i) {
   5510     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
   5511     int EltIdx = SVOp->getMaskElt(i);
   5512     MaskVals.push_back(EltIdx);
   5513     if (EltIdx < 0) {
   5514       ++Quad[0];
   5515       ++Quad[1];
   5516       ++Quad[2];
   5517       ++Quad[3];
   5518       continue;
   5519     }
   5520     ++Quad[EltIdx / 4];
   5521     InputQuads.set(EltIdx / 4);
   5522   }
   5523 
   5524   int BestLoQuad = -1;
   5525   unsigned MaxQuad = 1;
   5526   for (unsigned i = 0; i < 4; ++i) {
   5527     if (LoQuad[i] > MaxQuad) {
   5528       BestLoQuad = i;
   5529       MaxQuad = LoQuad[i];
   5530     }
   5531   }
   5532 
   5533   int BestHiQuad = -1;
   5534   MaxQuad = 1;
   5535   for (unsigned i = 0; i < 4; ++i) {
   5536     if (HiQuad[i] > MaxQuad) {
   5537       BestHiQuad = i;
   5538       MaxQuad = HiQuad[i];
   5539     }
   5540   }
   5541 
   5542   // For SSSE3, if all 8 words of the result come from only 1 quadword of each
   5543   // of the two input vectors, shuffle them into one input vector so only a
   5544   // single pshufb instruction is necessary. If there are more than 2 input
   5545   // quads, disable the next transformation since it does not help SSSE3.
   5546   bool V1Used = InputQuads[0] || InputQuads[1];
   5547   bool V2Used = InputQuads[2] || InputQuads[3];
   5548   if (Subtarget->hasSSSE3()) {
   5549     if (InputQuads.count() == 2 && V1Used && V2Used) {
   5550       BestLoQuad = InputQuads[0] ? 0 : 1;
   5551       BestHiQuad = InputQuads[2] ? 2 : 3;
   5552     }
   5553     if (InputQuads.count() > 2) {
   5554       BestLoQuad = -1;
   5555       BestHiQuad = -1;
   5556     }
   5557   }
   5558 
   5559   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
   5560   // the shuffle mask.  If a quad is scored as -1, that means that it contains
   5561   // words from all 4 input quadwords.
   5562   SDValue NewV;
   5563   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
   5564     int MaskV[] = {
   5565       BestLoQuad < 0 ? 0 : BestLoQuad,
   5566       BestHiQuad < 0 ? 1 : BestHiQuad
   5567     };
   5568     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
   5569                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
   5570                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
   5571     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
   5572 
   5573     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
   5574     // source words for the shuffle, to aid later transformations.
   5575     bool AllWordsInNewV = true;
   5576     bool InOrder[2] = { true, true };
   5577     for (unsigned i = 0; i != 8; ++i) {
   5578       int idx = MaskVals[i];
   5579       if (idx != (int)i)
   5580         InOrder[i/4] = false;
   5581       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
   5582         continue;
   5583       AllWordsInNewV = false;
   5584       break;
   5585     }
   5586 
   5587     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
   5588     if (AllWordsInNewV) {
   5589       for (int i = 0; i != 8; ++i) {
   5590         int idx = MaskVals[i];
   5591         if (idx < 0)
   5592           continue;
   5593         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
   5594         if ((idx != i) && idx < 4)
   5595           pshufhw = false;
   5596         if ((idx != i) && idx > 3)
   5597           pshuflw = false;
   5598       }
   5599       V1 = NewV;
   5600       V2Used = false;
   5601       BestLoQuad = 0;
   5602       BestHiQuad = 1;
   5603     }
   5604 
   5605     // If we've eliminated the use of V2, and the new mask is a pshuflw or
   5606     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
   5607     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
   5608       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
   5609       unsigned TargetMask = 0;
   5610       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
   5611                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
   5612       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5613       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
   5614                              getShufflePSHUFLWImmediate(SVOp);
   5615       V1 = NewV.getOperand(0);
   5616       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
   5617     }
   5618   }
   5619 
   5620   // If we have SSSE3, and all words of the result are from 1 input vector,
   5621   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
   5622   // is present, fall back to case 4.
   5623   if (Subtarget->hasSSSE3()) {
   5624     SmallVector<SDValue,16> pshufbMask;
   5625 
   5626     // If we have elements from both input vectors, set the high bit of the
   5627     // shuffle mask element to zero out elements that come from V2 in the V1
   5628     // mask, and elements that come from V1 in the V2 mask, so that the two
   5629     // results can be OR'd together.
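            // (pshufb writes a zero to every destination byte whose mask byte has
            // the high bit set, which is why 0x80 is used below.)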
   5630     bool TwoInputs = V1Used && V2Used;
   5631     for (unsigned i = 0; i != 8; ++i) {
   5632       int EltIdx = MaskVals[i] * 2;
   5633       if (TwoInputs && (EltIdx >= 16)) {
   5634         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5635         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5636         continue;
   5637       }
   5638       pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
   5639       pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
   5640     }
   5641     V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
   5642     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
   5643                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5644                                  MVT::v16i8, &pshufbMask[0], 16));
   5645     if (!TwoInputs)
   5646       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   5647 
   5648     // Calculate the shuffle mask for the second input, shuffle it, and
   5649     // OR it with the first shuffled input.
   5650     pshufbMask.clear();
   5651     for (unsigned i = 0; i != 8; ++i) {
   5652       int EltIdx = MaskVals[i] * 2;
   5653       if (EltIdx < 16) {
   5654         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5655         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5656         continue;
   5657       }
   5658       pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
   5659       pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
   5660     }
   5661     V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
   5662     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
   5663                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5664                                  MVT::v16i8, &pshufbMask[0], 16));
   5665     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   5666     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   5667   }
   5668 
   5669   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
   5670   // and update MaskVals with new element order.
   5671   std::bitset<8> InOrder;
   5672   if (BestLoQuad >= 0) {
   5673     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
   5674     for (int i = 0; i != 4; ++i) {
   5675       int idx = MaskVals[i];
   5676       if (idx < 0) {
   5677         InOrder.set(i);
   5678       } else if ((idx / 4) == BestLoQuad) {
   5679         MaskV[i] = idx & 3;
   5680         InOrder.set(i);
   5681       }
   5682     }
   5683     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
   5684                                 &MaskV[0]);
   5685 
   5686     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
   5687       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5688       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
   5689                                   NewV.getOperand(0),
   5690                                   getShufflePSHUFLWImmediate(SVOp), DAG);
   5691     }
   5692   }
   5693 
   5694   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
   5695   // and update MaskVals with the new element order.
   5696   if (BestHiQuad >= 0) {
   5697     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
   5698     for (unsigned i = 4; i != 8; ++i) {
   5699       int idx = MaskVals[i];
   5700       if (idx < 0) {
   5701         InOrder.set(i);
   5702       } else if ((idx / 4) == BestHiQuad) {
   5703         MaskV[i] = (idx & 3) + 4;
   5704         InOrder.set(i);
   5705       }
   5706     }
   5707     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
   5708                                 &MaskV[0]);
   5709 
   5710     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
   5711       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5712       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
   5713                                   NewV.getOperand(0),
   5714                                   getShufflePSHUFHWImmediate(SVOp), DAG);
   5715     }
   5716   }
   5717 
   5718   // In case BestHi & BestLo were both -1, which means each quadword has a word
   5719   // from each of the four input quadwords, calculate the InOrder bitvector now
   5720   // before falling through to the insert/extract cleanup.
   5721   if (BestLoQuad == -1 && BestHiQuad == -1) {
   5722     NewV = V1;
   5723     for (int i = 0; i != 8; ++i)
   5724       if (MaskVals[i] < 0 || MaskVals[i] == i)
   5725         InOrder.set(i);
   5726   }
   5727 
   5728   // The other elements are put in the right place using pextrw and pinsrw.
   5729   for (unsigned i = 0; i != 8; ++i) {
   5730     if (InOrder[i])
   5731       continue;
   5732     int EltIdx = MaskVals[i];
   5733     if (EltIdx < 0)
   5734       continue;
   5735     SDValue ExtOp = (EltIdx < 8)
   5736     ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
   5737                   DAG.getIntPtrConstant(EltIdx))
   5738     : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
   5739                   DAG.getIntPtrConstant(EltIdx - 8));
   5740     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
   5741                        DAG.getIntPtrConstant(i));
   5742   }
   5743   return NewV;
   5744 }
   5745 
   5746 // v16i8 shuffles - Prefer shuffles in the following order:
   5747 // 1. [ssse3] 1 x pshufb
   5748 // 2. [ssse3] 2 x pshufb + 1 x por
   5749 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
   5750 static
   5751 SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   5752                                  SelectionDAG &DAG,
   5753                                  const X86TargetLowering &TLI) {
   5754   SDValue V1 = SVOp->getOperand(0);
   5755   SDValue V2 = SVOp->getOperand(1);
   5756   DebugLoc dl = SVOp->getDebugLoc();
   5757   ArrayRef<int> MaskVals = SVOp->getMask();
   5758 
   5759   // If we have SSSE3, case 1 is generated when all result bytes come from
   5760   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
   5761   // present, fall back to case 3.
   5762   // FIXME: kill V2Only once shuffles are canonicalized by getNode.
   5763   bool V1Only = true;
   5764   bool V2Only = true;
   5765   for (unsigned i = 0; i < 16; ++i) {
   5766     int EltIdx = MaskVals[i];
   5767     if (EltIdx < 0)
   5768       continue;
   5769     if (EltIdx < 16)
   5770       V2Only = false;
   5771     else
   5772       V1Only = false;
   5773   }
   5774 
   5775   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
   5776   if (TLI.getSubtarget()->hasSSSE3()) {
   5777     SmallVector<SDValue,16> pshufbMask;
   5778 
   5779     // If all result elements are from one input vector, then only translate
   5780     // undef mask values to 0x80 (zero out result) in the pshufb mask.
   5781     //
   5782     // Otherwise, we have elements from both input vectors, and must zero out
   5783     // elements that come from V2 in the first mask, and V1 in the second mask
   5784     // so that we can OR them together.
   5785     bool TwoInputs = !(V1Only || V2Only);
   5786     for (unsigned i = 0; i != 16; ++i) {
   5787       int EltIdx = MaskVals[i];
   5788       if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
   5789         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5790         continue;
   5791       }
   5792       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
   5793     }
   5794     // If all the elements are from V2, assign it to V1 and return after
   5795     // building the first pshufb.
   5796     if (V2Only)
   5797       V1 = V2;
   5798     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
   5799                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5800                                  MVT::v16i8, &pshufbMask[0], 16));
   5801     if (!TwoInputs)
   5802       return V1;
   5803 
   5804     // Calculate the shuffle mask for the second input, shuffle it, and
   5805     // OR it with the first shuffled input.
   5806     pshufbMask.clear();
   5807     for (unsigned i = 0; i != 16; ++i) {
   5808       int EltIdx = MaskVals[i];
   5809       if (EltIdx < 16) {
   5810         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   5811         continue;
   5812       }
   5813       pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
   5814     }
   5815     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
   5816                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5817                                  MVT::v16i8, &pshufbMask[0], 16));
   5818     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   5819   }
   5820 
   5821   // No SSSE3 - Calculate the in-place words, then fix all out-of-place words
   5822   // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
   5823   // the 16 different words that comprise the two doublequadword input vectors.
   5824   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   5825   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
   5826   SDValue NewV = V2Only ? V2 : V1;
   5827   for (int i = 0; i != 8; ++i) {
   5828     int Elt0 = MaskVals[i*2];
   5829     int Elt1 = MaskVals[i*2+1];
   5830 
   5831     // This word of the result is all undef, skip it.
   5832     if (Elt0 < 0 && Elt1 < 0)
   5833       continue;
   5834 
   5835     // This word of the result is already in the correct place, skip it.
   5836     if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
   5837       continue;
   5838     if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
   5839       continue;
   5840 
   5841     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
   5842     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
   5843     SDValue InsElt;
   5844 
   5845     // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
   5846     // together with a single extract, extract the word and insert it.
   5847     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
   5848       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
   5849                            DAG.getIntPtrConstant(Elt1 / 2));
   5850       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
   5851                         DAG.getIntPtrConstant(i));
   5852       continue;
   5853     }
   5854 
   5855     // If Elt1 is defined, extract it from the appropriate source.  If the
   5856     // source byte is not also odd, shift the extracted word left 8 bits;
   5857     // otherwise clear the bottom 8 bits if we need to do an OR.
   5858     if (Elt1 >= 0) {
   5859       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
   5860                            DAG.getIntPtrConstant(Elt1 / 2));
   5861       if ((Elt1 & 1) == 0)
   5862         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
   5863                              DAG.getConstant(8,
   5864                                   TLI.getShiftAmountTy(InsElt.getValueType())));
   5865       else if (Elt0 >= 0)
   5866         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
   5867                              DAG.getConstant(0xFF00, MVT::i16));
   5868     }
   5869     // If Elt0 is defined, extract it from the appropriate source.  If the
   5870     // source byte is not also even, shift the extracted word right 8 bits. If
   5871     // Elt1 was also defined, OR the extracted values together before
   5872     // inserting them in the result.
   5873     if (Elt0 >= 0) {
   5874       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
   5875                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
   5876       if ((Elt0 & 1) != 0)
   5877         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
   5878                               DAG.getConstant(8,
   5879                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
   5880       else if (Elt1 >= 0)
   5881         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
   5882                              DAG.getConstant(0x00FF, MVT::i16));
   5883       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
   5884                          : InsElt0;
   5885     }
   5886     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
   5887                        DAG.getIntPtrConstant(i));
   5888   }
   5889   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
   5890 }
   5891 
   5892 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
   5893 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
   5894 /// done when every pair / quad of shuffle mask elements point to elements in
   5895 /// the right sequence. e.g.
   5896 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
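        /// which, after bitcasting both operands to v4i32, becomes
        /// vector_shuffle X', Y', <1, 5, 0, 7>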
   5897 static
   5898 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
   5899                                  SelectionDAG &DAG, DebugLoc dl) {
   5900   EVT VT = SVOp->getValueType(0);
   5901   SDValue V1 = SVOp->getOperand(0);
   5902   SDValue V2 = SVOp->getOperand(1);
   5903   unsigned NumElems = VT.getVectorNumElements();
   5904   unsigned NewWidth = (NumElems == 4) ? 2 : 4;
   5905   EVT NewVT;
   5906   switch (VT.getSimpleVT().SimpleTy) {
   5907   default: llvm_unreachable("Unexpected!");
   5908   case MVT::v4f32: NewVT = MVT::v2f64; break;
   5909   case MVT::v4i32: NewVT = MVT::v2i64; break;
   5910   case MVT::v8i16: NewVT = MVT::v4i32; break;
   5911   case MVT::v16i8: NewVT = MVT::v4i32; break;
   5912   }
   5913 
   5914   int Scale = NumElems / NewWidth;
   5915   SmallVector<int, 8> MaskVec;
   5916   for (unsigned i = 0; i < NumElems; i += Scale) {
   5917     int StartIdx = -1;
   5918     for (int j = 0; j < Scale; ++j) {
   5919       int EltIdx = SVOp->getMaskElt(i+j);
   5920       if (EltIdx < 0)
   5921         continue;
   5922       if (StartIdx == -1)
   5923         StartIdx = EltIdx - (EltIdx % Scale);
   5924       if (EltIdx != StartIdx + j)
   5925         return SDValue();
   5926     }
   5927     if (StartIdx == -1)
   5928       MaskVec.push_back(-1);
   5929     else
   5930       MaskVec.push_back(StartIdx / Scale);
   5931   }
   5932 
   5933   V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
   5934   V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
   5935   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
   5936 }
   5937 
   5938 /// getVZextMovL - Return a zero-extending vector move low node.
   5939 ///
   5940 static SDValue getVZextMovL(EVT VT, EVT OpVT,
   5941                             SDValue SrcOp, SelectionDAG &DAG,
   5942                             const X86Subtarget *Subtarget, DebugLoc dl) {
   5943   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
   5944     LoadSDNode *LD = NULL;
   5945     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
   5946       LD = dyn_cast<LoadSDNode>(SrcOp);
   5947     if (!LD) {
   5948       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
   5949       // instead.
   5950       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
   5951       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
   5952           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   5953           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
   5954           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
   5955         // PR2108
   5956         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
   5957         return DAG.getNode(ISD::BITCAST, dl, VT,
   5958                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
   5959                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   5960                                                    OpVT,
   5961                                                    SrcOp.getOperand(0)
   5962                                                           .getOperand(0))));
   5963       }
   5964     }
   5965   }
   5966 
   5967   return DAG.getNode(ISD::BITCAST, dl, VT,
   5968                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
   5969                                  DAG.getNode(ISD::BITCAST, dl,
   5970                                              OpVT, SrcOp)));
   5971 }
   5972 
   5973 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
   5974 /// which could not be matched by any known target specific shuffle.
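        /// The result is built one 128-bit lane at a time: each lane becomes a
        /// shuffle of at most two 128-bit pieces extracted from the inputs, and the
        /// two lanes are then inserted back into a 256-bit vector.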
   5975 static SDValue
   5976 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   5977   EVT VT = SVOp->getValueType(0);
   5978 
   5979   unsigned NumElems = VT.getVectorNumElements();
   5980   unsigned NumLaneElems = NumElems / 2;
   5981 
   5982   DebugLoc dl = SVOp->getDebugLoc();
   5983   MVT EltVT = VT.getVectorElementType().getSimpleVT();
   5984   EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
   5985   SDValue Shufs[2];
   5986 
   5987   SmallVector<int, 16> Mask;
   5988   for (unsigned l = 0; l < 2; ++l) {
   5989     // Build a shuffle mask for the output, discovering on the fly which
   5990     // input vectors to use as shuffle operands (recorded in InputUsed).
   5991     // If building a suitable shuffle vector proves too hard, then bail
   5992     // out with useBuildVector set.
   5993     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
   5994     unsigned LaneStart = l * NumLaneElems;
   5995     for (unsigned i = 0; i != NumLaneElems; ++i) {
   5996       // The mask element.  This indexes into the input.
   5997       int Idx = SVOp->getMaskElt(i+LaneStart);
   5998       if (Idx < 0) {
   5999         // the mask element does not index into any input vector.
   6000         Mask.push_back(-1);
   6001         continue;
   6002       }
   6003 
   6004       // The input vector this mask element indexes into.
   6005       int Input = Idx / NumLaneElems;
   6006 
   6007       // Turn the index into an offset from the start of the input vector.
   6008       Idx -= Input * NumLaneElems;
   6009 
   6010       // Find or create a shuffle vector operand to hold this input.
   6011       unsigned OpNo;
   6012       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
   6013         if (InputUsed[OpNo] == Input)
   6014           // This input vector is already an operand.
   6015           break;
   6016         if (InputUsed[OpNo] < 0) {
   6017           // Create a new operand for this input vector.
   6018           InputUsed[OpNo] = Input;
   6019           break;
   6020         }
   6021       }
   6022 
   6023       if (OpNo >= array_lengthof(InputUsed)) {
   6024         // More than two input vectors used! Give up.
   6025         return SDValue();
   6026       }
   6027 
   6028       // Add the mask index for the new shuffle vector.
   6029       Mask.push_back(Idx + OpNo * NumLaneElems);
   6030     }
   6031 
   6032     if (InputUsed[0] < 0) {
   6033       // No input vectors were used! The result is undefined.
   6034       Shufs[l] = DAG.getUNDEF(NVT);
   6035     } else {
   6036       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
   6037                    DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32),
   6038                                    DAG, dl);
   6039       // If only one input was used, use an undefined vector for the other.
   6040       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
   6041         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
   6042                    DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32),
   6043                                    DAG, dl);
   6044       // At least one input vector was used. Create a new shuffle vector.
   6045       Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
   6046     }
   6047 
   6048     Mask.clear();
   6049   }
   6050 
    6051   // Concatenate the two lane results back into a single 256-bit vector.
   6052   SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0],
   6053                                  DAG.getConstant(0, MVT::i32), DAG, dl);
   6054   return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32),
   6055                             DAG, dl);
   6056 }
   6057 
   6058 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
   6059 /// 4 elements, and match them with several different shuffle types.
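         /// Depending on how many mask elements come from each source, this emits
         /// either two shuffles (at most two elements from each source), a pair of
         /// SHUFPS-style shuffles (three elements from one source and one from the
         /// other), or a generic (shuffle shuffle_lo, shuffle_hi) decomposition.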
   6060 static SDValue
   6061 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   6062   SDValue V1 = SVOp->getOperand(0);
   6063   SDValue V2 = SVOp->getOperand(1);
   6064   DebugLoc dl = SVOp->getDebugLoc();
   6065   EVT VT = SVOp->getValueType(0);
   6066 
   6067   assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
   6068 
   6069   std::pair<int, int> Locs[4];
   6070   int Mask1[] = { -1, -1, -1, -1 };
   6071   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
   6072 
   6073   unsigned NumHi = 0;
   6074   unsigned NumLo = 0;
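           // Count how many result elements come from V1 (mask indices 0-3) and
           // from V2 (indices 4-7). Mask1 packs the V1 elements first and the V2
           // elements starting at position 2; Locs records, for every result slot,
           // which source it came from and its ordinal within that source.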
   6075   for (unsigned i = 0; i != 4; ++i) {
   6076     int Idx = PermMask[i];
   6077     if (Idx < 0) {
   6078       Locs[i] = std::make_pair(-1, -1);
   6079     } else {
   6080       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
   6081       if (Idx < 4) {
   6082         Locs[i] = std::make_pair(0, NumLo);
   6083         Mask1[NumLo] = Idx;
   6084         NumLo++;
   6085       } else {
   6086         Locs[i] = std::make_pair(1, NumHi);
   6087         if (2+NumHi < 4)
   6088           Mask1[2+NumHi] = Idx;
   6089         NumHi++;
   6090       }
   6091     }
   6092   }
   6093 
   6094   if (NumLo <= 2 && NumHi <= 2) {
    6095     // No more than two elements come from either vector. This can be
    6096     // implemented with two shuffles. The first shuffle gathers the elements.
    6097     // The second shuffle, which takes the first shuffle as both of its
    6098     // vector operands, puts the elements into the right order.
   6099     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6100 
   6101     int Mask2[] = { -1, -1, -1, -1 };
   6102 
   6103     for (unsigned i = 0; i != 4; ++i)
   6104       if (Locs[i].first != -1) {
   6105         unsigned Idx = (i < 2) ? 0 : 4;
   6106         Idx += Locs[i].first * 2 + Locs[i].second;
   6107         Mask2[i] = Idx;
   6108       }
   6109 
   6110     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
   6111   } else if (NumLo == 3 || NumHi == 3) {
   6112     // Otherwise, we must have three elements from one vector, call it X, and
   6113     // one element from the other, call it Y.  First, use a shufps to build an
   6114     // intermediate vector with the one element from Y and the element from X
   6115     // that will be in the same half in the final destination (the indexes don't
   6116     // matter). Then, use a shufps to build the final vector, taking the half
   6117     // containing the element from Y from the intermediate, and the other half
   6118     // from X.
   6119     if (NumHi == 3) {
   6120       // Normalize it so the 3 elements come from V1.
   6121       CommuteVectorShuffleMask(PermMask, 4);
   6122       std::swap(V1, V2);
   6123     }
   6124 
   6125     // Find the element from V2.
   6126     unsigned HiIndex;
   6127     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
   6128       int Val = PermMask[HiIndex];
   6129       if (Val < 0)
   6130         continue;
   6131       if (Val >= 4)
   6132         break;
   6133     }
   6134 
   6135     Mask1[0] = PermMask[HiIndex];
   6136     Mask1[1] = -1;
   6137     Mask1[2] = PermMask[HiIndex^1];
   6138     Mask1[3] = -1;
   6139     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6140 
   6141     if (HiIndex >= 2) {
   6142       Mask1[0] = PermMask[0];
   6143       Mask1[1] = PermMask[1];
   6144       Mask1[2] = HiIndex & 1 ? 6 : 4;
   6145       Mask1[3] = HiIndex & 1 ? 4 : 6;
   6146       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6147     } else {
   6148       Mask1[0] = HiIndex & 1 ? 2 : 0;
   6149       Mask1[1] = HiIndex & 1 ? 0 : 2;
   6150       Mask1[2] = PermMask[2];
   6151       Mask1[3] = PermMask[3];
   6152       if (Mask1[2] >= 0)
   6153         Mask1[2] += 4;
   6154       if (Mask1[3] >= 0)
   6155         Mask1[3] += 4;
   6156       return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
   6157     }
   6158   }
   6159 
   6160   // Break it into (shuffle shuffle_hi, shuffle_lo).
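           // For each half of the result, pack the V1 elements it needs into
           // positions 0-1 and the V2 elements into positions 2-3 of a dedicated
           // shuffle, then combine the two half shuffles with a final shuffle.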
   6161   int LoMask[] = { -1, -1, -1, -1 };
   6162   int HiMask[] = { -1, -1, -1, -1 };
   6163 
   6164   int *MaskPtr = LoMask;
   6165   unsigned MaskIdx = 0;
   6166   unsigned LoIdx = 0;
   6167   unsigned HiIdx = 2;
   6168   for (unsigned i = 0; i != 4; ++i) {
   6169     if (i == 2) {
   6170       MaskPtr = HiMask;
   6171       MaskIdx = 1;
   6172       LoIdx = 0;
   6173       HiIdx = 2;
   6174     }
   6175     int Idx = PermMask[i];
   6176     if (Idx < 0) {
   6177       Locs[i] = std::make_pair(-1, -1);
   6178     } else if (Idx < 4) {
   6179       Locs[i] = std::make_pair(MaskIdx, LoIdx);
   6180       MaskPtr[LoIdx] = Idx;
   6181       LoIdx++;
   6182     } else {
   6183       Locs[i] = std::make_pair(MaskIdx, HiIdx);
   6184       MaskPtr[HiIdx] = Idx;
   6185       HiIdx++;
   6186     }
   6187   }
   6188 
   6189   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
   6190   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
   6191   int MaskOps[] = { -1, -1, -1, -1 };
   6192   for (unsigned i = 0; i != 4; ++i)
   6193     if (Locs[i].first != -1)
   6194       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
   6195   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
   6196 }
   6197 
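         // Return true if V is a load that could be folded as a memory operand of
         // the resulting shuffle instruction, looking through single-use BITCAST,
         // SCALAR_TO_VECTOR and (BUILD_VECTOR x, undef) wrappers.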
   6198 static bool MayFoldVectorLoad(SDValue V) {
   6199   if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
   6200     V = V.getOperand(0);
   6201   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   6202     V = V.getOperand(0);
   6203   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
   6204       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
   6205     // BUILD_VECTOR (load), undef
   6206     V = V.getOperand(0);
   6207   if (MayFoldLoad(V))
   6208     return true;
   6209   return false;
   6210 }
   6211 
   6212 // FIXME: the version above should always be used. Since there's
   6213 // a bug where several vector shuffles can't be folded because the
   6214 // DAG is not updated during lowering and a node claims to have two
   6215 // uses while it only has one, use this version, and let isel match
   6216 // another instruction if the load really happens to have more than
    6217 // one use. Remove this version after this bug gets fixed.
   6218 // rdar://8434668, PR8156
   6219 static bool RelaxedMayFoldVectorLoad(SDValue V) {
   6220   if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
   6221     V = V.getOperand(0);
   6222   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   6223     V = V.getOperand(0);
   6224   if (ISD::isNormalLoad(V.getNode()))
   6225     return true;
   6226   return false;
   6227 }
   6228 
   6229 static
   6230 SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
   6231   EVT VT = Op.getValueType();
   6232 
    6233   // Canonicalize to v2f64.
   6234   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
   6235   return DAG.getNode(ISD::BITCAST, dl, VT,
   6236                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
   6237                                           V1, DAG));
   6238 }
   6239 
   6240 static
   6241 SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
   6242                         bool HasSSE2) {
   6243   SDValue V1 = Op.getOperand(0);
   6244   SDValue V2 = Op.getOperand(1);
   6245   EVT VT = Op.getValueType();
   6246 
   6247   assert(VT != MVT::v2i64 && "unsupported shuffle type");
   6248 
   6249   if (HasSSE2 && VT == MVT::v2f64)
   6250     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
   6251 
    6252   // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
   6253   return DAG.getNode(ISD::BITCAST, dl, VT,
   6254                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
   6255                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
   6256                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
   6257 }
   6258 
   6259 static
   6260 SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
   6261   SDValue V1 = Op.getOperand(0);
   6262   SDValue V2 = Op.getOperand(1);
   6263   EVT VT = Op.getValueType();
   6264 
   6265   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
   6266          "unsupported shuffle type");
   6267 
   6268   if (V2.getOpcode() == ISD::UNDEF)
   6269     V2 = V1;
   6270 
   6271   // v4i32 or v4f32
   6272   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
   6273 }
   6274 
   6275 static
   6276 SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   6277   SDValue V1 = Op.getOperand(0);
   6278   SDValue V2 = Op.getOperand(1);
   6279   EVT VT = Op.getValueType();
   6280   unsigned NumElems = VT.getVectorNumElements();
   6281 
   6282   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
   6283   // operand of these instructions is only memory, so check if there's a
    6284   // potential load folding here, otherwise use SHUFPS or MOVSD to match the
   6285   // same masks.
   6286   bool CanFoldLoad = false;
   6287 
   6288   // Trivial case, when V2 comes from a load.
   6289   if (MayFoldVectorLoad(V2))
   6290     CanFoldLoad = true;
   6291 
   6292   // When V1 is a load, it can be folded later into a store in isel, example:
   6293   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
   6294   //    turns into:
   6295   //  (MOVLPSmr addr:$src1, VR128:$src2)
   6296   // So, recognize this potential and also use MOVLPS or MOVLPD
   6297   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
   6298     CanFoldLoad = true;
   6299 
   6300   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6301   if (CanFoldLoad) {
   6302     if (HasSSE2 && NumElems == 2)
   6303       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
   6304 
   6305     if (NumElems == 4)
    6306       // If we don't care about the second element, proceed to use movss.
   6307       if (SVOp->getMaskElt(1) != -1)
   6308         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
   6309   }
   6310 
    6311   // movl and movlp will both match v2i64, but v2i64 is never matched by
    6312   // movl earlier because we make it strict to avoid messing with the movlp
    6313   // load folding logic (see the code above the getMOVLP call). Match it here
    6314   // instead; this is ugly, but it will stay this way until we move all
    6315   // shuffle matching to x86-specific nodes. Note that for the 1st condition
    6316   // all types are matched with movsd.
   6317   if (HasSSE2) {
   6318     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
   6319     // as to remove this logic from here, as much as possible
   6320     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
   6321       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
   6322     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
   6323   }
   6324 
   6325   assert(VT != MVT::v4i32 && "unsupported shuffle type");
   6326 
   6327   // Invert the operand order and use SHUFPS to match it.
   6328   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
   6329                               getShuffleSHUFImmediate(SVOp), DAG);
   6330 }
   6331 
   6332 SDValue
   6333 X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
   6334   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6335   EVT VT = Op.getValueType();
   6336   DebugLoc dl = Op.getDebugLoc();
   6337   SDValue V1 = Op.getOperand(0);
   6338   SDValue V2 = Op.getOperand(1);
   6339 
   6340   if (isZeroShuffle(SVOp))
   6341     return getZeroVector(VT, Subtarget, DAG, dl);
   6342 
   6343   // Handle splat operations
   6344   if (SVOp->isSplat()) {
   6345     unsigned NumElem = VT.getVectorNumElements();
   6346     int Size = VT.getSizeInBits();
   6347 
   6348     // Use vbroadcast whenever the splat comes from a foldable load
   6349     SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
   6350     if (Broadcast.getNode())
   6351       return Broadcast;
   6352 
   6353     // Handle splats by matching through known shuffle masks
   6354     if ((Size == 128 && NumElem <= 4) ||
   6355         (Size == 256 && NumElem < 8))
   6356       return SDValue();
   6357 
    6358     // All remaining splats are promoted to target-supported vector shuffles.
   6359     return PromoteSplat(SVOp, DAG);
   6360   }
   6361 
   6362   // If the shuffle can be profitably rewritten as a narrower shuffle, then
   6363   // do it!
   6364   if (VT == MVT::v8i16 || VT == MVT::v16i8) {
   6365     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
   6366     if (NewOp.getNode())
   6367       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
   6368   } else if ((VT == MVT::v4i32 ||
   6369              (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
   6370     // FIXME: Figure out a cleaner way to do this.
   6371     // Try to make use of movq to zero out the top part.
   6372     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
   6373       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
   6374       if (NewOp.getNode()) {
   6375         EVT NewVT = NewOp.getValueType();
   6376         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
   6377                                NewVT, true, false))
   6378           return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
   6379                               DAG, Subtarget, dl);
   6380       }
   6381     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
   6382       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
   6383       if (NewOp.getNode()) {
   6384         EVT NewVT = NewOp.getValueType();
   6385         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
   6386           return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
   6387                               DAG, Subtarget, dl);
   6388       }
   6389     }
   6390   }
   6391   return SDValue();
   6392 }
   6393 
   6394 SDValue
   6395 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   6396   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6397   SDValue V1 = Op.getOperand(0);
   6398   SDValue V2 = Op.getOperand(1);
   6399   EVT VT = Op.getValueType();
   6400   DebugLoc dl = Op.getDebugLoc();
   6401   unsigned NumElems = VT.getVectorNumElements();
   6402   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   6403   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   6404   bool V1IsSplat = false;
   6405   bool V2IsSplat = false;
   6406   bool HasSSE2 = Subtarget->hasSSE2();
   6407   bool HasAVX    = Subtarget->hasAVX();
   6408   bool HasAVX2   = Subtarget->hasAVX2();
   6409   MachineFunction &MF = DAG.getMachineFunction();
   6410   bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
   6411 
   6412   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
   6413 
   6414   if (V1IsUndef && V2IsUndef)
   6415     return DAG.getUNDEF(VT);
   6416 
   6417   assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
   6418 
   6419   // Vector shuffle lowering takes 3 steps:
   6420   //
   6421   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
   6422   //    narrowing and commutation of operands should be handled.
   6423   // 2) Matching of shuffles with known shuffle masks to x86 target specific
   6424   //    shuffle nodes.
   6425   // 3) Rewriting of unmatched masks into new generic shuffle operations,
   6426   //    so the shuffle can be broken into other shuffles and the legalizer can
   6427   //    try the lowering again.
   6428   //
   6429   // The general idea is that no vector_shuffle operation should be left to
   6430   // be matched during isel, all of them must be converted to a target specific
   6431   // node here.
   6432 
   6433   // Normalize the input vectors. Here splats, zeroed vectors, profitable
   6434   // narrowing and commutation of operands should be handled. The actual code
   6435   // doesn't include all of those, work in progress...
   6436   SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
   6437   if (NewOp.getNode())
   6438     return NewOp;
   6439 
   6440   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
   6441 
   6442   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
   6443   // unpckh_undef). Only use pshufd if speed is more important than size.
   6444   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
   6445     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   6446   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
   6447     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   6448 
   6449   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
   6450       V2IsUndef && RelaxedMayFoldVectorLoad(V1))
   6451     return getMOVDDup(Op, dl, V1, DAG);
   6452 
   6453   if (isMOVHLPS_v_undef_Mask(M, VT))
   6454     return getMOVHighToLow(Op, dl, DAG);
   6455 
    6456   // Used to match splats.
   6457   if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef &&
   6458       (VT == MVT::v2f64 || VT == MVT::v2i64))
   6459     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   6460 
   6461   if (isPSHUFDMask(M, VT)) {
    6462     // The actual implementation will match the mask in the if above, and then
    6463     // during isel it can match several different instructions, not only pshufd
    6464     // as its name says. Sad but true; emulate the behavior for now...
   6465     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
   6466       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
   6467 
   6468     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
   6469 
   6470     if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
   6471       return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
   6472 
   6473     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
   6474       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
   6475 
   6476     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
   6477                                 TargetMask, DAG);
   6478   }
   6479 
   6480   // Check if this can be converted into a logical shift.
   6481   bool isLeft = false;
   6482   unsigned ShAmt = 0;
   6483   SDValue ShVal;
   6484   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
   6485   if (isShift && ShVal.hasOneUse()) {
   6486     // If the shifted value has multiple uses, it may be cheaper to use
   6487     // v_set0 + movlhps or movhlps, etc.
   6488     EVT EltVT = VT.getVectorElementType();
   6489     ShAmt *= EltVT.getSizeInBits();
   6490     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
   6491   }
   6492 
   6493   if (isMOVLMask(M, VT)) {
   6494     if (ISD::isBuildVectorAllZeros(V1.getNode()))
   6495       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
   6496     if (!isMOVLPMask(M, VT)) {
   6497       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
   6498         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
   6499 
   6500       if (VT == MVT::v4i32 || VT == MVT::v4f32)
   6501         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
   6502     }
   6503   }
   6504 
   6505   // FIXME: fold these into legal mask.
   6506   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2))
   6507     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
   6508 
   6509   if (isMOVHLPSMask(M, VT))
   6510     return getMOVHighToLow(Op, dl, DAG);
   6511 
   6512   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
   6513     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
   6514 
   6515   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
   6516     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
   6517 
   6518   if (isMOVLPMask(M, VT))
   6519     return getMOVLP(Op, dl, DAG, HasSSE2);
   6520 
   6521   if (ShouldXformToMOVHLPS(M, VT) ||
   6522       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
   6523     return CommuteVectorShuffle(SVOp, DAG);
   6524 
   6525   if (isShift) {
   6526     // No better options. Use a vshldq / vsrldq.
   6527     EVT EltVT = VT.getVectorElementType();
   6528     ShAmt *= EltVT.getSizeInBits();
   6529     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
   6530   }
   6531 
   6532   bool Commuted = false;
   6533   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
   6534   // 1,1,1,1 -> v8i16 though.
   6535   V1IsSplat = isSplatVector(V1.getNode());
   6536   V2IsSplat = isSplatVector(V2.getNode());
   6537 
   6538   // Canonicalize the splat or undef, if present, to be on the RHS.
   6539   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
   6540     CommuteVectorShuffleMask(M, NumElems);
   6541     std::swap(V1, V2);
   6542     std::swap(V1IsSplat, V2IsSplat);
   6543     Commuted = true;
   6544   }
   6545 
   6546   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
    6547     // Shuffling the low element of V1 into undef; just return V1.
   6548     if (V2IsUndef)
   6549       return V1;
   6550     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
   6551     // the instruction selector will not match, so get a canonical MOVL with
   6552     // swapped operands to undo the commute.
   6553     return getMOVL(DAG, dl, VT, V2, V1);
   6554   }
   6555 
   6556   if (isUNPCKLMask(M, VT, HasAVX2))
   6557     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6558 
   6559   if (isUNPCKHMask(M, VT, HasAVX2))
   6560     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6561 
   6562   if (V2IsSplat) {
    6563     // Normalize the mask so all entries that point to V2 point to its first
    6564     // element, then try to match unpck{h|l} again. If it matches, return a
    6565     // new vector_shuffle with the corrected mask.
   6566     SmallVector<int, 8> NewMask(M.begin(), M.end());
   6567     NormalizeMask(NewMask, NumElems);
   6568     if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
   6569       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6570     } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
   6571       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6572     }
   6573   }
   6574 
   6575   if (Commuted) {
    6576     // Commute it back and try unpck* again.
   6577     // FIXME: this seems wrong.
   6578     CommuteVectorShuffleMask(M, NumElems);
   6579     std::swap(V1, V2);
   6580     std::swap(V1IsSplat, V2IsSplat);
   6581     Commuted = false;
   6582 
   6583     if (isUNPCKLMask(M, VT, HasAVX2))
   6584       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6585 
   6586     if (isUNPCKHMask(M, VT, HasAVX2))
   6587       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6588   }
   6589 
   6590   // Normalize the node to match x86 shuffle ops if needed
   6591   if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
   6592     return CommuteVectorShuffle(SVOp, DAG);
   6593 
   6594   // The checks below are all present in isShuffleMaskLegal, but they are
   6595   // inlined here right now to enable us to directly emit target specific
   6596   // nodes, and remove one by one until they don't return Op anymore.
   6597 
   6598   if (isPALIGNRMask(M, VT, Subtarget))
   6599     return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
   6600                                 getShufflePALIGNRImmediate(SVOp),
   6601                                 DAG);
   6602 
   6603   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
   6604       SVOp->getSplatIndex() == 0 && V2IsUndef) {
   6605     if (VT == MVT::v2f64 || VT == MVT::v2i64)
   6606       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   6607   }
   6608 
   6609   if (isPSHUFHWMask(M, VT))
   6610     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
   6611                                 getShufflePSHUFHWImmediate(SVOp),
   6612                                 DAG);
   6613 
   6614   if (isPSHUFLWMask(M, VT))
   6615     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
   6616                                 getShufflePSHUFLWImmediate(SVOp),
   6617                                 DAG);
   6618 
   6619   if (isSHUFPMask(M, VT, HasAVX))
   6620     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
   6621                                 getShuffleSHUFImmediate(SVOp), DAG);
   6622 
   6623   if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
   6624     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   6625   if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
   6626     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   6627 
   6628   //===--------------------------------------------------------------------===//
   6629   // Generate target specific nodes for 128 or 256-bit shuffles only
   6630   // supported in the AVX instruction set.
   6631   //
   6632 
   6633   // Handle VMOVDDUPY permutations
   6634   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX))
   6635     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
   6636 
   6637   // Handle VPERMILPS/D* permutations
   6638   if (isVPERMILPMask(M, VT, HasAVX)) {
   6639     if (HasAVX2 && VT == MVT::v8i32)
   6640       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
   6641                                   getShuffleSHUFImmediate(SVOp), DAG);
   6642     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
   6643                                 getShuffleSHUFImmediate(SVOp), DAG);
   6644   }
   6645 
   6646   // Handle VPERM2F128/VPERM2I128 permutations
   6647   if (isVPERM2X128Mask(M, VT, HasAVX))
   6648     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
   6649                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
   6650 
   6651   SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG);
   6652   if (BlendOp.getNode())
   6653     return BlendOp;
   6654 
   6655   if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
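             // Build the v8i32 constant index vector VPERMD/VPERMPS expects; undef
             // mask entries are mapped to index 0.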
   6656     SmallVector<SDValue, 8> permclMask;
   6657     for (unsigned i = 0; i != 8; ++i) {
   6658       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
   6659     }
   6660     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
   6661                                &permclMask[0], 8);
   6662     // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
   6663     return DAG.getNode(X86ISD::VPERMV, dl, VT,
   6664                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
   6665   }
   6666 
   6667   if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64))
   6668     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
   6669                                 getShuffleCLImmediate(SVOp), DAG);
   6670 
   6671 
   6672   //===--------------------------------------------------------------------===//
   6673   // Since no target specific shuffle was selected for this generic one,
   6674   // lower it into other known shuffles. FIXME: this isn't true yet, but
   6675   // this is the plan.
   6676   //
   6677 
   6678   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
   6679   if (VT == MVT::v8i16) {
   6680     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
   6681     if (NewOp.getNode())
   6682       return NewOp;
   6683   }
   6684 
   6685   if (VT == MVT::v16i8) {
   6686     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
   6687     if (NewOp.getNode())
   6688       return NewOp;
   6689   }
   6690 
   6691   // Handle all 128-bit wide vectors with 4 elements, and match them with
   6692   // several different shuffle types.
   6693   if (NumElems == 4 && VT.getSizeInBits() == 128)
   6694     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
   6695 
   6696   // Handle general 256-bit shuffles
   6697   if (VT.is256BitVector())
   6698     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
   6699 
   6700   return SDValue();
   6701 }
   6702 
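         // With SSE4.1, i8 and i16 elements are extracted directly with
         // PEXTRB/PEXTRW (zero-extended into a 32-bit GPR and then truncated),
         // f32 is shuffled so it can match EXTRACTPS when profitable, and i32/i64
         // extracts with a constant index are left as-is for isel to match.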
   6703 SDValue
   6704 X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
   6705                                                 SelectionDAG &DAG) const {
   6706   EVT VT = Op.getValueType();
   6707   DebugLoc dl = Op.getDebugLoc();
   6708 
   6709   if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
   6710     return SDValue();
   6711 
   6712   if (VT.getSizeInBits() == 8) {
   6713     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
   6714                                     Op.getOperand(0), Op.getOperand(1));
   6715     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   6716                                     DAG.getValueType(VT));
   6717     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   6718   } else if (VT.getSizeInBits() == 16) {
   6719     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   6720     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
   6721     if (Idx == 0)
   6722       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   6723                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   6724                                      DAG.getNode(ISD::BITCAST, dl,
   6725                                                  MVT::v4i32,
   6726                                                  Op.getOperand(0)),
   6727                                      Op.getOperand(1)));
   6728     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
   6729                                     Op.getOperand(0), Op.getOperand(1));
   6730     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   6731                                     DAG.getValueType(VT));
   6732     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   6733   } else if (VT == MVT::f32) {
   6734     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
   6735     // the result back to FR32 register. It's only worth matching if the
   6736     // result has a single use which is a store or a bitcast to i32.  And in
   6737     // the case of a store, it's not worth it if the index is a constant 0,
   6738     // because a MOVSSmr can be used instead, which is smaller and faster.
   6739     if (!Op.hasOneUse())
   6740       return SDValue();
   6741     SDNode *User = *Op.getNode()->use_begin();
   6742     if ((User->getOpcode() != ISD::STORE ||
   6743          (isa<ConstantSDNode>(Op.getOperand(1)) &&
   6744           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
   6745         (User->getOpcode() != ISD::BITCAST ||
   6746          User->getValueType(0) != MVT::i32))
   6747       return SDValue();
   6748     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   6749                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
   6750                                               Op.getOperand(0)),
   6751                                               Op.getOperand(1));
   6752     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
   6753   } else if (VT == MVT::i32 || VT == MVT::i64) {
   6754     // ExtractPS/pextrq works with constant index.
   6755     if (isa<ConstantSDNode>(Op.getOperand(1)))
   6756       return Op;
   6757   }
   6758   return SDValue();
   6759 }
   6760 
   6761 
   6762 SDValue
   6763 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   6764                                            SelectionDAG &DAG) const {
   6765   if (!isa<ConstantSDNode>(Op.getOperand(1)))
   6766     return SDValue();
   6767 
   6768   SDValue Vec = Op.getOperand(0);
   6769   EVT VecVT = Vec.getValueType();
   6770 
   6771   // If this is a 256-bit vector result, first extract the 128-bit vector and
   6772   // then extract the element from the 128-bit vector.
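           // E.g. extracting element 5 of a v8f32 extracts the upper 128-bit half
           // and then takes element 5 - 4 = 1 from that v4f32 half.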
   6773   if (VecVT.getSizeInBits() == 256) {
   6774     DebugLoc dl = Op.getNode()->getDebugLoc();
   6775     unsigned NumElems = VecVT.getVectorNumElements();
   6776     SDValue Idx = Op.getOperand(1);
   6777     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   6778 
   6779     // Get the 128-bit vector.
   6780     bool Upper = IdxVal >= NumElems/2;
   6781     Vec = Extract128BitVector(Vec,
   6782                     DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
   6783 
   6784     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
   6785                     Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
   6786   }
   6787 
   6788   assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
   6789 
   6790   if (Subtarget->hasSSE41()) {
   6791     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
   6792     if (Res.getNode())
   6793       return Res;
   6794   }
   6795 
   6796   EVT VT = Op.getValueType();
   6797   DebugLoc dl = Op.getDebugLoc();
   6798   // TODO: handle v16i8.
   6799   if (VT.getSizeInBits() == 16) {
   6800     SDValue Vec = Op.getOperand(0);
   6801     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   6802     if (Idx == 0)
   6803       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   6804                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   6805                                      DAG.getNode(ISD::BITCAST, dl,
   6806                                                  MVT::v4i32, Vec),
   6807                                      Op.getOperand(1)));
    6808     // Transform it so it matches pextrw, which produces a 32-bit result.
   6809     EVT EltVT = MVT::i32;
   6810     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
   6811                                     Op.getOperand(0), Op.getOperand(1));
   6812     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
   6813                                     DAG.getValueType(VT));
   6814     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   6815   } else if (VT.getSizeInBits() == 32) {
   6816     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   6817     if (Idx == 0)
   6818       return Op;
   6819 
   6820     // SHUFPS the element to the lowest double word, then movss.
   6821     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
   6822     EVT VVT = Op.getOperand(0).getValueType();
   6823     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   6824                                        DAG.getUNDEF(VVT), Mask);
   6825     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   6826                        DAG.getIntPtrConstant(0));
   6827   } else if (VT.getSizeInBits() == 64) {
   6828     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   6829     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   6830     //        to match extract_elt for f64.
   6831     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   6832     if (Idx == 0)
   6833       return Op;
   6834 
   6835     // UNPCKHPD the element to the lowest double word, then movsd.
   6836     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
   6837     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
   6838     int Mask[2] = { 1, -1 };
   6839     EVT VVT = Op.getOperand(0).getValueType();
   6840     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   6841                                        DAG.getUNDEF(VVT), Mask);
   6842     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   6843                        DAG.getIntPtrConstant(0));
   6844   }
   6845 
   6846   return SDValue();
   6847 }
   6848 
   6849 SDValue
   6850 X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
   6851                                                SelectionDAG &DAG) const {
   6852   EVT VT = Op.getValueType();
   6853   EVT EltVT = VT.getVectorElementType();
   6854   DebugLoc dl = Op.getDebugLoc();
   6855 
   6856   SDValue N0 = Op.getOperand(0);
   6857   SDValue N1 = Op.getOperand(1);
   6858   SDValue N2 = Op.getOperand(2);
   6859 
   6860   if (VT.getSizeInBits() == 256)
   6861     return SDValue();
   6862 
   6863   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
   6864       isa<ConstantSDNode>(N2)) {
   6865     unsigned Opc;
   6866     if (VT == MVT::v8i16)
   6867       Opc = X86ISD::PINSRW;
   6868     else if (VT == MVT::v16i8)
   6869       Opc = X86ISD::PINSRB;
   6870     else
   6871       Opc = X86ISD::PINSRB;
   6872 
    6873     // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
   6874     // argument.
   6875     if (N1.getValueType() != MVT::i32)
   6876       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   6877     if (N2.getValueType() != MVT::i32)
   6878       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
   6879     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
   6880   } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
   6881     // Bits [7:6] of the constant are the source select.  This will always be
   6882     //  zero here.  The DAG Combiner may combine an extract_elt index into these
   6883     //  bits.  For example (insert (extract, 3), 2) could be matched by putting
   6884     //  the '3' into bits [7:6] of X86ISD::INSERTPS.
   6885     // Bits [5:4] of the constant are the destination select.  This is the
   6886     //  value of the incoming immediate.
   6887     // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
   6888     //   combine either bitwise AND or insert of float 0.0 to set these bits.
   6889     N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    6890     // Create this as a scalar to vector.
   6891     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   6892     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
   6893   } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) &&
   6894              isa<ConstantSDNode>(N2)) {
   6895     // PINSR* works with constant index.
   6896     return Op;
   6897   }
   6898   return SDValue();
   6899 }
   6900 
   6901 SDValue
   6902 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
   6903   EVT VT = Op.getValueType();
   6904   EVT EltVT = VT.getVectorElementType();
   6905 
   6906   DebugLoc dl = Op.getDebugLoc();
   6907   SDValue N0 = Op.getOperand(0);
   6908   SDValue N1 = Op.getOperand(1);
   6909   SDValue N2 = Op.getOperand(2);
   6910 
   6911   // If this is a 256-bit vector result, first extract the 128-bit vector,
   6912   // insert the element into the extracted half and then place it back.
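           // E.g. inserting at index 5 of a v8f32 extracts the upper v4f32 half,
           // inserts at index 1 there, and writes the updated half back into the
           // 256-bit vector.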
   6913   if (VT.getSizeInBits() == 256) {
   6914     if (!isa<ConstantSDNode>(N2))
   6915       return SDValue();
   6916 
   6917     // Get the desired 128-bit vector half.
   6918     unsigned NumElems = VT.getVectorNumElements();
   6919     unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
   6920     bool Upper = IdxVal >= NumElems/2;
   6921     SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
   6922     SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
   6923 
   6924     // Insert the element into the desired half.
   6925     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
   6926                  N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
   6927 
   6928     // Insert the changed part back to the 256-bit vector
   6929     return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
   6930   }
   6931 
   6932   if (Subtarget->hasSSE41())
   6933     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
   6934 
   6935   if (EltVT == MVT::i8)
   6936     return SDValue();
   6937 
   6938   if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    6939     // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
   6940     // as its second argument.
   6941     if (N1.getValueType() != MVT::i32)
   6942       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   6943     if (N2.getValueType() != MVT::i32)
   6944       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
   6945     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   6946   }
   6947   return SDValue();
   6948 }
   6949 
   6950 SDValue
   6951 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   6952   LLVMContext *Context = DAG.getContext();
   6953   DebugLoc dl = Op.getDebugLoc();
   6954   EVT OpVT = Op.getValueType();
   6955 
   6956   // If this is a 256-bit vector result, first insert into a 128-bit
   6957   // vector and then insert into the 256-bit vector.
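           // E.g. (v8i32 scalar_to_vector x) becomes (v4i32 scalar_to_vector x)
           // inserted at index 0 of an undef v8i32.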
   6958   if (OpVT.getSizeInBits() > 128) {
   6959     // Insert into a 128-bit vector.
   6960     EVT VT128 = EVT::getVectorVT(*Context,
   6961                                  OpVT.getVectorElementType(),
   6962                                  OpVT.getVectorNumElements() / 2);
   6963 
   6964     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
   6965 
   6966     // Insert the 128-bit vector.
   6967     return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
   6968                               DAG.getConstant(0, MVT::i32),
   6969                               DAG, dl);
   6970   }
   6971 
   6972   if (Op.getValueType() == MVT::v1i64 &&
   6973       Op.getOperand(0).getValueType() == MVT::i64)
   6974     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
   6975 
   6976   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   6977   assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
   6978          "Expected an SSE type!");
   6979   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
   6980                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
   6981 }
   6982 
   6983 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
   6984 // a simple subregister reference or explicit instructions to grab
   6985 // upper bits of a vector.
   6986 SDValue
   6987 X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
   6988   if (Subtarget->hasAVX()) {
   6989     DebugLoc dl = Op.getNode()->getDebugLoc();
   6990     SDValue Vec = Op.getNode()->getOperand(0);
   6991     SDValue Idx = Op.getNode()->getOperand(1);
   6992 
   6993     if (Op.getNode()->getValueType(0).getSizeInBits() == 128
   6994         && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
   6995         return Extract128BitVector(Vec, Idx, DAG, dl);
   6996     }
   6997   }
   6998   return SDValue();
   6999 }
   7000 
   7001 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
   7002 // simple superregister reference or explicit instructions to insert
   7003 // the upper bits of a vector.
   7004 SDValue
   7005 X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
   7006   if (Subtarget->hasAVX()) {
   7007     DebugLoc dl = Op.getNode()->getDebugLoc();
   7008     SDValue Vec = Op.getNode()->getOperand(0);
   7009     SDValue SubVec = Op.getNode()->getOperand(1);
   7010     SDValue Idx = Op.getNode()->getOperand(2);
   7011 
   7012     if (Op.getNode()->getValueType(0).getSizeInBits() == 256
   7013         && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
   7014       return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
   7015     }
   7016   }
   7017   return SDValue();
   7018 }
   7019 
   7020 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
    7021 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
   7022 // one of the above mentioned nodes. It has to be wrapped because otherwise
   7023 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
    7024 // be used to form an addressing mode. These wrapped nodes will be selected
   7025 // into MOV32ri.
   7026 SDValue
   7027 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   7028   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   7029 
   7030   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7031   // global base reg.
   7032   unsigned char OpFlag = 0;
   7033   unsigned WrapperKind = X86ISD::Wrapper;
   7034   CodeModel::Model M = getTargetMachine().getCodeModel();
   7035 
   7036   if (Subtarget->isPICStyleRIPRel() &&
   7037       (M == CodeModel::Small || M == CodeModel::Kernel))
   7038     WrapperKind = X86ISD::WrapperRIP;
   7039   else if (Subtarget->isPICStyleGOT())
   7040     OpFlag = X86II::MO_GOTOFF;
   7041   else if (Subtarget->isPICStyleStubPIC())
   7042     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   7043 
   7044   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
   7045                                              CP->getAlignment(),
   7046                                              CP->getOffset(), OpFlag);
   7047   DebugLoc DL = CP->getDebugLoc();
   7048   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7049   // With PIC, the address is actually $g + Offset.
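           // ($g here is the value of the global base register, i.e. the PIC base.)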
   7050   if (OpFlag) {
   7051     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7052                          DAG.getNode(X86ISD::GlobalBaseReg,
   7053                                      DebugLoc(), getPointerTy()),
   7054                          Result);
   7055   }
   7056 
   7057   return Result;
   7058 }
   7059 
   7060 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   7061   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   7062 
   7063   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7064   // global base reg.
   7065   unsigned char OpFlag = 0;
   7066   unsigned WrapperKind = X86ISD::Wrapper;
   7067   CodeModel::Model M = getTargetMachine().getCodeModel();
   7068 
   7069   if (Subtarget->isPICStyleRIPRel() &&
   7070       (M == CodeModel::Small || M == CodeModel::Kernel))
   7071     WrapperKind = X86ISD::WrapperRIP;
   7072   else if (Subtarget->isPICStyleGOT())
   7073     OpFlag = X86II::MO_GOTOFF;
   7074   else if (Subtarget->isPICStyleStubPIC())
   7075     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   7076 
   7077   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
   7078                                           OpFlag);
   7079   DebugLoc DL = JT->getDebugLoc();
   7080   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7081 
   7082   // With PIC, the address is actually $g + Offset.
   7083   if (OpFlag)
   7084     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7085                          DAG.getNode(X86ISD::GlobalBaseReg,
   7086                                      DebugLoc(), getPointerTy()),
   7087                          Result);
   7088 
   7089   return Result;
   7090 }
   7091 
   7092 SDValue
   7093 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   7094   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   7095 
   7096   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7097   // global base reg.
   7098   unsigned char OpFlag = 0;
   7099   unsigned WrapperKind = X86ISD::Wrapper;
   7100   CodeModel::Model M = getTargetMachine().getCodeModel();
   7101 
   7102   if (Subtarget->isPICStyleRIPRel() &&
   7103       (M == CodeModel::Small || M == CodeModel::Kernel)) {
   7104     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
   7105       OpFlag = X86II::MO_GOTPCREL;
   7106     WrapperKind = X86ISD::WrapperRIP;
   7107   } else if (Subtarget->isPICStyleGOT()) {
   7108     OpFlag = X86II::MO_GOT;
   7109   } else if (Subtarget->isPICStyleStubPIC()) {
   7110     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
   7111   } else if (Subtarget->isPICStyleStubNoDynamic()) {
   7112     OpFlag = X86II::MO_DARWIN_NONLAZY;
   7113   }
   7114 
   7115   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
   7116 
   7117   DebugLoc DL = Op.getDebugLoc();
   7118   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7119 
   7120 
   7121   // With PIC, the address is actually $g + Offset.
   7122   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   7123       !Subtarget->is64Bit()) {
   7124     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7125                          DAG.getNode(X86ISD::GlobalBaseReg,
   7126                                      DebugLoc(), getPointerTy()),
   7127                          Result);
   7128   }
   7129 
   7130   // For symbols that require a load from a stub to get the address, emit the
   7131   // load.
   7132   if (isGlobalStubReference(OpFlag))
   7133     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
   7134                          MachinePointerInfo::getGOT(), false, false, false, 0);
   7135 
   7136   return Result;
   7137 }
   7138 
   7139 SDValue
   7140 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   7141   // Create the TargetBlockAddressAddress node.
   7142   unsigned char OpFlags =
   7143     Subtarget->ClassifyBlockAddressReference();
   7144   CodeModel::Model M = getTargetMachine().getCodeModel();
   7145   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   7146   DebugLoc dl = Op.getDebugLoc();
   7147   SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
   7148                                        /*isTarget=*/true, OpFlags);
   7149 
   7150   if (Subtarget->isPICStyleRIPRel() &&
   7151       (M == CodeModel::Small || M == CodeModel::Kernel))
   7152     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   7153   else
   7154     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   7155 
   7156   // With PIC, the address is actually $g + Offset.
   7157   if (isGlobalRelativeToPICBase(OpFlags)) {
   7158     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   7159                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   7160                          Result);
   7161   }
   7162 
   7163   return Result;
   7164 }
   7165 
   7166 SDValue
   7167 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
   7168                                       int64_t Offset,
   7169                                       SelectionDAG &DAG) const {
   7170   // Create the TargetGlobalAddress node, folding in the constant
   7171   // offset if it is legal.
   7172   unsigned char OpFlags =
   7173     Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
   7174   CodeModel::Model M = getTargetMachine().getCodeModel();
   7175   SDValue Result;
   7176   if (OpFlags == X86II::MO_NO_FLAG &&
   7177       X86::isOffsetSuitableForCodeModel(Offset, M)) {
   7178     // A direct static reference to a global.
   7179     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
   7180     Offset = 0;
   7181   } else {
   7182     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
   7183   }
   7184 
   7185   if (Subtarget->isPICStyleRIPRel() &&
   7186       (M == CodeModel::Small || M == CodeModel::Kernel))
   7187     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   7188   else
   7189     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   7190 
   7191   // With PIC, the address is actually $g + Offset.
   7192   if (isGlobalRelativeToPICBase(OpFlags)) {
   7193     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   7194                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   7195                          Result);
   7196   }
   7197 
   7198   // For globals that require a load from a stub to get the address, emit the
   7199   // load.
   7200   if (isGlobalStubReference(OpFlags))
   7201     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
   7202                          MachinePointerInfo::getGOT(), false, false, false, 0);
   7203 
   7204   // If there was a non-zero offset that we didn't fold, create an explicit
   7205   // addition for it.
   7206   if (Offset != 0)
   7207     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
   7208                          DAG.getConstant(Offset, getPointerTy()));
   7209 
   7210   return Result;
   7211 }
   7212 
   7213 SDValue
   7214 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   7215   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   7216   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
   7217   return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
   7218 }
   7219 
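         // Emit an X86ISD::TLSADDR node (later codegen'ed as a call to the TLS
         // resolver) for the given global, and return the address copied out of
         // ReturnReg (EAX or RAX in the general dynamic models below).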
   7220 static SDValue
   7221 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   7222            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
   7223            unsigned char OperandFlags) {
   7224   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   7225   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   7226   DebugLoc dl = GA->getDebugLoc();
   7227   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   7228                                            GA->getValueType(0),
   7229                                            GA->getOffset(),
   7230                                            OperandFlags);
   7231   if (InFlag) {
   7232     SDValue Ops[] = { Chain,  TGA, *InFlag };
   7233     Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
   7234   } else {
   7235     SDValue Ops[]  = { Chain, TGA };
   7236     Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
   7237   }
   7238 
    7239   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
   7240   MFI->setAdjustsStack(true);
   7241 
   7242   SDValue Flag = Chain.getValue(1);
   7243   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
   7244 }
   7245 
   7246 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
   7247 static SDValue
   7248 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   7249                                 const EVT PtrVT) {
   7250   SDValue InFlag;
    7251   DebugLoc dl = GA->getDebugLoc();  // FIXME: the function entry point might be a better DebugLoc here
   7252   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   7253                                      DAG.getNode(X86ISD::GlobalBaseReg,
   7254                                                  DebugLoc(), PtrVT), InFlag);
   7255   InFlag = Chain.getValue(1);
   7256 
   7257   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
   7258 }
   7259 
   7260 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
   7261 static SDValue
   7262 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   7263                                 const EVT PtrVT) {
   7264   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
   7265                     X86::RAX, X86II::MO_TLSGD);
   7266 }
   7267 
   7268 // Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
   7269 // "local exec" model.
   7270 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   7271                                    const EVT PtrVT, TLSModel::Model model,
   7272                                    bool is64Bit) {
   7273   DebugLoc dl = GA->getDebugLoc();
   7274 
   7275   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
   7276   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
   7277                                                          is64Bit ? 257 : 256));
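           // Address space 256 selects a %gs segment override and 257 selects %fs in
           // the X86 backend, so the load below reads the thread pointer from %gs:0
           // on 32-bit targets and %fs:0 on 64-bit targets.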
   7278 
   7279   SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   7280                                       DAG.getIntPtrConstant(0),
   7281                                       MachinePointerInfo(Ptr),
   7282                                       false, false, false, 0);
   7283 
   7284   unsigned char OperandFlags = 0;
   7285   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
   7286   // initialexec.
   7287   unsigned WrapperKind = X86ISD::Wrapper;
   7288   if (model == TLSModel::LocalExec) {
   7289     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
   7290   } else if (is64Bit) {
   7291     assert(model == TLSModel::InitialExec);
   7292     OperandFlags = X86II::MO_GOTTPOFF;
   7293     WrapperKind = X86ISD::WrapperRIP;
   7294   } else {
   7295     assert(model == TLSModel::InitialExec);
   7296     OperandFlags = X86II::MO_INDNTPOFF;
   7297   }
   7298 
   7299   // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
   7300   // exec)
   7301   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   7302                                            GA->getValueType(0),
   7303                                            GA->getOffset(), OperandFlags);
   7304   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   7305 
   7306   if (model == TLSModel::InitialExec)
   7307     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
   7308                          MachinePointerInfo::getGOT(), false, false, false, 0);
   7309 
    7310   // The address of the thread-local variable is the thread pointer plus
    7311   // the offset of the variable.
   7312   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
   7313 }
   7314 
   7315 SDValue
   7316 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   7317 
   7318   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   7319   const GlobalValue *GV = GA->getGlobal();
   7320 
   7321   if (Subtarget->isTargetELF()) {
   7322     // TODO: implement the "local dynamic" model
    7323     // TODO: implement the "initial exec" model for PIC executables
   7324 
   7325     // If GV is an alias then use the aliasee for determining
   7326     // thread-localness.
   7327     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
   7328       GV = GA->resolveAliasedGlobal(false);
   7329 
   7330     TLSModel::Model model = getTargetMachine().getTLSModel(GV);
   7331 
   7332     switch (model) {
   7333       case TLSModel::GeneralDynamic:
   7334       case TLSModel::LocalDynamic: // not implemented
   7335         if (Subtarget->is64Bit())
   7336           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
   7337         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
   7338 
   7339       case TLSModel::InitialExec:
   7340       case TLSModel::LocalExec:
   7341         return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
   7342                                    Subtarget->is64Bit());
   7343     }
   7344   } else if (Subtarget->isTargetDarwin()) {
   7345     // Darwin only has one model of TLS.  Lower to that.
   7346     unsigned char OpFlag = 0;
   7347     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
   7348                            X86ISD::WrapperRIP : X86ISD::Wrapper;
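             // Darwin reaches a __thread variable through a per-variable descriptor
             // (the @TLVP reference below): the descriptor's first word holds a
             // function pointer (normally tlv_get_addr) that we call, modeled here by
             // X86ISD::TLSCALL, and the variable's address comes back in EAX/RAX.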
   7349 
   7350     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7351     // global base reg.
   7352     bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
   7353                   !Subtarget->is64Bit();
   7354     if (PIC32)
   7355       OpFlag = X86II::MO_TLVP_PIC_BASE;
   7356     else
   7357       OpFlag = X86II::MO_TLVP;
   7358     DebugLoc DL = Op.getDebugLoc();
   7359     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
   7360                                                 GA->getValueType(0),
   7361                                                 GA->getOffset(), OpFlag);
   7362     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7363 
   7364     // With PIC32, the address is actually $g + Offset.
   7365     if (PIC32)
   7366       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7367                            DAG.getNode(X86ISD::GlobalBaseReg,
   7368                                        DebugLoc(), getPointerTy()),
   7369                            Offset);
   7370 
    7371     // Lowering the TLSCALL machine node will make sure everything is in the
    7372     // right location.
   7373     SDValue Chain = DAG.getEntryNode();
   7374     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   7375     SDValue Args[] = { Chain, Offset };
   7376     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
   7377 
   7378     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
   7379     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   7380     MFI->setAdjustsStack(true);
   7381 
   7382     // And our return value (tls address) is in the standard call return value
   7383     // location.
   7384     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   7385     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
   7386                               Chain.getValue(1));
   7387   } else if (Subtarget->isTargetWindows()) {
   7388     // Just use the implicit TLS architecture
    7389     // Need to generate something similar to:
   7390     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
   7391     //                                  ; from TEB
    7392     //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
   7393     //   mov     rcx, qword [rdx+rcx*8]
   7394     //   mov     eax, .tls$:tlsvar
   7395     //   [rax+rcx] contains the address
   7396     // Windows 64bit: gs:0x58
   7397     // Windows 32bit: fs:__tls_array
   7398 
   7399     // If GV is an alias then use the aliasee for determining
   7400     // thread-localness.
   7401     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
   7402       GV = GA->resolveAliasedGlobal(false);
   7403     DebugLoc dl = GA->getDebugLoc();
   7404     SDValue Chain = DAG.getEntryNode();
   7405 
   7406     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
   7407     // %gs:0x58 (64-bit).
   7408     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
   7409                                         ? Type::getInt8PtrTy(*DAG.getContext(),
   7410                                                              256)
   7411                                         : Type::getInt32PtrTy(*DAG.getContext(),
   7412                                                               257));
   7413 
   7414     SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
   7415                                         Subtarget->is64Bit()
   7416                                         ? DAG.getIntPtrConstant(0x58)
   7417                                         : DAG.getExternalSymbol("_tls_array",
   7418                                                                 getPointerTy()),
   7419                                         MachinePointerInfo(Ptr),
   7420                                         false, false, false, 0);
   7421 
   7422     // Load the _tls_index variable
   7423     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
   7424     if (Subtarget->is64Bit())
   7425       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
   7426                            IDX, MachinePointerInfo(), MVT::i32,
   7427                            false, false, 0);
   7428     else
   7429       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
   7430                         false, false, false, 0);
   7431 
   7432     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
    7433                                     getPointerTy());
   7434     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
   7435 
   7436     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
   7437     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
   7438                       false, false, false, 0);
   7439 
    7440     // Get the offset of the start of the .tls section
   7441     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   7442                                              GA->getValueType(0),
   7443                                              GA->getOffset(), X86II::MO_SECREL);
   7444     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
   7445 
    7446     // The address of the thread-local variable is the thread pointer plus
    7447     // the offset of the variable.
   7448     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
   7449   }
   7450 
   7451   llvm_unreachable("TLS not implemented for this target.");
   7452 }
   7453 
   7454 
   7455 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
   7456 /// and take a 2 x i32 value to shift plus a shift amount.
   7457 SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
   7458   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   7459   EVT VT = Op.getValueType();
   7460   unsigned VTBits = VT.getSizeInBits();
   7461   DebugLoc dl = Op.getDebugLoc();
   7462   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   7463   SDValue ShOpLo = Op.getOperand(0);
   7464   SDValue ShOpHi = Op.getOperand(1);
   7465   SDValue ShAmt  = Op.getOperand(2);
   7466   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
   7467                                      DAG.getConstant(VTBits - 1, MVT::i8))
   7468                        : DAG.getConstant(0, VT);
   7469 
   7470   SDValue Tmp2, Tmp3;
   7471   if (Op.getOpcode() == ISD::SHL_PARTS) {
   7472     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
   7473     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
   7474   } else {
   7475     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
   7476     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
   7477   }
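           // x86 shifts mask the count to the operand width, so the SHLD/SHRD and
           // plain shift above only cover amounts below VTBits. The CMOVs below pick
           // the "large shift" results when bit log2(VTBits) of the amount is set;
           // e.g. for SHL_PARTS with ShAmt >= 32: Hi = Lo << (ShAmt & 31), Lo = 0.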
   7478 
   7479   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   7480                                 DAG.getConstant(VTBits, MVT::i8));
   7481   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   7482                              AndNode, DAG.getConstant(0, MVT::i8));
   7483 
   7484   SDValue Hi, Lo;
   7485   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   7486   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
   7487   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
   7488 
   7489   if (Op.getOpcode() == ISD::SHL_PARTS) {
   7490     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
   7491     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
   7492   } else {
   7493     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
   7494     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
   7495   }
   7496 
   7497   SDValue Ops[2] = { Lo, Hi };
   7498   return DAG.getMergeValues(Ops, 2, dl);
   7499 }
   7500 
   7501 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   7502                                            SelectionDAG &DAG) const {
   7503   EVT SrcVT = Op.getOperand(0).getValueType();
   7504 
   7505   if (SrcVT.isVector())
   7506     return SDValue();
   7507 
   7508   assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
   7509          "Unknown SINT_TO_FP to lower!");
   7510 
   7511   // These are really Legal; return the operand so the caller accepts it as
   7512   // Legal.
   7513   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
   7514     return Op;
   7515   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
   7516       Subtarget->is64Bit()) {
   7517     return Op;
   7518   }
   7519 
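           // Everything else is lowered by spilling the integer to a stack slot and
           // loading it with an x87 FILD (see BuildFILD); if the result needs to end
           // up in an SSE register, BuildFILD stores the x87 value back to memory and
           // reloads it.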
   7520   DebugLoc dl = Op.getDebugLoc();
   7521   unsigned Size = SrcVT.getSizeInBits()/8;
   7522   MachineFunction &MF = DAG.getMachineFunction();
   7523   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
   7524   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   7525   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   7526                                StackSlot,
   7527                                MachinePointerInfo::getFixedStack(SSFI),
   7528                                false, false, 0);
   7529   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
   7530 }
   7531 
   7532 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
   7533                                      SDValue StackSlot,
   7534                                      SelectionDAG &DAG) const {
   7535   // Build the FILD
   7536   DebugLoc DL = Op.getDebugLoc();
   7537   SDVTList Tys;
   7538   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
   7539   if (useSSE)
   7540     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
   7541   else
   7542     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
   7543 
   7544   unsigned ByteSize = SrcVT.getSizeInBits()/8;
   7545 
   7546   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
   7547   MachineMemOperand *MMO;
   7548   if (FI) {
   7549     int SSFI = FI->getIndex();
   7550     MMO =
   7551       DAG.getMachineFunction()
   7552       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7553                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
   7554   } else {
   7555     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
   7556     StackSlot = StackSlot.getOperand(1);
   7557   }
   7558   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   7559   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
   7560                                            X86ISD::FILD, DL,
   7561                                            Tys, Ops, array_lengthof(Ops),
   7562                                            SrcVT, MMO);
   7563 
   7564   if (useSSE) {
   7565     Chain = Result.getValue(1);
   7566     SDValue InFlag = Result.getValue(2);
   7567 
   7568     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
   7569     // shouldn't be necessary except that RFP cannot be live across
   7570     // multiple blocks. When stackifier is fixed, they can be uncoupled.
   7571     MachineFunction &MF = DAG.getMachineFunction();
   7572     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
   7573     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
   7574     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   7575     Tys = DAG.getVTList(MVT::Other);
   7576     SDValue Ops[] = {
   7577       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
   7578     };
   7579     MachineMemOperand *MMO =
   7580       DAG.getMachineFunction()
   7581       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7582                             MachineMemOperand::MOStore, SSFISize, SSFISize);
   7583 
   7584     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
   7585                                     Ops, array_lengthof(Ops),
   7586                                     Op.getValueType(), MMO);
   7587     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
   7588                          MachinePointerInfo::getFixedStack(SSFI),
   7589                          false, false, false, 0);
   7590   }
   7591 
   7592   return Result;
   7593 }
   7594 
   7595 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
   7596 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
   7597                                                SelectionDAG &DAG) const {
    7598   // This algorithm is not obvious. Here is what we're trying to output:
   7599   /*
   7600      movq       %rax,  %xmm0
   7601      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
   7602      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
   7603      #ifdef __SSE3__
   7604        haddpd   %xmm0, %xmm0
   7605      #else
   7606        pshufd   $0x4e, %xmm0, %xmm1
   7607        addpd    %xmm1, %xmm0
   7608      #endif
   7609   */
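           // Why this works: after the punpckldq the register holds the bit patterns
           // of the doubles (2^52 + lo32) and (2^84 + hi32 * 2^32), both exact.
           // Subtracting c1 = { 2^52, 2^84 } recovers lo32 and hi32 * 2^32 exactly,
           // and the final add combines them with a single rounding.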
   7610 
   7611   DebugLoc dl = Op.getDebugLoc();
   7612   LLVMContext *Context = DAG.getContext();
   7613 
   7614   // Build some magic constants.
   7615   const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
   7616   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   7617   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
   7618 
   7619   SmallVector<Constant*,2> CV1;
   7620   CV1.push_back(
   7621         ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
   7622   CV1.push_back(
   7623         ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
   7624   Constant *C1 = ConstantVector::get(CV1);
   7625   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
   7626 
   7627   // Load the 64-bit value into an XMM register.
   7628   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   7629                             Op.getOperand(0));
   7630   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
   7631                               MachinePointerInfo::getConstantPool(),
   7632                               false, false, false, 16);
   7633   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
   7634                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
   7635                               CLod0);
   7636 
   7637   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
   7638                               MachinePointerInfo::getConstantPool(),
   7639                               false, false, false, 16);
   7640   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
   7641   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   7642   SDValue Result;
   7643 
   7644   if (Subtarget->hasSSE3()) {
   7645     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
   7646     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   7647   } else {
   7648     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
   7649     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
   7650                                            S2F, 0x4E, DAG);
   7651     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
   7652                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
   7653                          Sub);
   7654   }
   7655 
   7656   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
   7657                      DAG.getIntPtrConstant(0));
   7658 }
   7659 
   7660 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
   7661 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
   7662                                                SelectionDAG &DAG) const {
   7663   DebugLoc dl = Op.getDebugLoc();
   7664   // FP constant to bias correct the final result.
   7665   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
   7666                                    MVT::f64);
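           // 0x4330000000000000 is the bit pattern of the double 2^52; OR-ing the
           // zero-extended 32-bit input into its low mantissa bits yields exactly
           // 2^52 + x, so subtracting the bias afterwards recovers x with no rounding.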
   7667 
   7668   // Load the 32-bit value into an XMM register.
   7669   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
   7670                              Op.getOperand(0));
   7671 
   7672   // Zero out the upper parts of the register.
   7673   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
   7674 
   7675   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   7676                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
   7677                      DAG.getIntPtrConstant(0));
   7678 
   7679   // Or the load with the bias.
   7680   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
   7681                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
   7682                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   7683                                                    MVT::v2f64, Load)),
   7684                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
   7685                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   7686                                                    MVT::v2f64, Bias)));
   7687   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   7688                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
   7689                    DAG.getIntPtrConstant(0));
   7690 
   7691   // Subtract the bias.
   7692   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
   7693 
   7694   // Handle final rounding.
   7695   EVT DestVT = Op.getValueType();
   7696 
   7697   if (DestVT.bitsLT(MVT::f64)) {
   7698     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
   7699                        DAG.getIntPtrConstant(0));
   7700   } else if (DestVT.bitsGT(MVT::f64)) {
   7701     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
   7702   }
   7703 
   7704   // Handle final rounding.
   7705   return Sub;
   7706 }
   7707 
   7708 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   7709                                            SelectionDAG &DAG) const {
   7710   SDValue N0 = Op.getOperand(0);
   7711   DebugLoc dl = Op.getDebugLoc();
   7712 
    7713   // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
   7714   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
   7715   // the optimization here.
   7716   if (DAG.SignBitIsZero(N0))
   7717     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
   7718 
   7719   EVT SrcVT = N0.getValueType();
   7720   EVT DstVT = Op.getValueType();
   7721   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
   7722     return LowerUINT_TO_FP_i64(Op, DAG);
   7723   else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
   7724     return LowerUINT_TO_FP_i32(Op, DAG);
   7725   else if (Subtarget->is64Bit() &&
   7726            SrcVT == MVT::i64 && DstVT == MVT::f32)
   7727     return SDValue();
   7728 
   7729   // Make a 64-bit buffer, and use it to build an FILD.
   7730   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
   7731   if (SrcVT == MVT::i32) {
   7732     SDValue WordOff = DAG.getConstant(4, getPointerTy());
   7733     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
   7734                                      getPointerTy(), StackSlot, WordOff);
   7735     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   7736                                   StackSlot, MachinePointerInfo(),
   7737                                   false, false, 0);
   7738     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
   7739                                   OffsetSlot, MachinePointerInfo(),
   7740                                   false, false, 0);
   7741     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
   7742     return Fild;
   7743   }
   7744 
   7745   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
   7746   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   7747                                StackSlot, MachinePointerInfo(),
   7748                                false, false, 0);
   7749   // For i64 source, we need to add the appropriate power of 2 if the input
   7750   // was negative.  This is the same as the optimization in
    7751   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
   7752   // we must be careful to do the computation in x87 extended precision, not
   7753   // in SSE. (The generic code can't know it's OK to do this, or how to.)
   7754   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
   7755   MachineMemOperand *MMO =
   7756     DAG.getMachineFunction()
   7757     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7758                           MachineMemOperand::MOLoad, 8, 8);
   7759 
   7760   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
   7761   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
   7762   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
   7763                                          MVT::i64, MMO);
   7764 
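           // 0x5F800000 is the IEEE-754 single-precision encoding of 2^64. The FILD
           // above interpreted the i64 as signed, so if the sign bit was set the
           // loaded value is x - 2^64 and we must add 2^64 back, in x87 extended
           // precision so no bits are lost before the final rounding.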
   7765   APInt FF(32, 0x5F800000ULL);
   7766 
   7767   // Check whether the sign bit is set.
   7768   SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
   7769                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
   7770                                  ISD::SETLT);
   7771 
   7772   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
   7773   SDValue FudgePtr = DAG.getConstantPool(
   7774                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
   7775                                          getPointerTy());
   7776 
   7777   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   7778   SDValue Zero = DAG.getIntPtrConstant(0);
   7779   SDValue Four = DAG.getIntPtrConstant(4);
   7780   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
   7781                                Zero, Four);
   7782   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
   7783 
   7784   // Load the value out, extending it from f32 to f80.
   7785   // FIXME: Avoid the extend by constructing the right constant pool?
   7786   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
   7787                                  FudgePtr, MachinePointerInfo::getConstantPool(),
   7788                                  MVT::f32, false, false, 4);
   7789   // Extend everything to 80 bits to force it to be done on x87.
   7790   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   7791   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
   7792 }
   7793 
   7794 std::pair<SDValue,SDValue> X86TargetLowering::
   7795 FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const {
   7796   DebugLoc DL = Op.getDebugLoc();
   7797 
   7798   EVT DstTy = Op.getValueType();
   7799 
   7800   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
   7801     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
   7802     DstTy = MVT::i64;
   7803   }
   7804 
   7805   assert(DstTy.getSimpleVT() <= MVT::i64 &&
   7806          DstTy.getSimpleVT() >= MVT::i16 &&
   7807          "Unknown FP_TO_INT to lower!");
   7808 
   7809   // These are really Legal.
   7810   if (DstTy == MVT::i32 &&
   7811       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   7812     return std::make_pair(SDValue(), SDValue());
   7813   if (Subtarget->is64Bit() &&
   7814       DstTy == MVT::i64 &&
   7815       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   7816     return std::make_pair(SDValue(), SDValue());
   7817 
   7818   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
   7819   // stack slot, or into the FTOL runtime function.
   7820   MachineFunction &MF = DAG.getMachineFunction();
   7821   unsigned MemSize = DstTy.getSizeInBits()/8;
   7822   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   7823   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   7824 
   7825   unsigned Opc;
   7826   if (!IsSigned && isIntegerTypeFTOL(DstTy))
   7827     Opc = X86ISD::WIN_FTOL;
   7828   else
   7829     switch (DstTy.getSimpleVT().SimpleTy) {
   7830     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
   7831     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
   7832     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
   7833     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
   7834     }
   7835 
   7836   SDValue Chain = DAG.getEntryNode();
   7837   SDValue Value = Op.getOperand(0);
   7838   EVT TheVT = Op.getOperand(0).getValueType();
    7839   // FIXME: This causes a redundant load/store if the SSE-class value is already
    7840   // in memory, such as if it is on the call stack.
   7841   if (isScalarFPTypeInSSEReg(TheVT)) {
   7842     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
   7843     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
   7844                          MachinePointerInfo::getFixedStack(SSFI),
   7845                          false, false, 0);
   7846     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
   7847     SDValue Ops[] = {
   7848       Chain, StackSlot, DAG.getValueType(TheVT)
   7849     };
   7850 
   7851     MachineMemOperand *MMO =
   7852       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7853                               MachineMemOperand::MOLoad, MemSize, MemSize);
   7854     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
   7855                                     DstTy, MMO);
   7856     Chain = Value.getValue(1);
   7857     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   7858     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   7859   }
   7860 
   7861   MachineMemOperand *MMO =
   7862     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   7863                             MachineMemOperand::MOStore, MemSize, MemSize);
   7864 
   7865   if (Opc != X86ISD::WIN_FTOL) {
   7866     // Build the FP_TO_INT*_IN_MEM
   7867     SDValue Ops[] = { Chain, Value, StackSlot };
   7868     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   7869                                            Ops, 3, DstTy, MMO);
   7870     return std::make_pair(FIST, StackSlot);
   7871   } else {
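             // On 32-bit Windows unsigned conversions go through the _ftol2 runtime
             // routine instead; it returns the 64-bit result in EDX:EAX, which we
             // copy out and reassemble below.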
   7872     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
   7873       DAG.getVTList(MVT::Other, MVT::Glue),
   7874       Chain, Value);
   7875     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
   7876       MVT::i32, ftol.getValue(1));
   7877     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
   7878       MVT::i32, eax.getValue(2));
   7879     SDValue Ops[] = { eax, edx };
   7880     SDValue pair = IsReplace
   7881       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
   7882       : DAG.getMergeValues(Ops, 2, DL);
   7883     return std::make_pair(pair, SDValue());
   7884   }
   7885 }
   7886 
   7887 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
   7888                                            SelectionDAG &DAG) const {
   7889   if (Op.getValueType().isVector())
   7890     return SDValue();
   7891 
   7892   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   7893     /*IsSigned=*/ true, /*IsReplace=*/ false);
   7894   SDValue FIST = Vals.first, StackSlot = Vals.second;
   7895   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   7896   if (FIST.getNode() == 0) return Op;
   7897 
   7898   if (StackSlot.getNode())
   7899     // Load the result.
   7900     return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
   7901                        FIST, StackSlot, MachinePointerInfo(),
   7902                        false, false, false, 0);
   7903   else
   7904     // The node is the result.
   7905     return FIST;
   7906 }
   7907 
   7908 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
   7909                                            SelectionDAG &DAG) const {
   7910   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   7911     /*IsSigned=*/ false, /*IsReplace=*/ false);
   7912   SDValue FIST = Vals.first, StackSlot = Vals.second;
   7913   assert(FIST.getNode() && "Unexpected failure");
   7914 
   7915   if (StackSlot.getNode())
   7916     // Load the result.
   7917     return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
   7918                        FIST, StackSlot, MachinePointerInfo(),
   7919                        false, false, false, 0);
   7920   else
   7921     // The node is the result.
   7922     return FIST;
   7923 }
   7924 
   7925 SDValue X86TargetLowering::LowerFABS(SDValue Op,
   7926                                      SelectionDAG &DAG) const {
   7927   LLVMContext *Context = DAG.getContext();
   7928   DebugLoc dl = Op.getDebugLoc();
   7929   EVT VT = Op.getValueType();
   7930   EVT EltVT = VT;
   7931   if (VT.isVector())
   7932     EltVT = VT.getVectorElementType();
   7933   Constant *C;
   7934   if (EltVT == MVT::f64) {
   7935     C = ConstantVector::getSplat(2,
   7936                 ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
   7937   } else {
   7938     C = ConstantVector::getSplat(4,
   7939                ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
   7940   }
   7941   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   7942   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   7943                              MachinePointerInfo::getConstantPool(),
   7944                              false, false, false, 16);
   7945   return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
   7946 }
   7947 
   7948 SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
   7949   LLVMContext *Context = DAG.getContext();
   7950   DebugLoc dl = Op.getDebugLoc();
   7951   EVT VT = Op.getValueType();
   7952   EVT EltVT = VT;
   7953   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   7954   if (VT.isVector()) {
   7955     EltVT = VT.getVectorElementType();
   7956     NumElts = VT.getVectorNumElements();
   7957   }
   7958   Constant *C;
   7959   if (EltVT == MVT::f64)
   7960     C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
   7961   else
   7962     C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
   7963   C = ConstantVector::getSplat(NumElts, C);
   7964   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   7965   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   7966                              MachinePointerInfo::getConstantPool(),
   7967                              false, false, false, 16);
   7968   if (VT.isVector()) {
   7969     MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
   7970     return DAG.getNode(ISD::BITCAST, dl, VT,
   7971                        DAG.getNode(ISD::XOR, dl, XORVT,
   7972                     DAG.getNode(ISD::BITCAST, dl, XORVT,
   7973                                 Op.getOperand(0)),
   7974                     DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
   7975   } else {
   7976     return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
   7977   }
   7978 }
   7979 
   7980 SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   7981   LLVMContext *Context = DAG.getContext();
   7982   SDValue Op0 = Op.getOperand(0);
   7983   SDValue Op1 = Op.getOperand(1);
   7984   DebugLoc dl = Op.getDebugLoc();
   7985   EVT VT = Op.getValueType();
   7986   EVT SrcVT = Op1.getValueType();
   7987 
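           // copysign(Op0, Op1) is computed bitwise: extract the sign bit of Op1 with
           // an AND mask, clear the sign bit of Op0 with the complementary mask, and
           // OR the two pieces together.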
   7988   // If second operand is smaller, extend it first.
   7989   if (SrcVT.bitsLT(VT)) {
   7990     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
   7991     SrcVT = VT;
   7992   }
   7993   // And if it is bigger, shrink it first.
   7994   if (SrcVT.bitsGT(VT)) {
   7995     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
   7996     SrcVT = VT;
   7997   }
   7998 
   7999   // At this point the operands and the result should have the same
   8000   // type, and that won't be f80 since that is not custom lowered.
   8001 
   8002   // First get the sign bit of second operand.
   8003   SmallVector<Constant*,4> CV;
   8004   if (SrcVT == MVT::f64) {
   8005     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
   8006     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
   8007   } else {
   8008     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
   8009     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8010     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8011     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8012   }
   8013   Constant *C = ConstantVector::get(CV);
   8014   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   8015   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
   8016                               MachinePointerInfo::getConstantPool(),
   8017                               false, false, false, 16);
   8018   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
   8019 
   8020   // Shift sign bit right or left if the two operands have different types.
   8021   if (SrcVT.bitsGT(VT)) {
   8022     // Op0 is MVT::f32, Op1 is MVT::f64.
   8023     SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
   8024     SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
   8025                           DAG.getConstant(32, MVT::i32));
   8026     SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
   8027     SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
   8028                           DAG.getIntPtrConstant(0));
   8029   }
   8030 
   8031   // Clear first operand sign bit.
   8032   CV.clear();
   8033   if (VT == MVT::f64) {
   8034     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
   8035     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
   8036   } else {
   8037     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
   8038     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8039     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8040     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
   8041   }
   8042   C = ConstantVector::get(CV);
   8043   CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   8044   SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   8045                               MachinePointerInfo::getConstantPool(),
   8046                               false, false, false, 16);
   8047   SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
   8048 
   8049   // Or the value with the sign bit.
   8050   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
   8051 }
   8052 
   8053 SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
   8054   SDValue N0 = Op.getOperand(0);
   8055   DebugLoc dl = Op.getDebugLoc();
   8056   EVT VT = Op.getValueType();
   8057 
   8058   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
   8059   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
   8060                                   DAG.getConstant(1, VT));
   8061   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
   8062 }
   8063 
   8064 /// Emit nodes that will be selected as "test Op0,Op0", or something
   8065 /// equivalent.
   8066 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
   8067                                     SelectionDAG &DAG) const {
   8068   DebugLoc dl = Op.getDebugLoc();
   8069 
   8070   // CF and OF aren't always set the way we want. Determine which
   8071   // of these we need.
   8072   bool NeedCF = false;
   8073   bool NeedOF = false;
   8074   switch (X86CC) {
   8075   default: break;
   8076   case X86::COND_A: case X86::COND_AE:
   8077   case X86::COND_B: case X86::COND_BE:
   8078     NeedCF = true;
   8079     break;
   8080   case X86::COND_G: case X86::COND_GE:
   8081   case X86::COND_L: case X86::COND_LE:
   8082   case X86::COND_O: case X86::COND_NO:
   8083     NeedOF = true;
   8084     break;
   8085   }
   8086 
   8087   // See if we can use the EFLAGS value from the operand instead of
   8088   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   8089   // we prove that the arithmetic won't overflow, we can't use OF or CF.
   8090   if (Op.getResNo() != 0 || NeedOF || NeedCF)
   8091     // Emit a CMP with 0, which is the TEST pattern.
   8092     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   8093                        DAG.getConstant(0, Op.getValueType()));
   8094 
   8095   unsigned Opcode = 0;
   8096   unsigned NumOperands = 0;
   8097   switch (Op.getNode()->getOpcode()) {
   8098   case ISD::ADD:
   8099     // Due to an isel shortcoming, be conservative if this add is likely to be
   8100     // selected as part of a load-modify-store instruction. When the root node
   8101     // in a match is a store, isel doesn't know how to remap non-chain non-flag
   8102     // uses of other nodes in the match, such as the ADD in this case. This
   8103     // leads to the ADD being left around and reselected, with the result being
    8104     // two adds in the output.  Alas, even if none of our users are stores, that
   8105     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
   8106     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
   8107     // climbing the DAG back to the root, and it doesn't seem to be worth the
   8108     // effort.
   8109     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   8110          UE = Op.getNode()->use_end(); UI != UE; ++UI)
   8111       if (UI->getOpcode() != ISD::CopyToReg &&
   8112           UI->getOpcode() != ISD::SETCC &&
   8113           UI->getOpcode() != ISD::STORE)
   8114         goto default_case;
   8115 
   8116     if (ConstantSDNode *C =
   8117         dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
   8118       // An add of one will be selected as an INC.
   8119       if (C->getAPIntValue() == 1) {
   8120         Opcode = X86ISD::INC;
   8121         NumOperands = 1;
   8122         break;
   8123       }
   8124 
   8125       // An add of negative one (subtract of one) will be selected as a DEC.
   8126       if (C->getAPIntValue().isAllOnesValue()) {
   8127         Opcode = X86ISD::DEC;
   8128         NumOperands = 1;
   8129         break;
   8130       }
   8131     }
   8132 
   8133     // Otherwise use a regular EFLAGS-setting add.
   8134     Opcode = X86ISD::ADD;
   8135     NumOperands = 2;
   8136     break;
   8137   case ISD::AND: {
    8138     // If the primary result of the 'and' isn't used, don't bother using X86ISD::AND,
   8139     // because a TEST instruction will be better.
   8140     bool NonFlagUse = false;
   8141     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   8142            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
   8143       SDNode *User = *UI;
   8144       unsigned UOpNo = UI.getOperandNo();
   8145       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
    8146         // Look past the truncate.
   8147         UOpNo = User->use_begin().getOperandNo();
   8148         User = *User->use_begin();
   8149       }
   8150 
   8151       if (User->getOpcode() != ISD::BRCOND &&
   8152           User->getOpcode() != ISD::SETCC &&
   8153           (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
   8154         NonFlagUse = true;
   8155         break;
   8156       }
   8157     }
   8158 
   8159     if (!NonFlagUse)
   8160       break;
   8161   }
   8162     // FALL THROUGH
   8163   case ISD::SUB:
   8164   case ISD::OR:
   8165   case ISD::XOR:
   8166     // Due to the ISEL shortcoming noted above, be conservative if this op is
   8167     // likely to be selected as part of a load-modify-store instruction.
   8168     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   8169            UE = Op.getNode()->use_end(); UI != UE; ++UI)
   8170       if (UI->getOpcode() == ISD::STORE)
   8171         goto default_case;
   8172 
   8173     // Otherwise use a regular EFLAGS-setting instruction.
   8174     switch (Op.getNode()->getOpcode()) {
   8175     default: llvm_unreachable("unexpected operator!");
   8176     case ISD::SUB: Opcode = X86ISD::SUB; break;
   8177     case ISD::OR:  Opcode = X86ISD::OR;  break;
   8178     case ISD::XOR: Opcode = X86ISD::XOR; break;
   8179     case ISD::AND: Opcode = X86ISD::AND; break;
   8180     }
   8181 
   8182     NumOperands = 2;
   8183     break;
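           // These X86ISD arithmetic nodes already produce EFLAGS as their second
           // result, so reuse that value directly instead of emitting a TEST.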
   8184   case X86ISD::ADD:
   8185   case X86ISD::SUB:
   8186   case X86ISD::INC:
   8187   case X86ISD::DEC:
   8188   case X86ISD::OR:
   8189   case X86ISD::XOR:
   8190   case X86ISD::AND:
   8191     return SDValue(Op.getNode(), 1);
   8192   default:
   8193   default_case:
   8194     break;
   8195   }
   8196 
   8197   if (Opcode == 0)
   8198     // Emit a CMP with 0, which is the TEST pattern.
   8199     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   8200                        DAG.getConstant(0, Op.getValueType()));
   8201 
   8202   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   8203   SmallVector<SDValue, 4> Ops;
   8204   for (unsigned i = 0; i != NumOperands; ++i)
   8205     Ops.push_back(Op.getOperand(i));
   8206 
   8207   SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
   8208   DAG.ReplaceAllUsesWith(Op, New);
   8209   return SDValue(New.getNode(), 1);
   8210 }
   8211 
   8212 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
   8213 /// equivalent.
   8214 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   8215                                    SelectionDAG &DAG) const {
   8216   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
   8217     if (C->getAPIntValue() == 0)
   8218       return EmitTest(Op0, X86CC, DAG);
   8219 
   8220   DebugLoc dl = Op0.getDebugLoc();
   8221   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
   8222 }
   8223 
   8224 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
   8225 /// if it's possible.
   8226 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
   8227                                      DebugLoc dl, SelectionDAG &DAG) const {
   8228   SDValue Op0 = And.getOperand(0);
   8229   SDValue Op1 = And.getOperand(1);
   8230   if (Op0.getOpcode() == ISD::TRUNCATE)
   8231     Op0 = Op0.getOperand(0);
   8232   if (Op1.getOpcode() == ISD::TRUNCATE)
   8233     Op1 = Op1.getOperand(0);
   8234 
   8235   SDValue LHS, RHS;
   8236   if (Op1.getOpcode() == ISD::SHL)
   8237     std::swap(Op0, Op1);
   8238   if (Op0.getOpcode() == ISD::SHL) {
   8239     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
   8240       if (And00C->getZExtValue() == 1) {
   8241         // If we looked past a truncate, check that it's only truncating away
   8242         // known zeros.
   8243         unsigned BitWidth = Op0.getValueSizeInBits();
   8244         unsigned AndBitWidth = And.getValueSizeInBits();
   8245         if (BitWidth > AndBitWidth) {
   8246           APInt Zeros, Ones;
   8247           DAG.ComputeMaskedBits(Op0, Zeros, Ones);
   8248           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
   8249             return SDValue();
   8250         }
   8251         LHS = Op1;
   8252         RHS = Op0.getOperand(1);
   8253       }
   8254   } else if (Op1.getOpcode() == ISD::Constant) {
   8255     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
   8256     uint64_t AndRHSVal = AndRHS->getZExtValue();
   8257     SDValue AndLHS = Op0;
   8258 
   8259     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
   8260       LHS = AndLHS.getOperand(0);
   8261       RHS = AndLHS.getOperand(1);
   8262     }
   8263 
   8264     // Use BT if the immediate can't be encoded in a TEST instruction.
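             // For example, (X & (1ULL << 33)) != 0 would need a 64-bit TEST
             // immediate, which doesn't exist; BT X, 33 tests the same bit, and for a
             // power-of-two mask Log2_64_Ceil yields exactly that bit index.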
   8265     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
   8266       LHS = AndLHS;
   8267       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
   8268     }
   8269   }
   8270 
   8271   if (LHS.getNode()) {
   8272     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
   8273     // instruction.  Since the shift amount is in-range-or-undefined, we know
   8274     // that doing a bittest on the i32 value is ok.  We extend to i32 because
   8275     // the encoding for the i16 version is larger than the i32 version.
    8276     // Also promote i16 to i32 for performance / code size reasons.
   8277     if (LHS.getValueType() == MVT::i8 ||
   8278         LHS.getValueType() == MVT::i16)
   8279       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
   8280 
   8281     // If the operand types disagree, extend the shift amount to match.  Since
   8282     // BT ignores high bits (like shifts) we can use anyextend.
   8283     if (LHS.getValueType() != RHS.getValueType())
   8284       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
   8285 
   8286     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
   8287     unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
   8288     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   8289                        DAG.getConstant(Cond, MVT::i8), BT);
   8290   }
   8291 
   8292   return SDValue();
   8293 }
   8294 
   8295 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   8296 
   8297   if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
   8298 
   8299   assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
   8300   SDValue Op0 = Op.getOperand(0);
   8301   SDValue Op1 = Op.getOperand(1);
   8302   DebugLoc dl = Op.getDebugLoc();
   8303   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   8304 
   8305   // Optimize to BT if possible.
   8306   // Lower (X & (1 << N)) == 0 to BT(X, N).
   8307   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
   8308   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
   8309   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
   8310       Op1.getOpcode() == ISD::Constant &&
   8311       cast<ConstantSDNode>(Op1)->isNullValue() &&
   8312       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   8313     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
   8314     if (NewSetCC.getNode())
   8315       return NewSetCC;
   8316   }
   8317 
   8318   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
   8319   // these.
   8320   if (Op1.getOpcode() == ISD::Constant &&
   8321       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
   8322        cast<ConstantSDNode>(Op1)->isNullValue()) &&
   8323       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   8324 
   8325     // If the input is a setcc, then reuse the input setcc or use a new one with
   8326     // the inverted condition.
   8327     if (Op0.getOpcode() == X86ISD::SETCC) {
   8328       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
   8329       bool Invert = (CC == ISD::SETNE) ^
   8330         cast<ConstantSDNode>(Op1)->isNullValue();
   8331       if (!Invert) return Op0;
   8332 
   8333       CCode = X86::GetOppositeBranchCondition(CCode);
   8334       return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   8335                          DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
   8336     }
   8337   }
   8338 
   8339   bool isFP = Op1.getValueType().isFloatingPoint();
   8340   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
   8341   if (X86CC == X86::COND_INVALID)
   8342     return SDValue();
   8343 
   8344   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
   8345   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   8346                      DAG.getConstant(X86CC, MVT::i8), EFLAGS);
   8347 }
   8348 
    8349 // Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
    8350 // ones, and then concatenate the result back.
   8351 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
   8352   EVT VT = Op.getValueType();
   8353 
   8354   assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
   8355          "Unsupported value type for operation");
   8356 
   8357   int NumElems = VT.getVectorNumElements();
   8358   DebugLoc dl = Op.getDebugLoc();
   8359   SDValue CC = Op.getOperand(2);
   8360   SDValue Idx0 = DAG.getConstant(0, MVT::i32);
   8361   SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
   8362 
   8363   // Extract the LHS vectors
   8364   SDValue LHS = Op.getOperand(0);
   8365   SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
   8366   SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
   8367 
   8368   // Extract the RHS vectors
   8369   SDValue RHS = Op.getOperand(1);
   8370   SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
   8371   SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
   8372 
   8373   // Issue the operation on the smaller types and concatenate the result back
   8374   MVT EltVT = VT.getVectorElementType().getSimpleVT();
   8375   EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   8376   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   8377                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
   8378                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
   8379 }
   8380 
   8381 
   8382 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
   8383   SDValue Cond;
   8384   SDValue Op0 = Op.getOperand(0);
   8385   SDValue Op1 = Op.getOperand(1);
   8386   SDValue CC = Op.getOperand(2);
   8387   EVT VT = Op.getValueType();
   8388   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   8389   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
   8390   DebugLoc dl = Op.getDebugLoc();
   8391 
   8392   if (isFP) {
   8393     unsigned SSECC = 8;
   8394     EVT EltVT = Op0.getValueType().getVectorElementType();
   8395     assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
   8396 
   8397     bool Swap = false;
   8398 
   8399     // SSE Condition code mapping:
   8400     //  0 - EQ
   8401     //  1 - LT
   8402     //  2 - LE
   8403     //  3 - UNORD
   8404     //  4 - NEQ
   8405     //  5 - NLT
   8406     //  6 - NLE
   8407     //  7 - ORD
   8408     switch (SetCCOpcode) {
   8409     default: break;
   8410     case ISD::SETOEQ:
   8411     case ISD::SETEQ:  SSECC = 0; break;
   8412     case ISD::SETOGT:
   8413     case ISD::SETGT: Swap = true; // Fallthrough
   8414     case ISD::SETLT:
   8415     case ISD::SETOLT: SSECC = 1; break;
   8416     case ISD::SETOGE:
   8417     case ISD::SETGE: Swap = true; // Fallthrough
   8418     case ISD::SETLE:
   8419     case ISD::SETOLE: SSECC = 2; break;
   8420     case ISD::SETUO:  SSECC = 3; break;
   8421     case ISD::SETUNE:
   8422     case ISD::SETNE:  SSECC = 4; break;
   8423     case ISD::SETULE: Swap = true;
   8424     case ISD::SETUGE: SSECC = 5; break;
   8425     case ISD::SETULT: Swap = true;
   8426     case ISD::SETUGT: SSECC = 6; break;
   8427     case ISD::SETO:   SSECC = 7; break;
   8428     }
   8429     if (Swap)
   8430       std::swap(Op0, Op1);
   8431 
   8432     // In the two special cases we can't handle, emit two comparisons.
   8433     if (SSECC == 8) {
   8434       if (SetCCOpcode == ISD::SETUEQ) {
   8435         SDValue UNORD, EQ;
   8436         UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8437                             DAG.getConstant(3, MVT::i8));
   8438         EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8439                          DAG.getConstant(0, MVT::i8));
   8440         return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
   8441       } else if (SetCCOpcode == ISD::SETONE) {
   8442         SDValue ORD, NEQ;
   8443         ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8444                           DAG.getConstant(7, MVT::i8));
   8445         NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8446                           DAG.getConstant(4, MVT::i8));
   8447         return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
   8448       }
   8449       llvm_unreachable("Illegal FP comparison");
   8450     }
   8451     // Handle all other FP comparisons here.
   8452     return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
   8453                        DAG.getConstant(SSECC, MVT::i8));
   8454   }
   8455 
   8456   // Break 256-bit integer vector compare into smaller ones.
   8457   if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
   8458     return Lower256IntVSETCC(Op, DAG);
   8459 
   8460   // We are handling one of the integer comparisons here.  Since SSE only has
   8461   // GT and EQ comparisons for integers, swapping the operands and/or multiple
   8462   // operations may be required for some comparisons.
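          // For example (a sketch of what the flags below produce):
          //   x <  y   ->  PCMPGT(y, x)                        (Swap)
          //   x <= y   ->  NOT(PCMPGT(x, y))                   (Invert)
          //   x <u y   ->  PCMPGT(y ^ signbit, x ^ signbit)    (Swap + FlipSigns)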
   8463   unsigned Opc = 0;
   8464   bool Swap = false, Invert = false, FlipSigns = false;
   8465 
   8466   switch (SetCCOpcode) {
   8467   default: break;
   8468   case ISD::SETNE:  Invert = true;
   8469   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
   8470   case ISD::SETLT:  Swap = true;
   8471   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
   8472   case ISD::SETGE:  Swap = true;
   8473   case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
   8474   case ISD::SETULT: Swap = true;
   8475   case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
   8476   case ISD::SETUGE: Swap = true;
   8477   case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
   8478   }
   8479   if (Swap)
   8480     std::swap(Op0, Op1);
   8481 
   8482   // Check that the operation in question is available (most are plain SSE2,
   8483   // but PCMPGTQ and PCMPEQQ have different requirements).
   8484   if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
   8485     return SDValue();
   8486   if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
   8487     return SDValue();
   8488 
   8489   // Since SSE has no unsigned integer comparisons, we need to flip the sign
   8490   // bits of the inputs before performing those operations.
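          // For example (roughly): with i32 elements, "a <u b" holds exactly when
          // "(a ^ 0x80000000) <s (b ^ 0x80000000)" holds, so XORing both inputs
          // with the sign bit lets the signed PCMPGT answer the unsigned question.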
   8491   if (FlipSigns) {
   8492     EVT EltVT = VT.getVectorElementType();
   8493     SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
   8494                                       EltVT);
   8495     std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
   8496     SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
   8497                                     SignBits.size());
   8498     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
   8499     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
   8500   }
   8501 
   8502   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
   8503 
   8504   // If the logical-not of the result is required, perform that now.
   8505   if (Invert)
   8506     Result = DAG.getNOT(dl, Result, VT);
   8507 
   8508   return Result;
   8509 }
   8510 
   8511 // isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
   8512 static bool isX86LogicalCmp(SDValue Op) {
   8513   unsigned Opc = Op.getNode()->getOpcode();
   8514   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
   8515     return true;
   8516   if (Op.getResNo() == 1 &&
   8517       (Opc == X86ISD::ADD ||
   8518        Opc == X86ISD::SUB ||
   8519        Opc == X86ISD::ADC ||
   8520        Opc == X86ISD::SBB ||
   8521        Opc == X86ISD::SMUL ||
   8522        Opc == X86ISD::UMUL ||
   8523        Opc == X86ISD::INC ||
   8524        Opc == X86ISD::DEC ||
   8525        Opc == X86ISD::OR ||
   8526        Opc == X86ISD::XOR ||
   8527        Opc == X86ISD::AND))
   8528     return true;
   8529 
   8530   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
   8531     return true;
   8532 
   8533   return false;
   8534 }
   8535 
   8536 static bool isZero(SDValue V) {
   8537   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   8538   return C && C->isNullValue();
   8539 }
   8540 
   8541 static bool isAllOnes(SDValue V) {
   8542   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   8543   return C && C->isAllOnesValue();
   8544 }
   8545 
   8546 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   8547   bool addTest = true;
   8548   SDValue Cond  = Op.getOperand(0);
   8549   SDValue Op1 = Op.getOperand(1);
   8550   SDValue Op2 = Op.getOperand(2);
   8551   DebugLoc DL = Op.getDebugLoc();
   8552   SDValue CC;
   8553 
   8554   if (Cond.getOpcode() == ISD::SETCC) {
   8555     SDValue NewCond = LowerSETCC(Cond, DAG);
   8556     if (NewCond.getNode())
   8557       Cond = NewCond;
   8558   }
   8559 
   8560   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
   8561   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
   8562   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
   8563   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
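          // Roughly, for "(select (x == 0), -1, y)" the sequence this aims for is:
          //   cmp x, 1        ; computes x - 1, so CF is set iff x == 0
          //   sbb r, r        ; r = (x == 0) ? -1 : 0
          //   or  r, y
          // with an extra NOT of r for the inverted variants.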
   8564   if (Cond.getOpcode() == X86ISD::SETCC &&
   8565       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
   8566       isZero(Cond.getOperand(1).getOperand(1))) {
   8567     SDValue Cmp = Cond.getOperand(1);
   8568 
   8569     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
   8570 
   8571     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
   8572         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
   8573       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
   8574 
   8575       SDValue CmpOp0 = Cmp.getOperand(0);
   8576       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
   8577                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
   8578 
   8579       SDValue Res =   // Res = 0 or -1.
   8580         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   8581                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
   8582 
   8583       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
   8584         Res = DAG.getNOT(DL, Res, Res.getValueType());
   8585 
   8586       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
   8587       if (N2C == 0 || !N2C->isNullValue())
   8588         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
   8589       return Res;
   8590     }
   8591   }
   8592 
   8593   // Look past (and (setcc_carry (cmp ...)), 1).
   8594   if (Cond.getOpcode() == ISD::AND &&
   8595       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   8596     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   8597     if (C && C->getAPIntValue() == 1)
   8598       Cond = Cond.getOperand(0);
   8599   }
   8600 
   8601   // If the condition flag is set by an X86ISD::CMP, then use it as the
   8602   // condition-setting operand in place of the X86ISD::SETCC.
   8603   unsigned CondOpcode = Cond.getOpcode();
   8604   if (CondOpcode == X86ISD::SETCC ||
   8605       CondOpcode == X86ISD::SETCC_CARRY) {
   8606     CC = Cond.getOperand(0);
   8607 
   8608     SDValue Cmp = Cond.getOperand(1);
   8609     unsigned Opc = Cmp.getOpcode();
   8610     EVT VT = Op.getValueType();
   8611 
   8612     bool IllegalFPCMov = false;
   8613     if (VT.isFloatingPoint() && !VT.isVector() &&
   8614         !isScalarFPTypeInSSEReg(VT))  // FPStack?
   8615       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
   8616 
   8617     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
   8618         Opc == X86ISD::BT) { // FIXME
   8619       Cond = Cmp;
   8620       addTest = false;
   8621     }
   8622   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   8623              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   8624              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   8625               Cond.getOperand(0).getValueType() != MVT::i8)) {
   8626     SDValue LHS = Cond.getOperand(0);
   8627     SDValue RHS = Cond.getOperand(1);
   8628     unsigned X86Opcode;
   8629     unsigned X86Cond;
   8630     SDVTList VTs;
   8631     switch (CondOpcode) {
   8632     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   8633     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   8634     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   8635     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   8636     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   8637     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   8638     default: llvm_unreachable("unexpected overflowing operator");
   8639     }
   8640     if (CondOpcode == ISD::UMULO)
   8641       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   8642                           MVT::i32);
   8643     else
   8644       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   8645 
   8646     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
   8647 
   8648     if (CondOpcode == ISD::UMULO)
   8649       Cond = X86Op.getValue(2);
   8650     else
   8651       Cond = X86Op.getValue(1);
   8652 
   8653     CC = DAG.getConstant(X86Cond, MVT::i8);
   8654     addTest = false;
   8655   }
   8656 
   8657   if (addTest) {
   8658     // Look past the truncate.
   8659     if (Cond.getOpcode() == ISD::TRUNCATE)
   8660       Cond = Cond.getOperand(0);
   8661 
   8662     // We know the result of AND is compared against zero. Try to match
   8663     // it to BT.
   8664     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   8665       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
   8666       if (NewSetCC.getNode()) {
   8667         CC = NewSetCC.getOperand(0);
   8668         Cond = NewSetCC.getOperand(1);
   8669         addTest = false;
   8670       }
   8671     }
   8672   }
   8673 
   8674   if (addTest) {
   8675     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   8676     Cond = EmitTest(Cond, X86::COND_NE, DAG);
   8677   }
   8678 
   8679   // a <  b ? -1 :  0 -> RES = ~setcc_carry
   8680   // a <  b ?  0 : -1 -> RES = setcc_carry
   8681   // a >= b ? -1 :  0 -> RES = setcc_carry
   8682   // a >= b ?  0 : -1 -> RES = ~setcc_carry
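          // E.g. (a sketch): all four forms reduce to "cmp a, b; sbb r, r", which
          // materializes 0 or -1 from the carry flag, plus an optional NOT of
          // that result.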
   8683   if (Cond.getOpcode() == X86ISD::CMP) {
   8684     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
   8685 
   8686     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
   8687         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
   8688       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   8689                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
   8690       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
   8691         return DAG.getNOT(DL, Res, Res.getValueType());
   8692       return Res;
   8693     }
   8694   }
   8695 
   8696   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   8697   // the condition is true.
   8698   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   8699   SDValue Ops[] = { Op2, Op1, CC, Cond };
   8700   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
   8701 }
   8702 
   8703 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
   8704 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
   8705 // from the AND / OR.
   8706 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
   8707   Opc = Op.getOpcode();
   8708   if (Opc != ISD::OR && Opc != ISD::AND)
   8709     return false;
   8710   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   8711           Op.getOperand(0).hasOneUse() &&
   8712           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
   8713           Op.getOperand(1).hasOneUse());
   8714 }
   8715 
   8716 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
   8717 // and 1, and that the SETCC node has a single use.
   8718 static bool isXor1OfSetCC(SDValue Op) {
   8719   if (Op.getOpcode() != ISD::XOR)
   8720     return false;
   8721   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   8722   if (N1C && N1C->getAPIntValue() == 1) {
   8723     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   8724       Op.getOperand(0).hasOneUse();
   8725   }
   8726   return false;
   8727 }
   8728 
   8729 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   8730   bool addTest = true;
   8731   SDValue Chain = Op.getOperand(0);
   8732   SDValue Cond  = Op.getOperand(1);
   8733   SDValue Dest  = Op.getOperand(2);
   8734   DebugLoc dl = Op.getDebugLoc();
   8735   SDValue CC;
   8736   bool Inverted = false;
   8737 
   8738   if (Cond.getOpcode() == ISD::SETCC) {
   8739     // Check for setcc([su]{add,sub,mul}o == 0).
   8740     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
   8741         isa<ConstantSDNode>(Cond.getOperand(1)) &&
   8742         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
   8743         Cond.getOperand(0).getResNo() == 1 &&
   8744         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
   8745          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
   8746          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
   8747          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
   8748          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
   8749          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
   8750       Inverted = true;
   8751       Cond = Cond.getOperand(0);
   8752     } else {
   8753       SDValue NewCond = LowerSETCC(Cond, DAG);
   8754       if (NewCond.getNode())
   8755         Cond = NewCond;
   8756     }
   8757   }
   8758 #if 0
   8759   // FIXME: LowerXALUO doesn't handle these!!
   8760   else if (Cond.getOpcode() == X86ISD::ADD  ||
   8761            Cond.getOpcode() == X86ISD::SUB  ||
   8762            Cond.getOpcode() == X86ISD::SMUL ||
   8763            Cond.getOpcode() == X86ISD::UMUL)
   8764     Cond = LowerXALUO(Cond, DAG);
   8765 #endif
   8766 
   8767   // Look past (and (setcc_carry (cmp ...)), 1).
   8768   if (Cond.getOpcode() == ISD::AND &&
   8769       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   8770     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   8771     if (C && C->getAPIntValue() == 1)
   8772       Cond = Cond.getOperand(0);
   8773   }
   8774 
   8775   // If the condition flag is set by an X86ISD::CMP, then use it as the
   8776   // condition-setting operand in place of the X86ISD::SETCC.
   8777   unsigned CondOpcode = Cond.getOpcode();
   8778   if (CondOpcode == X86ISD::SETCC ||
   8779       CondOpcode == X86ISD::SETCC_CARRY) {
   8780     CC = Cond.getOperand(0);
   8781 
   8782     SDValue Cmp = Cond.getOperand(1);
   8783     unsigned Opc = Cmp.getOpcode();
   8784     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
   8785     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
   8786       Cond = Cmp;
   8787       addTest = false;
   8788     } else {
   8789       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
   8790       default: break;
   8791       case X86::COND_O:
   8792       case X86::COND_B:
   8793         // These can only come from an arithmetic instruction with overflow,
   8794         // e.g. SADDO, UADDO.
   8795         Cond = Cond.getNode()->getOperand(1);
   8796         addTest = false;
   8797         break;
   8798       }
   8799     }
   8800   }
   8801   CondOpcode = Cond.getOpcode();
   8802   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   8803       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   8804       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   8805        Cond.getOperand(0).getValueType() != MVT::i8)) {
   8806     SDValue LHS = Cond.getOperand(0);
   8807     SDValue RHS = Cond.getOperand(1);
   8808     unsigned X86Opcode;
   8809     unsigned X86Cond;
   8810     SDVTList VTs;
   8811     switch (CondOpcode) {
   8812     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   8813     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   8814     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   8815     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   8816     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   8817     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   8818     default: llvm_unreachable("unexpected overflowing operator");
   8819     }
   8820     if (Inverted)
   8821       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
   8822     if (CondOpcode == ISD::UMULO)
   8823       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   8824                           MVT::i32);
   8825     else
   8826       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   8827 
   8828     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
   8829 
   8830     if (CondOpcode == ISD::UMULO)
   8831       Cond = X86Op.getValue(2);
   8832     else
   8833       Cond = X86Op.getValue(1);
   8834 
   8835     CC = DAG.getConstant(X86Cond, MVT::i8);
   8836     addTest = false;
   8837   } else {
   8838     unsigned CondOpc;
   8839     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
   8840       SDValue Cmp = Cond.getOperand(0).getOperand(1);
   8841       if (CondOpc == ISD::OR) {
   8842         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
   8843         // two branches instead of an explicit OR instruction with a
   8844         // separate test.
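                // Roughly: "br (cc0 | cc1), dest" becomes "jCC0 dest; jCC1 dest"
                // when both setccs read the same flags-producing compare.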
   8845         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   8846             isX86LogicalCmp(Cmp)) {
   8847           CC = Cond.getOperand(0).getOperand(0);
   8848           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8849                               Chain, Dest, CC, Cmp);
   8850           CC = Cond.getOperand(1).getOperand(0);
   8851           Cond = Cmp;
   8852           addTest = false;
   8853         }
   8854       } else { // ISD::AND
   8855         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
   8856         // two branches instead of an explicit AND instruction with a
   8857         // separate test. However, we only do this if this block doesn't
   8858         // have a fall-through edge, because this requires an explicit
   8859         // jmp when the condition is false.
   8860         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   8861             isX86LogicalCmp(Cmp) &&
   8862             Op.getNode()->hasOneUse()) {
   8863           X86::CondCode CCode =
   8864             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   8865           CCode = X86::GetOppositeBranchCondition(CCode);
   8866           CC = DAG.getConstant(CCode, MVT::i8);
   8867           SDNode *User = *Op.getNode()->use_begin();
   8868           // Look for an unconditional branch following this conditional branch.
   8869           // We need this because we need to reverse the successors in order
   8870           // to implement FCMP_OEQ.
   8871           if (User->getOpcode() == ISD::BR) {
   8872             SDValue FalseBB = User->getOperand(1);
   8873             SDNode *NewBR =
   8874               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   8875             assert(NewBR == User);
   8876             (void)NewBR;
   8877             Dest = FalseBB;
   8878 
   8879             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8880                                 Chain, Dest, CC, Cmp);
   8881             X86::CondCode CCode =
   8882               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
   8883             CCode = X86::GetOppositeBranchCondition(CCode);
   8884             CC = DAG.getConstant(CCode, MVT::i8);
   8885             Cond = Cmp;
   8886             addTest = false;
   8887           }
   8888         }
   8889       }
   8890     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
   8891       // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
   8892       // It should be transformed by the DAG combiner except when the condition
   8893       // is set by an arithmetic-with-overflow node.
   8894       X86::CondCode CCode =
   8895         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   8896       CCode = X86::GetOppositeBranchCondition(CCode);
   8897       CC = DAG.getConstant(CCode, MVT::i8);
   8898       Cond = Cond.getOperand(0).getOperand(1);
   8899       addTest = false;
   8900     } else if (Cond.getOpcode() == ISD::SETCC &&
   8901                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
   8902       // For FCMP_OEQ, we can emit
   8903       // two branches instead of an explicit AND instruction with a
   8904       // separate test. However, we only do this if this block doesn't
   8905       // have a fall-through edge, because this requires an explicit
   8906       // jmp when the condition is false.
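              // Roughly: "br (a ==o b), dest" becomes
              //   <fp compare a, b>; jne falsebb; jp falsebb; jmp dest
              // after reversing the successors, since both ZF and PF have to be
              // checked for an ordered-equal result.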
   8907       if (Op.getNode()->hasOneUse()) {
   8908         SDNode *User = *Op.getNode()->use_begin();
   8909         // Look for an unconditional branch following this conditional branch.
   8910         // We need this because we need to reverse the successors in order
   8911         // to implement FCMP_OEQ.
   8912         if (User->getOpcode() == ISD::BR) {
   8913           SDValue FalseBB = User->getOperand(1);
   8914           SDNode *NewBR =
   8915             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   8916           assert(NewBR == User);
   8917           (void)NewBR;
   8918           Dest = FalseBB;
   8919 
   8920           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   8921                                     Cond.getOperand(0), Cond.getOperand(1));
   8922           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   8923           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8924                               Chain, Dest, CC, Cmp);
   8925           CC = DAG.getConstant(X86::COND_P, MVT::i8);
   8926           Cond = Cmp;
   8927           addTest = false;
   8928         }
   8929       }
   8930     } else if (Cond.getOpcode() == ISD::SETCC &&
   8931                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
   8932       // For FCMP_UNE, we can emit
   8933       // two branches instead of an explicit AND instruction with a
   8934       // separate test. However, we only do this if this block doesn't
   8935       // have a fall-through edge, because this requires an explicit
   8936       // jmp when the condition is false.
   8937       if (Op.getNode()->hasOneUse()) {
   8938         SDNode *User = *Op.getNode()->use_begin();
   8939         // Look for an unconditional branch following this conditional branch.
   8940         // We need this because we need to reverse the successors in order
   8941         // to implement FCMP_UNE.
   8942         if (User->getOpcode() == ISD::BR) {
   8943           SDValue FalseBB = User->getOperand(1);
   8944           SDNode *NewBR =
   8945             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   8946           assert(NewBR == User);
   8947           (void)NewBR;
   8948 
   8949           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   8950                                     Cond.getOperand(0), Cond.getOperand(1));
   8951           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   8952           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8953                               Chain, Dest, CC, Cmp);
   8954           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
   8955           Cond = Cmp;
   8956           addTest = false;
   8957           Dest = FalseBB;
   8958         }
   8959       }
   8960     }
   8961   }
   8962 
   8963   if (addTest) {
   8964     // Look past the truncate.
   8965     if (Cond.getOpcode() == ISD::TRUNCATE)
   8966       Cond = Cond.getOperand(0);
   8967 
   8968     // We know the result of AND is compared against zero. Try to match
   8969     // it to BT.
   8970     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   8971       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
   8972       if (NewSetCC.getNode()) {
   8973         CC = NewSetCC.getOperand(0);
   8974         Cond = NewSetCC.getOperand(1);
   8975         addTest = false;
   8976       }
   8977     }
   8978   }
   8979 
   8980   if (addTest) {
   8981     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   8982     Cond = EmitTest(Cond, X86::COND_NE, DAG);
   8983   }
   8984   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   8985                      Chain, Dest, CC, Cond);
   8986 }
   8987 
   8988 
   8989 // Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
   8990 // Calls to _alloca are needed to probe the stack when allocating more than 4k
   8991 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
   8992 // that the guard pages used by the OS virtual memory manager are allocated in
   8993 // the correct sequence.
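        // A rough sketch of what this lowering produces (the exact probe symbol,
        // e.g. _alloca or __chkstk, depends on the target C runtime): the size is
        // copied into (E|R)AX, the probe routine is called, and the adjusted stack
        // pointer is read back as the result.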
   8994 SDValue
   8995 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   8996                                            SelectionDAG &DAG) const {
   8997   assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
   8998           getTargetMachine().Options.EnableSegmentedStacks) &&
   8999          "This should be used only on Windows targets or when segmented stacks "
   9000          "are being used");
   9001   assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
   9002   DebugLoc dl = Op.getDebugLoc();
   9003 
   9004   // Get the inputs.
   9005   SDValue Chain = Op.getOperand(0);
   9006   SDValue Size  = Op.getOperand(1);
   9007   // FIXME: Ensure alignment here
   9008 
   9009   bool Is64Bit = Subtarget->is64Bit();
   9010   EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
   9011 
   9012   if (getTargetMachine().Options.EnableSegmentedStacks) {
   9013     MachineFunction &MF = DAG.getMachineFunction();
   9014     MachineRegisterInfo &MRI = MF.getRegInfo();
   9015 
   9016     if (Is64Bit) {
   9017       // The 64-bit implementation of segmented stacks needs to clobber both r10
   9018       // and r11. This makes it impossible to use it along with nested parameters.
   9019       const Function *F = MF.getFunction();
   9020 
   9021       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
   9022            I != E; I++)
   9023         if (I->hasNestAttr())
   9024           report_fatal_error("Cannot use segmented stacks with functions that "
   9025                              "have nested arguments.");
   9026     }
   9027 
   9028     const TargetRegisterClass *AddrRegClass =
   9029       getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
   9030     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
   9031     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
   9032     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
   9033                                 DAG.getRegister(Vreg, SPTy));
   9034     SDValue Ops1[2] = { Value, Chain };
   9035     return DAG.getMergeValues(Ops1, 2, dl);
   9036   } else {
   9037     SDValue Flag;
   9038     unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
   9039 
   9040     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
   9041     Flag = Chain.getValue(1);
   9042     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   9043 
   9044     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
   9045     Flag = Chain.getValue(1);
   9046 
   9047     Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
   9048 
   9049     SDValue Ops1[2] = { Chain.getValue(0), Chain };
   9050     return DAG.getMergeValues(Ops1, 2, dl);
   9051   }
   9052 }
   9053 
   9054 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   9055   MachineFunction &MF = DAG.getMachineFunction();
   9056   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   9057 
   9058   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   9059   DebugLoc DL = Op.getDebugLoc();
   9060 
   9061   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
   9062     // vastart just stores the address of the VarArgsFrameIndex slot into the
   9063     // memory location argument.
   9064     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
   9065                                    getPointerTy());
   9066     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
   9067                         MachinePointerInfo(SV), false, false, 0);
   9068   }
   9069 
   9070   // __va_list_tag:
   9071   //   gp_offset         (0 - 6 * 8)
   9072   //   fp_offset         (48 - 48 + 8 * 16)
   9073   //   overflow_arg_area (point to parameters coming in memory).
   9074   //   reg_save_area
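          // For reference, this is the SysV AMD64 va_list element, roughly:
          //   struct __va_list_tag {
          //     unsigned int gp_offset;
          //     unsigned int fp_offset;
          //     void *overflow_arg_area;
          //     void *reg_save_area;
          //   };
          // which matches the four stores below at offsets 0, 4, 8 and 16.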
   9075   SmallVector<SDValue, 8> MemOps;
   9076   SDValue FIN = Op.getOperand(1);
   9077   // Store gp_offset
   9078   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
   9079                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
   9080                                                MVT::i32),
   9081                                FIN, MachinePointerInfo(SV), false, false, 0);
   9082   MemOps.push_back(Store);
   9083 
   9084   // Store fp_offset
   9085   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   9086                     FIN, DAG.getIntPtrConstant(4));
   9087   Store = DAG.getStore(Op.getOperand(0), DL,
   9088                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
   9089                                        MVT::i32),
   9090                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
   9091   MemOps.push_back(Store);
   9092 
   9093   // Store ptr to overflow_arg_area
   9094   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   9095                     FIN, DAG.getIntPtrConstant(4));
   9096   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
   9097                                     getPointerTy());
   9098   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
   9099                        MachinePointerInfo(SV, 8),
   9100                        false, false, 0);
   9101   MemOps.push_back(Store);
   9102 
   9103   // Store ptr to reg_save_area.
   9104   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   9105                     FIN, DAG.getIntPtrConstant(8));
   9106   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   9107                                     getPointerTy());
   9108   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
   9109                        MachinePointerInfo(SV, 16), false, false, 0);
   9110   MemOps.push_back(Store);
   9111   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   9112                      &MemOps[0], MemOps.size());
   9113 }
   9114 
   9115 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   9116   assert(Subtarget->is64Bit() &&
   9117          "LowerVAARG only handles 64-bit va_arg!");
   9118   assert((Subtarget->isTargetLinux() ||
   9119           Subtarget->isTargetDarwin()) &&
   9120           "Unhandled target in LowerVAARG");
   9121   assert(Op.getNode()->getNumOperands() == 4);
   9122   SDValue Chain = Op.getOperand(0);
   9123   SDValue SrcPtr = Op.getOperand(1);
   9124   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   9125   unsigned Align = Op.getConstantOperandVal(3);
   9126   DebugLoc dl = Op.getDebugLoc();
   9127 
   9128   EVT ArgVT = Op.getNode()->getValueType(0);
   9129   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   9130   uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
   9131   uint8_t ArgMode;
   9132 
   9133   // Decide which area this value should be read from.
   9134   // TODO: Implement the AMD64 ABI in its entirety. This simple
   9135   // selection mechanism works only for the basic types.
   9136   if (ArgVT == MVT::f80) {
   9137     llvm_unreachable("va_arg for f80 not yet implemented");
   9138   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
   9139     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
   9140   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
   9141     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
   9142   } else {
   9143     llvm_unreachable("Unhandled argument type in LowerVAARG");
   9144   }
   9145 
   9146   if (ArgMode == 2) {
   9147     // Sanity Check: Make sure using fp_offset makes sense.
   9148     assert(!getTargetMachine().Options.UseSoftFloat &&
   9149            !(DAG.getMachineFunction()
   9150                 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
   9151            Subtarget->hasSSE1());
   9152   }
   9153 
   9154   // Insert VAARG_64 node into the DAG
   9155   // VAARG_64 returns two values: Variable Argument Address, Chain
   9156   SmallVector<SDValue, 11> InstOps;
   9157   InstOps.push_back(Chain);
   9158   InstOps.push_back(SrcPtr);
   9159   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
   9160   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
   9161   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
   9162   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
   9163   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
   9164                                           VTs, &InstOps[0], InstOps.size(),
   9165                                           MVT::i64,
   9166                                           MachinePointerInfo(SV),
   9167                                           /*Align=*/0,
   9168                                           /*Volatile=*/false,
   9169                                           /*ReadMem=*/true,
   9170                                           /*WriteMem=*/true);
   9171   Chain = VAARG.getValue(1);
   9172 
   9173   // Load the next argument and return it
   9174   return DAG.getLoad(ArgVT, dl,
   9175                      Chain,
   9176                      VAARG,
   9177                      MachinePointerInfo(),
   9178                      false, false, false, 0);
   9179 }
   9180 
   9181 SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
   9182   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
   9183   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
   9184   SDValue Chain = Op.getOperand(0);
   9185   SDValue DstPtr = Op.getOperand(1);
   9186   SDValue SrcPtr = Op.getOperand(2);
   9187   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   9188   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   9189   DebugLoc DL = Op.getDebugLoc();
   9190 
   9191   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
   9192                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
   9193                        false,
   9194                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
   9195 }
   9196 
   9197 // getTargetVShiftNode - Handle vector element shifts where the shift amount
   9198 // may or may not be a constant. Takes the immediate version of the shift as input.
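        // For example (a sketch): a VSRLI whose amount is not a constant is rebuilt
        // as a VSRL whose amount operand is the v4i32 vector <amt, 0, undef, undef>
        // bitcast to the operand type, since the non-immediate forms read the count
        // from the low 64 bits of an XMM register.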
   9199 static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
   9200                                    SDValue SrcOp, SDValue ShAmt,
   9201                                    SelectionDAG &DAG) {
   9202   assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
   9203 
   9204   if (isa<ConstantSDNode>(ShAmt)) {
   9205     switch (Opc) {
   9206       default: llvm_unreachable("Unknown target vector shift node");
   9207       case X86ISD::VSHLI:
   9208       case X86ISD::VSRLI:
   9209       case X86ISD::VSRAI:
   9210         return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   9211     }
   9212   }
   9213 
   9214   // Change opcode to non-immediate version
   9215   switch (Opc) {
   9216     default: llvm_unreachable("Unknown target vector shift node");
   9217     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
   9218     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
   9219     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   9220   }
   9221 
   9222   // Need to build a vector containing the shift amount.
   9223   // The shift amount is 32 bits, but SSE instructions read 64 bits, so fill with 0.
   9224   SDValue ShOps[4];
   9225   ShOps[0] = ShAmt;
   9226   ShOps[1] = DAG.getConstant(0, MVT::i32);
   9227   ShOps[2] = DAG.getUNDEF(MVT::i32);
   9228   ShOps[3] = DAG.getUNDEF(MVT::i32);
   9229   ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
   9230   ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
   9231   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   9232 }
   9233 
   9234 SDValue
   9235 X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
   9236   DebugLoc dl = Op.getDebugLoc();
   9237   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   9238   switch (IntNo) {
   9239   default: return SDValue();    // Don't custom lower most intrinsics.
   9240   // Comparison intrinsics.
   9241   case Intrinsic::x86_sse_comieq_ss:
   9242   case Intrinsic::x86_sse_comilt_ss:
   9243   case Intrinsic::x86_sse_comile_ss:
   9244   case Intrinsic::x86_sse_comigt_ss:
   9245   case Intrinsic::x86_sse_comige_ss:
   9246   case Intrinsic::x86_sse_comineq_ss:
   9247   case Intrinsic::x86_sse_ucomieq_ss:
   9248   case Intrinsic::x86_sse_ucomilt_ss:
   9249   case Intrinsic::x86_sse_ucomile_ss:
   9250   case Intrinsic::x86_sse_ucomigt_ss:
   9251   case Intrinsic::x86_sse_ucomige_ss:
   9252   case Intrinsic::x86_sse_ucomineq_ss:
   9253   case Intrinsic::x86_sse2_comieq_sd:
   9254   case Intrinsic::x86_sse2_comilt_sd:
   9255   case Intrinsic::x86_sse2_comile_sd:
   9256   case Intrinsic::x86_sse2_comigt_sd:
   9257   case Intrinsic::x86_sse2_comige_sd:
   9258   case Intrinsic::x86_sse2_comineq_sd:
   9259   case Intrinsic::x86_sse2_ucomieq_sd:
   9260   case Intrinsic::x86_sse2_ucomilt_sd:
   9261   case Intrinsic::x86_sse2_ucomile_sd:
   9262   case Intrinsic::x86_sse2_ucomigt_sd:
   9263   case Intrinsic::x86_sse2_ucomige_sd:
   9264   case Intrinsic::x86_sse2_ucomineq_sd: {
   9265     unsigned Opc = 0;
   9266     ISD::CondCode CC = ISD::SETCC_INVALID;
   9267     switch (IntNo) {
   9268     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   9269     case Intrinsic::x86_sse_comieq_ss:
   9270     case Intrinsic::x86_sse2_comieq_sd:
   9271       Opc = X86ISD::COMI;
   9272       CC = ISD::SETEQ;
   9273       break;
   9274     case Intrinsic::x86_sse_comilt_ss:
   9275     case Intrinsic::x86_sse2_comilt_sd:
   9276       Opc = X86ISD::COMI;
   9277       CC = ISD::SETLT;
   9278       break;
   9279     case Intrinsic::x86_sse_comile_ss:
   9280     case Intrinsic::x86_sse2_comile_sd:
   9281       Opc = X86ISD::COMI;
   9282       CC = ISD::SETLE;
   9283       break;
   9284     case Intrinsic::x86_sse_comigt_ss:
   9285     case Intrinsic::x86_sse2_comigt_sd:
   9286       Opc = X86ISD::COMI;
   9287       CC = ISD::SETGT;
   9288       break;
   9289     case Intrinsic::x86_sse_comige_ss:
   9290     case Intrinsic::x86_sse2_comige_sd:
   9291       Opc = X86ISD::COMI;
   9292       CC = ISD::SETGE;
   9293       break;
   9294     case Intrinsic::x86_sse_comineq_ss:
   9295     case Intrinsic::x86_sse2_comineq_sd:
   9296       Opc = X86ISD::COMI;
   9297       CC = ISD::SETNE;
   9298       break;
   9299     case Intrinsic::x86_sse_ucomieq_ss:
   9300     case Intrinsic::x86_sse2_ucomieq_sd:
   9301       Opc = X86ISD::UCOMI;
   9302       CC = ISD::SETEQ;
   9303       break;
   9304     case Intrinsic::x86_sse_ucomilt_ss:
   9305     case Intrinsic::x86_sse2_ucomilt_sd:
   9306       Opc = X86ISD::UCOMI;
   9307       CC = ISD::SETLT;
   9308       break;
   9309     case Intrinsic::x86_sse_ucomile_ss:
   9310     case Intrinsic::x86_sse2_ucomile_sd:
   9311       Opc = X86ISD::UCOMI;
   9312       CC = ISD::SETLE;
   9313       break;
   9314     case Intrinsic::x86_sse_ucomigt_ss:
   9315     case Intrinsic::x86_sse2_ucomigt_sd:
   9316       Opc = X86ISD::UCOMI;
   9317       CC = ISD::SETGT;
   9318       break;
   9319     case Intrinsic::x86_sse_ucomige_ss:
   9320     case Intrinsic::x86_sse2_ucomige_sd:
   9321       Opc = X86ISD::UCOMI;
   9322       CC = ISD::SETGE;
   9323       break;
   9324     case Intrinsic::x86_sse_ucomineq_ss:
   9325     case Intrinsic::x86_sse2_ucomineq_sd:
   9326       Opc = X86ISD::UCOMI;
   9327       CC = ISD::SETNE;
   9328       break;
   9329     }
   9330 
   9331     SDValue LHS = Op.getOperand(1);
   9332     SDValue RHS = Op.getOperand(2);
   9333     unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
   9334     assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
   9335     SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
   9336     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   9337                                 DAG.getConstant(X86CC, MVT::i8), Cond);
   9338     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   9339   }
   9340   // XOP comparison intrinsics
   9341   case Intrinsic::x86_xop_vpcomltb:
   9342   case Intrinsic::x86_xop_vpcomltw:
   9343   case Intrinsic::x86_xop_vpcomltd:
   9344   case Intrinsic::x86_xop_vpcomltq:
   9345   case Intrinsic::x86_xop_vpcomltub:
   9346   case Intrinsic::x86_xop_vpcomltuw:
   9347   case Intrinsic::x86_xop_vpcomltud:
   9348   case Intrinsic::x86_xop_vpcomltuq:
   9349   case Intrinsic::x86_xop_vpcomleb:
   9350   case Intrinsic::x86_xop_vpcomlew:
   9351   case Intrinsic::x86_xop_vpcomled:
   9352   case Intrinsic::x86_xop_vpcomleq:
   9353   case Intrinsic::x86_xop_vpcomleub:
   9354   case Intrinsic::x86_xop_vpcomleuw:
   9355   case Intrinsic::x86_xop_vpcomleud:
   9356   case Intrinsic::x86_xop_vpcomleuq:
   9357   case Intrinsic::x86_xop_vpcomgtb:
   9358   case Intrinsic::x86_xop_vpcomgtw:
   9359   case Intrinsic::x86_xop_vpcomgtd:
   9360   case Intrinsic::x86_xop_vpcomgtq:
   9361   case Intrinsic::x86_xop_vpcomgtub:
   9362   case Intrinsic::x86_xop_vpcomgtuw:
   9363   case Intrinsic::x86_xop_vpcomgtud:
   9364   case Intrinsic::x86_xop_vpcomgtuq:
   9365   case Intrinsic::x86_xop_vpcomgeb:
   9366   case Intrinsic::x86_xop_vpcomgew:
   9367   case Intrinsic::x86_xop_vpcomged:
   9368   case Intrinsic::x86_xop_vpcomgeq:
   9369   case Intrinsic::x86_xop_vpcomgeub:
   9370   case Intrinsic::x86_xop_vpcomgeuw:
   9371   case Intrinsic::x86_xop_vpcomgeud:
   9372   case Intrinsic::x86_xop_vpcomgeuq:
   9373   case Intrinsic::x86_xop_vpcomeqb:
   9374   case Intrinsic::x86_xop_vpcomeqw:
   9375   case Intrinsic::x86_xop_vpcomeqd:
   9376   case Intrinsic::x86_xop_vpcomeqq:
   9377   case Intrinsic::x86_xop_vpcomequb:
   9378   case Intrinsic::x86_xop_vpcomequw:
   9379   case Intrinsic::x86_xop_vpcomequd:
   9380   case Intrinsic::x86_xop_vpcomequq:
   9381   case Intrinsic::x86_xop_vpcomneb:
   9382   case Intrinsic::x86_xop_vpcomnew:
   9383   case Intrinsic::x86_xop_vpcomned:
   9384   case Intrinsic::x86_xop_vpcomneq:
   9385   case Intrinsic::x86_xop_vpcomneub:
   9386   case Intrinsic::x86_xop_vpcomneuw:
   9387   case Intrinsic::x86_xop_vpcomneud:
   9388   case Intrinsic::x86_xop_vpcomneuq:
   9389   case Intrinsic::x86_xop_vpcomfalseb:
   9390   case Intrinsic::x86_xop_vpcomfalsew:
   9391   case Intrinsic::x86_xop_vpcomfalsed:
   9392   case Intrinsic::x86_xop_vpcomfalseq:
   9393   case Intrinsic::x86_xop_vpcomfalseub:
   9394   case Intrinsic::x86_xop_vpcomfalseuw:
   9395   case Intrinsic::x86_xop_vpcomfalseud:
   9396   case Intrinsic::x86_xop_vpcomfalseuq:
   9397   case Intrinsic::x86_xop_vpcomtrueb:
   9398   case Intrinsic::x86_xop_vpcomtruew:
   9399   case Intrinsic::x86_xop_vpcomtrued:
   9400   case Intrinsic::x86_xop_vpcomtrueq:
   9401   case Intrinsic::x86_xop_vpcomtrueub:
   9402   case Intrinsic::x86_xop_vpcomtrueuw:
   9403   case Intrinsic::x86_xop_vpcomtrueud:
   9404   case Intrinsic::x86_xop_vpcomtrueuq: {
   9405     unsigned CC = 0;
   9406     unsigned Opc = 0;
   9407 
   9408     switch (IntNo) {
   9409     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   9410     case Intrinsic::x86_xop_vpcomltb:
   9411     case Intrinsic::x86_xop_vpcomltw:
   9412     case Intrinsic::x86_xop_vpcomltd:
   9413     case Intrinsic::x86_xop_vpcomltq:
   9414       CC = 0;
   9415       Opc = X86ISD::VPCOM;
   9416       break;
   9417     case Intrinsic::x86_xop_vpcomltub:
   9418     case Intrinsic::x86_xop_vpcomltuw:
   9419     case Intrinsic::x86_xop_vpcomltud:
   9420     case Intrinsic::x86_xop_vpcomltuq:
   9421       CC = 0;
   9422       Opc = X86ISD::VPCOMU;
   9423       break;
   9424     case Intrinsic::x86_xop_vpcomleb:
   9425     case Intrinsic::x86_xop_vpcomlew:
   9426     case Intrinsic::x86_xop_vpcomled:
   9427     case Intrinsic::x86_xop_vpcomleq:
   9428       CC = 1;
   9429       Opc = X86ISD::VPCOM;
   9430       break;
   9431     case Intrinsic::x86_xop_vpcomleub:
   9432     case Intrinsic::x86_xop_vpcomleuw:
   9433     case Intrinsic::x86_xop_vpcomleud:
   9434     case Intrinsic::x86_xop_vpcomleuq:
   9435       CC = 1;
   9436       Opc = X86ISD::VPCOMU;
   9437       break;
   9438     case Intrinsic::x86_xop_vpcomgtb:
   9439     case Intrinsic::x86_xop_vpcomgtw:
   9440     case Intrinsic::x86_xop_vpcomgtd:
   9441     case Intrinsic::x86_xop_vpcomgtq:
   9442       CC = 2;
   9443       Opc = X86ISD::VPCOM;
   9444       break;
   9445     case Intrinsic::x86_xop_vpcomgtub:
   9446     case Intrinsic::x86_xop_vpcomgtuw:
   9447     case Intrinsic::x86_xop_vpcomgtud:
   9448     case Intrinsic::x86_xop_vpcomgtuq:
   9449       CC = 2;
   9450       Opc = X86ISD::VPCOMU;
   9451       break;
   9452     case Intrinsic::x86_xop_vpcomgeb:
   9453     case Intrinsic::x86_xop_vpcomgew:
   9454     case Intrinsic::x86_xop_vpcomged:
   9455     case Intrinsic::x86_xop_vpcomgeq:
   9456       CC = 3;
   9457       Opc = X86ISD::VPCOM;
   9458       break;
   9459     case Intrinsic::x86_xop_vpcomgeub:
   9460     case Intrinsic::x86_xop_vpcomgeuw:
   9461     case Intrinsic::x86_xop_vpcomgeud:
   9462     case Intrinsic::x86_xop_vpcomgeuq:
   9463       CC = 3;
   9464       Opc = X86ISD::VPCOMU;
   9465       break;
   9466     case Intrinsic::x86_xop_vpcomeqb:
   9467     case Intrinsic::x86_xop_vpcomeqw:
   9468     case Intrinsic::x86_xop_vpcomeqd:
   9469     case Intrinsic::x86_xop_vpcomeqq:
   9470       CC = 4;
   9471       Opc = X86ISD::VPCOM;
   9472       break;
   9473     case Intrinsic::x86_xop_vpcomequb:
   9474     case Intrinsic::x86_xop_vpcomequw:
   9475     case Intrinsic::x86_xop_vpcomequd:
   9476     case Intrinsic::x86_xop_vpcomequq:
   9477       CC = 4;
   9478       Opc = X86ISD::VPCOMU;
   9479       break;
   9480     case Intrinsic::x86_xop_vpcomneb:
   9481     case Intrinsic::x86_xop_vpcomnew:
   9482     case Intrinsic::x86_xop_vpcomned:
   9483     case Intrinsic::x86_xop_vpcomneq:
   9484       CC = 5;
   9485       Opc = X86ISD::VPCOM;
   9486       break;
   9487     case Intrinsic::x86_xop_vpcomneub:
   9488     case Intrinsic::x86_xop_vpcomneuw:
   9489     case Intrinsic::x86_xop_vpcomneud:
   9490     case Intrinsic::x86_xop_vpcomneuq:
   9491       CC = 5;
   9492       Opc = X86ISD::VPCOMU;
   9493       break;
   9494     case Intrinsic::x86_xop_vpcomfalseb:
   9495     case Intrinsic::x86_xop_vpcomfalsew:
   9496     case Intrinsic::x86_xop_vpcomfalsed:
   9497     case Intrinsic::x86_xop_vpcomfalseq:
   9498       CC = 6;
   9499       Opc = X86ISD::VPCOM;
   9500       break;
   9501     case Intrinsic::x86_xop_vpcomfalseub:
   9502     case Intrinsic::x86_xop_vpcomfalseuw:
   9503     case Intrinsic::x86_xop_vpcomfalseud:
   9504     case Intrinsic::x86_xop_vpcomfalseuq:
   9505       CC = 6;
   9506       Opc = X86ISD::VPCOMU;
   9507       break;
   9508     case Intrinsic::x86_xop_vpcomtrueb:
   9509     case Intrinsic::x86_xop_vpcomtruew:
   9510     case Intrinsic::x86_xop_vpcomtrued:
   9511     case Intrinsic::x86_xop_vpcomtrueq:
   9512       CC = 7;
   9513       Opc = X86ISD::VPCOM;
   9514       break;
   9515     case Intrinsic::x86_xop_vpcomtrueub:
   9516     case Intrinsic::x86_xop_vpcomtrueuw:
   9517     case Intrinsic::x86_xop_vpcomtrueud:
   9518     case Intrinsic::x86_xop_vpcomtrueuq:
   9519       CC = 7;
   9520       Opc = X86ISD::VPCOMU;
   9521       break;
   9522     }
   9523 
   9524     SDValue LHS = Op.getOperand(1);
   9525     SDValue RHS = Op.getOperand(2);
   9526     return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
   9527                        DAG.getConstant(CC, MVT::i8));
   9528   }
   9529 
   9530   // Arithmetic intrinsics.
   9531   case Intrinsic::x86_sse2_pmulu_dq:
   9532   case Intrinsic::x86_avx2_pmulu_dq:
   9533     return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
   9534                        Op.getOperand(1), Op.getOperand(2));
   9535   case Intrinsic::x86_sse3_hadd_ps:
   9536   case Intrinsic::x86_sse3_hadd_pd:
   9537   case Intrinsic::x86_avx_hadd_ps_256:
   9538   case Intrinsic::x86_avx_hadd_pd_256:
   9539     return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
   9540                        Op.getOperand(1), Op.getOperand(2));
   9541   case Intrinsic::x86_sse3_hsub_ps:
   9542   case Intrinsic::x86_sse3_hsub_pd:
   9543   case Intrinsic::x86_avx_hsub_ps_256:
   9544   case Intrinsic::x86_avx_hsub_pd_256:
   9545     return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
   9546                        Op.getOperand(1), Op.getOperand(2));
   9547   case Intrinsic::x86_ssse3_phadd_w_128:
   9548   case Intrinsic::x86_ssse3_phadd_d_128:
   9549   case Intrinsic::x86_avx2_phadd_w:
   9550   case Intrinsic::x86_avx2_phadd_d:
   9551     return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(),
   9552                        Op.getOperand(1), Op.getOperand(2));
   9553   case Intrinsic::x86_ssse3_phsub_w_128:
   9554   case Intrinsic::x86_ssse3_phsub_d_128:
   9555   case Intrinsic::x86_avx2_phsub_w:
   9556   case Intrinsic::x86_avx2_phsub_d:
   9557     return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(),
   9558                        Op.getOperand(1), Op.getOperand(2));
   9559   case Intrinsic::x86_avx2_psllv_d:
   9560   case Intrinsic::x86_avx2_psllv_q:
   9561   case Intrinsic::x86_avx2_psllv_d_256:
   9562   case Intrinsic::x86_avx2_psllv_q_256:
   9563     return DAG.getNode(ISD::SHL, dl, Op.getValueType(),
   9564                       Op.getOperand(1), Op.getOperand(2));
   9565   case Intrinsic::x86_avx2_psrlv_d:
   9566   case Intrinsic::x86_avx2_psrlv_q:
   9567   case Intrinsic::x86_avx2_psrlv_d_256:
   9568   case Intrinsic::x86_avx2_psrlv_q_256:
   9569     return DAG.getNode(ISD::SRL, dl, Op.getValueType(),
   9570                       Op.getOperand(1), Op.getOperand(2));
   9571   case Intrinsic::x86_avx2_psrav_d:
   9572   case Intrinsic::x86_avx2_psrav_d_256:
   9573     return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
   9574                       Op.getOperand(1), Op.getOperand(2));
   9575   case Intrinsic::x86_ssse3_pshuf_b_128:
   9576   case Intrinsic::x86_avx2_pshuf_b:
   9577     return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
   9578                        Op.getOperand(1), Op.getOperand(2));
   9579   case Intrinsic::x86_ssse3_psign_b_128:
   9580   case Intrinsic::x86_ssse3_psign_w_128:
   9581   case Intrinsic::x86_ssse3_psign_d_128:
   9582   case Intrinsic::x86_avx2_psign_b:
   9583   case Intrinsic::x86_avx2_psign_w:
   9584   case Intrinsic::x86_avx2_psign_d:
   9585     return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
   9586                        Op.getOperand(1), Op.getOperand(2));
   9587   case Intrinsic::x86_sse41_insertps:
   9588     return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
   9589                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   9590   case Intrinsic::x86_avx_vperm2f128_ps_256:
   9591   case Intrinsic::x86_avx_vperm2f128_pd_256:
   9592   case Intrinsic::x86_avx_vperm2f128_si_256:
   9593   case Intrinsic::x86_avx2_vperm2i128:
   9594     return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
   9595                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   9596   case Intrinsic::x86_avx2_permd:
   9597   case Intrinsic::x86_avx2_permps:
   9598     // Operands intentionally swapped. The mask is the last operand to the
   9599     // intrinsic, but the second operand for the node/instruction.
   9600     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
   9601                        Op.getOperand(2), Op.getOperand(1));
   9602 
   9603   // ptest and testp intrinsics. The intrinsics these come from are designed to
   9604   // return an integer value, not just an instruction, so lower them to the
   9605   // ptest or testp pattern and a setcc for the result.
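          // For example (roughly): ptestz(a, b) becomes "ptest a, b; sete; movzx",
          // i.e. PTEST sets ZF, the SETCC below reads it, and the i8 result is
          // zero-extended to i32.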
   9606   case Intrinsic::x86_sse41_ptestz:
   9607   case Intrinsic::x86_sse41_ptestc:
   9608   case Intrinsic::x86_sse41_ptestnzc:
   9609   case Intrinsic::x86_avx_ptestz_256:
   9610   case Intrinsic::x86_avx_ptestc_256:
   9611   case Intrinsic::x86_avx_ptestnzc_256:
   9612   case Intrinsic::x86_avx_vtestz_ps:
   9613   case Intrinsic::x86_avx_vtestc_ps:
   9614   case Intrinsic::x86_avx_vtestnzc_ps:
   9615   case Intrinsic::x86_avx_vtestz_pd:
   9616   case Intrinsic::x86_avx_vtestc_pd:
   9617   case Intrinsic::x86_avx_vtestnzc_pd:
   9618   case Intrinsic::x86_avx_vtestz_ps_256:
   9619   case Intrinsic::x86_avx_vtestc_ps_256:
   9620   case Intrinsic::x86_avx_vtestnzc_ps_256:
   9621   case Intrinsic::x86_avx_vtestz_pd_256:
   9622   case Intrinsic::x86_avx_vtestc_pd_256:
   9623   case Intrinsic::x86_avx_vtestnzc_pd_256: {
   9624     bool IsTestPacked = false;
   9625     unsigned X86CC = 0;
   9626     switch (IntNo) {
   9627     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
   9628     case Intrinsic::x86_avx_vtestz_ps:
   9629     case Intrinsic::x86_avx_vtestz_pd:
   9630     case Intrinsic::x86_avx_vtestz_ps_256:
   9631     case Intrinsic::x86_avx_vtestz_pd_256:
   9632       IsTestPacked = true; // Fallthrough
   9633     case Intrinsic::x86_sse41_ptestz:
   9634     case Intrinsic::x86_avx_ptestz_256:
   9635       // ZF = 1
   9636       X86CC = X86::COND_E;
   9637       break;
   9638     case Intrinsic::x86_avx_vtestc_ps:
   9639     case Intrinsic::x86_avx_vtestc_pd:
   9640     case Intrinsic::x86_avx_vtestc_ps_256:
   9641     case Intrinsic::x86_avx_vtestc_pd_256:
   9642       IsTestPacked = true; // Fallthrough
   9643     case Intrinsic::x86_sse41_ptestc:
   9644     case Intrinsic::x86_avx_ptestc_256:
   9645       // CF = 1
   9646       X86CC = X86::COND_B;
   9647       break;
   9648     case Intrinsic::x86_avx_vtestnzc_ps:
   9649     case Intrinsic::x86_avx_vtestnzc_pd:
   9650     case Intrinsic::x86_avx_vtestnzc_ps_256:
   9651     case Intrinsic::x86_avx_vtestnzc_pd_256:
   9652       IsTestPacked = true; // Fallthrough
   9653     case Intrinsic::x86_sse41_ptestnzc:
   9654     case Intrinsic::x86_avx_ptestnzc_256:
   9655       // ZF and CF = 0
   9656       X86CC = X86::COND_A;
   9657       break;
   9658     }
   9659 
   9660     SDValue LHS = Op.getOperand(1);
   9661     SDValue RHS = Op.getOperand(2);
   9662     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
   9663     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
   9664     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
   9665     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
   9666     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   9667   }
   9668 
   9669   // SSE/AVX shift intrinsics
   9670   case Intrinsic::x86_sse2_psll_w:
   9671   case Intrinsic::x86_sse2_psll_d:
   9672   case Intrinsic::x86_sse2_psll_q:
   9673   case Intrinsic::x86_avx2_psll_w:
   9674   case Intrinsic::x86_avx2_psll_d:
   9675   case Intrinsic::x86_avx2_psll_q:
   9676     return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(),
   9677                        Op.getOperand(1), Op.getOperand(2));
   9678   case Intrinsic::x86_sse2_psrl_w:
   9679   case Intrinsic::x86_sse2_psrl_d:
   9680   case Intrinsic::x86_sse2_psrl_q:
   9681   case Intrinsic::x86_avx2_psrl_w:
   9682   case Intrinsic::x86_avx2_psrl_d:
   9683   case Intrinsic::x86_avx2_psrl_q:
   9684     return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(),
   9685                        Op.getOperand(1), Op.getOperand(2));
   9686   case Intrinsic::x86_sse2_psra_w:
   9687   case Intrinsic::x86_sse2_psra_d:
   9688   case Intrinsic::x86_avx2_psra_w:
   9689   case Intrinsic::x86_avx2_psra_d:
   9690     return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(),
   9691                        Op.getOperand(1), Op.getOperand(2));
   9692   case Intrinsic::x86_sse2_pslli_w:
   9693   case Intrinsic::x86_sse2_pslli_d:
   9694   case Intrinsic::x86_sse2_pslli_q:
   9695   case Intrinsic::x86_avx2_pslli_w:
   9696   case Intrinsic::x86_avx2_pslli_d:
   9697   case Intrinsic::x86_avx2_pslli_q:
   9698     return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(),
   9699                                Op.getOperand(1), Op.getOperand(2), DAG);
   9700   case Intrinsic::x86_sse2_psrli_w:
   9701   case Intrinsic::x86_sse2_psrli_d:
   9702   case Intrinsic::x86_sse2_psrli_q:
   9703   case Intrinsic::x86_avx2_psrli_w:
   9704   case Intrinsic::x86_avx2_psrli_d:
   9705   case Intrinsic::x86_avx2_psrli_q:
   9706     return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(),
   9707                                Op.getOperand(1), Op.getOperand(2), DAG);
   9708   case Intrinsic::x86_sse2_psrai_w:
   9709   case Intrinsic::x86_sse2_psrai_d:
   9710   case Intrinsic::x86_avx2_psrai_w:
   9711   case Intrinsic::x86_avx2_psrai_d:
   9712     return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(),
   9713                                Op.getOperand(1), Op.getOperand(2), DAG);
   9714   // Fix vector shift instructions where the last operand is a non-immediate
   9715   // i32 value.
   9716   case Intrinsic::x86_mmx_pslli_w:
   9717   case Intrinsic::x86_mmx_pslli_d:
   9718   case Intrinsic::x86_mmx_pslli_q:
   9719   case Intrinsic::x86_mmx_psrli_w:
   9720   case Intrinsic::x86_mmx_psrli_d:
   9721   case Intrinsic::x86_mmx_psrli_q:
   9722   case Intrinsic::x86_mmx_psrai_w:
   9723   case Intrinsic::x86_mmx_psrai_d: {
   9724     SDValue ShAmt = Op.getOperand(2);
   9725     if (isa<ConstantSDNode>(ShAmt))
   9726       return SDValue();
   9727 
   9728     unsigned NewIntNo = 0;
   9729     switch (IntNo) {
   9730     case Intrinsic::x86_mmx_pslli_w:
   9731       NewIntNo = Intrinsic::x86_mmx_psll_w;
   9732       break;
   9733     case Intrinsic::x86_mmx_pslli_d:
   9734       NewIntNo = Intrinsic::x86_mmx_psll_d;
   9735       break;
   9736     case Intrinsic::x86_mmx_pslli_q:
   9737       NewIntNo = Intrinsic::x86_mmx_psll_q;
   9738       break;
   9739     case Intrinsic::x86_mmx_psrli_w:
   9740       NewIntNo = Intrinsic::x86_mmx_psrl_w;
   9741       break;
   9742     case Intrinsic::x86_mmx_psrli_d:
   9743       NewIntNo = Intrinsic::x86_mmx_psrl_d;
   9744       break;
   9745     case Intrinsic::x86_mmx_psrli_q:
   9746       NewIntNo = Intrinsic::x86_mmx_psrl_q;
   9747       break;
   9748     case Intrinsic::x86_mmx_psrai_w:
   9749       NewIntNo = Intrinsic::x86_mmx_psra_w;
   9750       break;
   9751     case Intrinsic::x86_mmx_psrai_d:
   9752       NewIntNo = Intrinsic::x86_mmx_psra_d;
   9753       break;
   9754     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   9755     }
   9756 
    9757     // The vector shift intrinsics with scalar shift amounts use 32-bit values,
    9758     // but the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
    9759     // to zero.
   9760     ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
   9761                          DAG.getConstant(0, MVT::i32));
    9762     // FIXME: this must be lowered to get rid of the invalid type.
   9763 
   9764     EVT VT = Op.getValueType();
   9765     ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
   9766     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
   9767                        DAG.getConstant(NewIntNo, MVT::i32),
   9768                        Op.getOperand(1), ShAmt);
   9769   }
   9770   }
   9771 }
   9772 
   9773 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
   9774                                            SelectionDAG &DAG) const {
   9775   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   9776   MFI->setReturnAddressIsTaken(true);
   9777 
   9778   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   9779   DebugLoc dl = Op.getDebugLoc();
   9780 
   9781   if (Depth > 0) {
   9782     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
   9783     SDValue Offset =
   9784       DAG.getConstant(TD->getPointerSize(),
   9785                       Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
   9786     return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
   9787                        DAG.getNode(ISD::ADD, dl, getPointerTy(),
   9788                                    FrameAddr, Offset),
   9789                        MachinePointerInfo(), false, false, false, 0);
   9790   }
   9791 
   9792   // Just load the return address.
   9793   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
   9794   return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
   9795                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
   9796 }
   9797 
   9798 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   9799   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   9800   MFI->setFrameAddressIsTaken(true);
   9801 
   9802   EVT VT = Op.getValueType();
   9803   DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
   9804   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   9805   unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
   9806   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
   9807   while (Depth--)
   9808     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
   9809                             MachinePointerInfo(),
   9810                             false, false, false, 0);
   9811   return FrameAddr;
   9812 }
   9813 
   9814 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
   9815                                                      SelectionDAG &DAG) const {
   9816   return DAG.getIntPtrConstant(2*TD->getPointerSize());
   9817 }
   9818 
   9819 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   9820   MachineFunction &MF = DAG.getMachineFunction();
   9821   SDValue Chain     = Op.getOperand(0);
   9822   SDValue Offset    = Op.getOperand(1);
   9823   SDValue Handler   = Op.getOperand(2);
   9824   DebugLoc dl       = Op.getDebugLoc();
   9825 
   9826   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
   9827                                      Subtarget->is64Bit() ? X86::RBP : X86::EBP,
   9828                                      getPointerTy());
   9829   unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
   9830 
   9831   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
   9832                                   DAG.getIntPtrConstant(TD->getPointerSize()));
   9833   StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
   9834   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
   9835                        false, false, 0);
   9836   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
   9837   MF.getRegInfo().addLiveOut(StoreAddrReg);
   9838 
   9839   return DAG.getNode(X86ISD::EH_RETURN, dl,
   9840                      MVT::Other,
   9841                      Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
   9842 }
   9843 
   9844 SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
   9845                                                   SelectionDAG &DAG) const {
   9846   return Op.getOperand(0);
   9847 }
   9848 
   9849 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   9850                                                 SelectionDAG &DAG) const {
   9851   SDValue Root = Op.getOperand(0);
   9852   SDValue Trmp = Op.getOperand(1); // trampoline
   9853   SDValue FPtr = Op.getOperand(2); // nested function
   9854   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
   9855   DebugLoc dl  = Op.getDebugLoc();
   9856 
   9857   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   9858 
   9859   if (Subtarget->is64Bit()) {
   9860     SDValue OutChains[6];
   9861 
   9862     // Large code-model.
   9863     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
   9864     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
   9865 
   9866     const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
   9867     const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
   9868 
   9869     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
   9870 
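             // Taken together, the six stores below emit roughly this 23-byte sequence:
             //    0:  49 BB <imm64>   movabsq $<nested fn>, %r11
             //   10:  49 BA <imm64>   movabsq $<nest value>, %r10
             //   20:  49 FF E3        jmpq    *%r11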
   9871     // Load the pointer to the nested function into R11.
   9872     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
   9873     SDValue Addr = Trmp;
   9874     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   9875                                 Addr, MachinePointerInfo(TrmpAddr),
   9876                                 false, false, 0);
   9877 
   9878     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   9879                        DAG.getConstant(2, MVT::i64));
   9880     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
   9881                                 MachinePointerInfo(TrmpAddr, 2),
   9882                                 false, false, 2);
   9883 
   9884     // Load the 'nest' parameter value into R10.
   9885     // R10 is specified in X86CallingConv.td
   9886     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
   9887     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   9888                        DAG.getConstant(10, MVT::i64));
   9889     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   9890                                 Addr, MachinePointerInfo(TrmpAddr, 10),
   9891                                 false, false, 0);
   9892 
   9893     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   9894                        DAG.getConstant(12, MVT::i64));
   9895     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
   9896                                 MachinePointerInfo(TrmpAddr, 12),
   9897                                 false, false, 2);
   9898 
   9899     // Jump to the nested function.
   9900     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
   9901     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   9902                        DAG.getConstant(20, MVT::i64));
   9903     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   9904                                 Addr, MachinePointerInfo(TrmpAddr, 20),
   9905                                 false, false, 0);
   9906 
   9907     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
   9908     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   9909                        DAG.getConstant(22, MVT::i64));
   9910     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
   9911                                 MachinePointerInfo(TrmpAddr, 22),
   9912                                 false, false, 0);
   9913 
   9914     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
   9915   } else {
   9916     const Function *Func =
   9917       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
   9918     CallingConv::ID CC = Func->getCallingConv();
   9919     unsigned NestReg;
   9920 
   9921     switch (CC) {
   9922     default:
   9923       llvm_unreachable("Unsupported calling convention");
   9924     case CallingConv::C:
   9925     case CallingConv::X86_StdCall: {
   9926       // Pass 'nest' parameter in ECX.
   9927       // Must be kept in sync with X86CallingConv.td
   9928       NestReg = X86::ECX;
   9929 
   9930       // Check that ECX wasn't needed by an 'inreg' parameter.
   9931       FunctionType *FTy = Func->getFunctionType();
   9932       const AttrListPtr &Attrs = Func->getAttributes();
   9933 
   9934       if (!Attrs.isEmpty() && !Func->isVarArg()) {
   9935         unsigned InRegCount = 0;
   9936         unsigned Idx = 1;
   9937 
   9938         for (FunctionType::param_iterator I = FTy->param_begin(),
   9939              E = FTy->param_end(); I != E; ++I, ++Idx)
   9940           if (Attrs.paramHasAttr(Idx, Attribute::InReg))
   9941             // FIXME: should only count parameters that are lowered to integers.
   9942             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
   9943 
   9944         if (InRegCount > 2) {
   9945           report_fatal_error("Nest register in use - reduce number of inreg"
   9946                              " parameters!");
   9947         }
   9948       }
   9949       break;
   9950     }
   9951     case CallingConv::X86_FastCall:
   9952     case CallingConv::X86_ThisCall:
   9953     case CallingConv::Fast:
   9954       // Pass 'nest' parameter in EAX.
   9955       // Must be kept in sync with X86CallingConv.td
   9956       NestReg = X86::EAX;
   9957       break;
   9958     }
   9959 
   9960     SDValue OutChains[4];
   9961     SDValue Addr, Disp;
   9962 
   9963     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   9964                        DAG.getConstant(10, MVT::i32));
   9965     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
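             // Disp becomes the rel32 field of the E9 jmp stored below at offset 5;
             // the displacement is measured from the end of that instruction
             // (Trmp + 10), hence FPtr - (Trmp + 10).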
   9966 
   9967     // This is storing the opcode for MOV32ri.
   9968     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
   9969     const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
   9970     OutChains[0] = DAG.getStore(Root, dl,
   9971                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
   9972                                 Trmp, MachinePointerInfo(TrmpAddr),
   9973                                 false, false, 0);
   9974 
   9975     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   9976                        DAG.getConstant(1, MVT::i32));
   9977     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
   9978                                 MachinePointerInfo(TrmpAddr, 1),
   9979                                 false, false, 1);
   9980 
   9981     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
   9982     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   9983                        DAG.getConstant(5, MVT::i32));
   9984     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
   9985                                 MachinePointerInfo(TrmpAddr, 5),
   9986                                 false, false, 1);
   9987 
   9988     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   9989                        DAG.getConstant(6, MVT::i32));
   9990     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
   9991                                 MachinePointerInfo(TrmpAddr, 6),
   9992                                 false, false, 1);
   9993 
   9994     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
   9995   }
   9996 }
   9997 
   9998 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   9999                                             SelectionDAG &DAG) const {
   10000   /*
    10001    The rounding mode is in bits 11:10 of the x87 FP control word (FPCW),
    10002    and has the following settings:
   10003      00 Round to nearest
   10004      01 Round to -inf
   10005      10 Round to +inf
   10006      11 Round to 0
   10007 
   10008   FLT_ROUNDS, on the other hand, expects the following:
   10009     -1 Undefined
   10010      0 Round to 0
   10011      1 Round to nearest
   10012      2 Round to +inf
   10013      3 Round to -inf
   10014 
   10015   To perform the conversion, we do:
    10016     (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
   10017   */
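            // Walking that formula through the four settings (0x800 >> 11 == 1,
            // 0x400 >> 9 == 2):
            //   RC = 00 (nearest): (0 | 0) + 1 = 1        -> 1 (to nearest)
            //   RC = 01 (-inf):    (0 | 2) + 1 = 3        -> 3 (to -inf)
            //   RC = 10 (+inf):    (1 | 0) + 1 = 2        -> 2 (to +inf)
            //   RC = 11 (to 0):    (1 | 2) + 1 = 4, & 3   -> 0 (to 0)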
   10018 
   10019   MachineFunction &MF = DAG.getMachineFunction();
   10020   const TargetMachine &TM = MF.getTarget();
   10021   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   10022   unsigned StackAlignment = TFI.getStackAlignment();
   10023   EVT VT = Op.getValueType();
   10024   DebugLoc DL = Op.getDebugLoc();
   10025 
   10026   // Save FP Control Word to stack slot
   10027   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
   10028   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   10029 
   10030 
   10031   MachineMemOperand *MMO =
   10032    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   10033                            MachineMemOperand::MOStore, 2, 2);
   10034 
   10035   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
   10036   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
   10037                                           DAG.getVTList(MVT::Other),
   10038                                           Ops, 2, MVT::i16, MMO);
   10039 
   10040   // Load FP Control Word from stack slot
   10041   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
   10042                             MachinePointerInfo(), false, false, false, 0);
   10043 
    10044   // Transform the control-word bits into the FLT_ROUNDS encoding above.
   10045   SDValue CWD1 =
   10046     DAG.getNode(ISD::SRL, DL, MVT::i16,
   10047                 DAG.getNode(ISD::AND, DL, MVT::i16,
   10048                             CWD, DAG.getConstant(0x800, MVT::i16)),
   10049                 DAG.getConstant(11, MVT::i8));
   10050   SDValue CWD2 =
   10051     DAG.getNode(ISD::SRL, DL, MVT::i16,
   10052                 DAG.getNode(ISD::AND, DL, MVT::i16,
   10053                             CWD, DAG.getConstant(0x400, MVT::i16)),
   10054                 DAG.getConstant(9, MVT::i8));
   10055 
   10056   SDValue RetVal =
   10057     DAG.getNode(ISD::AND, DL, MVT::i16,
   10058                 DAG.getNode(ISD::ADD, DL, MVT::i16,
   10059                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
   10060                             DAG.getConstant(1, MVT::i16)),
   10061                 DAG.getConstant(3, MVT::i16));
   10062 
   10063 
   10064   return DAG.getNode((VT.getSizeInBits() < 16 ?
   10065                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
   10066 }
   10067 
   10068 SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
   10069   EVT VT = Op.getValueType();
   10070   EVT OpVT = VT;
   10071   unsigned NumBits = VT.getSizeInBits();
   10072   DebugLoc dl = Op.getDebugLoc();
   10073 
   10074   Op = Op.getOperand(0);
   10075   if (VT == MVT::i8) {
    10076     // Zero-extend to i32 since there is no i8 bsr.
   10077     OpVT = MVT::i32;
   10078     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   10079   }
   10080 
   10081   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
   10082   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   10083   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   10084 
   10085   // If src is zero (i.e. bsr sets ZF), returns NumBits.
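            // The constant below is 2*NumBits-1 rather than NumBits so that the final
            // xor with NumBits-1 maps it back to NumBits (e.g. 63 ^ 31 == 32 for i32).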
   10086   SDValue Ops[] = {
   10087     Op,
   10088     DAG.getConstant(NumBits+NumBits-1, OpVT),
   10089     DAG.getConstant(X86::COND_E, MVT::i8),
   10090     Op.getValue(1)
   10091   };
   10092   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
   10093 
   10094   // Finally xor with NumBits-1.
   10095   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
   10096 
   10097   if (VT == MVT::i8)
   10098     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   10099   return Op;
   10100 }
   10101 
   10102 SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
   10103                                                 SelectionDAG &DAG) const {
   10104   EVT VT = Op.getValueType();
   10105   EVT OpVT = VT;
   10106   unsigned NumBits = VT.getSizeInBits();
   10107   DebugLoc dl = Op.getDebugLoc();
   10108 
   10109   Op = Op.getOperand(0);
   10110   if (VT == MVT::i8) {
    10111     // Zero-extend to i32 since there is no i8 bsr.
   10112     OpVT = MVT::i32;
   10113     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   10114   }
   10115 
   10116   // Issue a bsr (scan bits in reverse).
   10117   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   10118   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   10119 
   10120   // And xor with NumBits-1.
   10121   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
   10122 
   10123   if (VT == MVT::i8)
   10124     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   10125   return Op;
   10126 }
   10127 
   10128 SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
   10129   EVT VT = Op.getValueType();
   10130   unsigned NumBits = VT.getSizeInBits();
   10131   DebugLoc dl = Op.getDebugLoc();
   10132   Op = Op.getOperand(0);
   10133 
   10134   // Issue a bsf (scan bits forward) which also sets EFLAGS.
   10135   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   10136   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
   10137 
   10138   // If src is zero (i.e. bsf sets ZF), returns NumBits.
   10139   SDValue Ops[] = {
   10140     Op,
   10141     DAG.getConstant(NumBits, VT),
   10142     DAG.getConstant(X86::COND_E, MVT::i8),
   10143     Op.getValue(1)
   10144   };
   10145   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
   10146 }
   10147 
   10148 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
   10149 // ones, and then concatenate the result back.
   10150 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
   10151   EVT VT = Op.getValueType();
   10152 
   10153   assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
   10154          "Unsupported value type for operation");
   10155 
   10156   int NumElems = VT.getVectorNumElements();
   10157   DebugLoc dl = Op.getDebugLoc();
   10158   SDValue Idx0 = DAG.getConstant(0, MVT::i32);
   10159   SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
   10160 
   10161   // Extract the LHS vectors
   10162   SDValue LHS = Op.getOperand(0);
   10163   SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
   10164   SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
   10165 
   10166   // Extract the RHS vectors
   10167   SDValue RHS = Op.getOperand(1);
   10168   SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
   10169   SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
   10170 
   10171   MVT EltVT = VT.getVectorElementType().getSimpleVT();
   10172   EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   10173 
   10174   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   10175                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
   10176                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
   10177 }
   10178 
   10179 SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
   10180   assert(Op.getValueType().getSizeInBits() == 256 &&
   10181          Op.getValueType().isInteger() &&
   10182          "Only handle AVX 256-bit vector integer operation");
   10183   return Lower256IntArith(Op, DAG);
   10184 }
   10185 
   10186 SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
   10187   assert(Op.getValueType().getSizeInBits() == 256 &&
   10188          Op.getValueType().isInteger() &&
   10189          "Only handle AVX 256-bit vector integer operation");
   10190   return Lower256IntArith(Op, DAG);
   10191 }
   10192 
   10193 SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   10194   EVT VT = Op.getValueType();
   10195 
   10196   // Decompose 256-bit ops into smaller 128-bit ops.
   10197   if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
   10198     return Lower256IntArith(Op, DAG);
   10199 
   10200   assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
   10201          "Only know how to lower V2I64/V4I64 multiply");
   10202 
   10203   DebugLoc dl = Op.getDebugLoc();
   10204 
   10205   //  Ahi = psrlqi(a, 32);
   10206   //  Bhi = psrlqi(b, 32);
   10207   //
   10208   //  AloBlo = pmuludq(a, b);
   10209   //  AloBhi = pmuludq(a, Bhi);
   10210   //  AhiBlo = pmuludq(Ahi, b);
   10211 
   10212   //  AloBhi = psllqi(AloBhi, 32);
   10213   //  AhiBlo = psllqi(AhiBlo, 32);
   10214   //  return AloBlo + AloBhi + AhiBlo;
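            //
            //  Writing a = Ahi*2^32 + Alo and b = Bhi*2^32 + Blo, the product modulo
            //  2^64 is Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32); the Ahi*Bhi term is
            //  shifted out entirely, so the three pmuludq (32x32->64) multiplies
            //  above suffice.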
   10215 
   10216   SDValue A = Op.getOperand(0);
   10217   SDValue B = Op.getOperand(1);
   10218 
   10219   SDValue ShAmt = DAG.getConstant(32, MVT::i32);
   10220 
   10221   SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
   10222   SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
   10223 
   10224   // Bit cast to 32-bit vectors for MULUDQ
   10225   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
   10226   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
   10227   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
   10228   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
   10229   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
   10230 
   10231   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
   10232   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
   10233   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
   10234 
   10235   AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
   10236   AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
   10237 
   10238   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
   10239   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
   10240 }
   10241 
   10242 SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   10243 
   10244   EVT VT = Op.getValueType();
   10245   DebugLoc dl = Op.getDebugLoc();
   10246   SDValue R = Op.getOperand(0);
   10247   SDValue Amt = Op.getOperand(1);
   10248   LLVMContext *Context = DAG.getContext();
   10249 
   10250   if (!Subtarget->hasSSE2())
   10251     return SDValue();
   10252 
   10253   // Optimize shl/srl/sra with constant shift amount.
   10254   if (isSplatVector(Amt.getNode())) {
   10255     SDValue SclrAmt = Amt->getOperand(0);
   10256     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
   10257       uint64_t ShiftAmt = C->getZExtValue();
   10258 
   10259       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
   10260           (Subtarget->hasAVX2() &&
   10261            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
   10262         if (Op.getOpcode() == ISD::SHL)
   10263           return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
   10264                              DAG.getConstant(ShiftAmt, MVT::i32));
   10265         if (Op.getOpcode() == ISD::SRL)
   10266           return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
   10267                              DAG.getConstant(ShiftAmt, MVT::i32));
   10268         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
   10269           return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
   10270                              DAG.getConstant(ShiftAmt, MVT::i32));
   10271       }
   10272 
   10273       if (VT == MVT::v16i8) {
   10274         if (Op.getOpcode() == ISD::SHL) {
   10275           // Make a large shift.
   10276           SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
   10277                                     DAG.getConstant(ShiftAmt, MVT::i32));
   10278           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
   10279           // Zero out the rightmost bits.
   10280           SmallVector<SDValue, 16> V(16,
   10281                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
   10282                                                      MVT::i8));
   10283           return DAG.getNode(ISD::AND, dl, VT, SHL,
   10284                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
   10285         }
   10286         if (Op.getOpcode() == ISD::SRL) {
   10287           // Make a large shift.
   10288           SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
   10289                                     DAG.getConstant(ShiftAmt, MVT::i32));
   10290           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
   10291           // Zero out the leftmost bits.
   10292           SmallVector<SDValue, 16> V(16,
   10293                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
   10294                                                      MVT::i8));
   10295           return DAG.getNode(ISD::AND, dl, VT, SRL,
   10296                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
   10297         }
   10298         if (Op.getOpcode() == ISD::SRA) {
   10299           if (ShiftAmt == 7) {
   10300             // R s>> 7  ===  R s< 0
   10301             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   10302             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
   10303           }
   10304 
   10305           // R s>> a === ((R u>> a) ^ m) - m
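                      // where m = 0x80 >> a, the bit where the sign lands after the
                      // logical shift. E.g. R = 0xF0 (-16), a = 4:
                      // (0x0F ^ 0x08) - 0x08 = -1 (0xFF), i.e. -16 s>> 4.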
   10306           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   10307           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
   10308                                                          MVT::i8));
   10309           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
   10310           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
   10311           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
   10312           return Res;
   10313         }
   10314       }
   10315 
   10316       if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
   10317         if (Op.getOpcode() == ISD::SHL) {
   10318           // Make a large shift.
   10319           SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
   10320                                     DAG.getConstant(ShiftAmt, MVT::i32));
   10321           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
   10322           // Zero out the rightmost bits.
   10323           SmallVector<SDValue, 32> V(32,
   10324                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
   10325                                                      MVT::i8));
   10326           return DAG.getNode(ISD::AND, dl, VT, SHL,
   10327                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
   10328         }
   10329         if (Op.getOpcode() == ISD::SRL) {
   10330           // Make a large shift.
   10331           SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
   10332                                     DAG.getConstant(ShiftAmt, MVT::i32));
   10333           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
   10334           // Zero out the leftmost bits.
   10335           SmallVector<SDValue, 32> V(32,
   10336                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
   10337                                                      MVT::i8));
   10338           return DAG.getNode(ISD::AND, dl, VT, SRL,
   10339                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
   10340         }
   10341         if (Op.getOpcode() == ISD::SRA) {
   10342           if (ShiftAmt == 7) {
   10343             // R s>> 7  ===  R s< 0
   10344             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   10345             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
   10346           }
   10347 
   10348           // R s>> a === ((R u>> a) ^ m) - m
   10349           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   10350           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
   10351                                                          MVT::i8));
   10352           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
   10353           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
   10354           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
   10355           return Res;
   10356         }
   10357       }
   10358     }
   10359   }
   10360 
   10361   // Lower SHL with variable shift amount.
   10362   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
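              // The per-element shift amount is moved into the exponent field of an
              // IEEE single (amt << 23, plus the bias 0x3f800000 == 1.0f), so each
              // lane bitcast to float equals 2^amt; converting back to integer gives
              // 1 << amt, and the final vector multiply by R performs the shift.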
   10363     Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
   10364                      DAG.getConstant(23, MVT::i32));
   10365 
   10366     const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
   10367     Constant *C = ConstantDataVector::get(*Context, CV);
   10368     SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   10369     SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   10370                                  MachinePointerInfo::getConstantPool(),
   10371                                  false, false, false, 16);
   10372 
   10373     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
   10374     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
   10375     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
   10376     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   10377   }
   10378   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
   10379     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
   10380 
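              // Handle the shift amount one bit at a time: a << 5 moves amount bit 2
              // into the sign bit of each byte, pcmpeqb against 0x80 turns that into
              // a per-lane mask, and each VSELECT round below conditionally shifts R
              // by 4, then 2, then 1, doubling 'a' between rounds to expose the next
              // amount bit.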
   10381     // a = a << 5;
   10382     Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1),
   10383                      DAG.getConstant(5, MVT::i32));
   10384     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
   10385 
   10386     // Turn 'a' into a mask suitable for VSELECT
   10387     SDValue VSelM = DAG.getConstant(0x80, VT);
   10388     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   10389     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   10390 
   10391     SDValue CM1 = DAG.getConstant(0x0f, VT);
   10392     SDValue CM2 = DAG.getConstant(0x3f, VT);
   10393 
   10394     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
   10395     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
   10396     M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
   10397                             DAG.getConstant(4, MVT::i32), DAG);
   10398     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
   10399     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
   10400 
   10401     // a += a
   10402     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
   10403     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   10404     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   10405 
   10406     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
   10407     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
   10408     M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
   10409                             DAG.getConstant(2, MVT::i32), DAG);
   10410     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
   10411     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
   10412 
   10413     // a += a
   10414     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
   10415     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   10416     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   10417 
   10418     // return VSELECT(r, r+r, a);
   10419     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
   10420                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
   10421     return R;
   10422   }
   10423 
   10424   // Decompose 256-bit shifts into smaller 128-bit shifts.
   10425   if (VT.getSizeInBits() == 256) {
   10426     unsigned NumElems = VT.getVectorNumElements();
   10427     MVT EltVT = VT.getVectorElementType().getSimpleVT();
   10428     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   10429 
   10430     // Extract the two vectors
   10431     SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
   10432     SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
   10433                                      DAG, dl);
   10434 
   10435     // Recreate the shift amount vectors
   10436     SDValue Amt1, Amt2;
   10437     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
   10438       // Constant shift amount
   10439       SmallVector<SDValue, 4> Amt1Csts;
   10440       SmallVector<SDValue, 4> Amt2Csts;
   10441       for (unsigned i = 0; i != NumElems/2; ++i)
   10442         Amt1Csts.push_back(Amt->getOperand(i));
   10443       for (unsigned i = NumElems/2; i != NumElems; ++i)
   10444         Amt2Csts.push_back(Amt->getOperand(i));
   10445 
   10446       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
   10447                                  &Amt1Csts[0], NumElems/2);
   10448       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
   10449                                  &Amt2Csts[0], NumElems/2);
   10450     } else {
   10451       // Variable shift amount
   10452       Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
   10453       Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
   10454                                  DAG, dl);
   10455     }
   10456 
   10457     // Issue new vector shifts for the smaller types
   10458     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
   10459     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
   10460 
   10461     // Concatenate the result back
   10462     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
   10463   }
   10464 
   10465   return SDValue();
   10466 }
   10467 
   10468 SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
    10469   // Lower the "add/sub/mul with overflow" instruction into the corresponding
    10470   // regular instruction plus a "setcc" instruction that checks the overflow
    10471   // flag. The "brcond" lowering looks for this combo and may remove the
    10472   // "setcc" instruction if the "setcc" has only one use.
   10473   SDNode *N = Op.getNode();
   10474   SDValue LHS = N->getOperand(0);
   10475   SDValue RHS = N->getOperand(1);
   10476   unsigned BaseOp = 0;
   10477   unsigned Cond = 0;
   10478   DebugLoc DL = Op.getDebugLoc();
   10479   switch (Op.getOpcode()) {
   10480   default: llvm_unreachable("Unknown ovf instruction!");
   10481   case ISD::SADDO:
    10482     // An add of one will be selected as an INC. Note that INC doesn't
   10483     // set CF, so we can't do this for UADDO.
   10484     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   10485       if (C->isOne()) {
   10486         BaseOp = X86ISD::INC;
   10487         Cond = X86::COND_O;
   10488         break;
   10489       }
   10490     BaseOp = X86ISD::ADD;
   10491     Cond = X86::COND_O;
   10492     break;
   10493   case ISD::UADDO:
   10494     BaseOp = X86ISD::ADD;
   10495     Cond = X86::COND_B;
   10496     break;
   10497   case ISD::SSUBO:
   10498     // A subtract of one will be selected as a DEC. Note that DEC doesn't
   10499     // set CF, so we can't do this for USUBO.
   10500     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   10501       if (C->isOne()) {
   10502         BaseOp = X86ISD::DEC;
   10503         Cond = X86::COND_O;
   10504         break;
   10505       }
   10506     BaseOp = X86ISD::SUB;
   10507     Cond = X86::COND_O;
   10508     break;
   10509   case ISD::USUBO:
   10510     BaseOp = X86ISD::SUB;
   10511     Cond = X86::COND_B;
   10512     break;
   10513   case ISD::SMULO:
   10514     BaseOp = X86ISD::SMUL;
   10515     Cond = X86::COND_O;
   10516     break;
   10517   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
   10518     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
   10519                                  MVT::i32);
   10520     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
   10521 
   10522     SDValue SetCC =
   10523       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   10524                   DAG.getConstant(X86::COND_O, MVT::i32),
   10525                   SDValue(Sum.getNode(), 2));
   10526 
   10527     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   10528   }
   10529   }
   10530 
   10531   // Also sets EFLAGS.
   10532   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
   10533   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
   10534 
   10535   SDValue SetCC =
   10536     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
   10537                 DAG.getConstant(Cond, MVT::i32),
   10538                 SDValue(Sum.getNode(), 1));
   10539 
   10540   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   10541 }
   10542 
   10543 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
   10544                                                   SelectionDAG &DAG) const {
   10545   DebugLoc dl = Op.getDebugLoc();
   10546   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
   10547   EVT VT = Op.getValueType();
   10548 
   10549   if (!Subtarget->hasSSE2() || !VT.isVector())
   10550     return SDValue();
   10551 
   10552   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
   10553                       ExtraVT.getScalarType().getSizeInBits();
   10554   SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
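            // The in-register sign extension is performed as a shift pair: shift left
            // by BitsDiff to move the source sign bit to the top of each lane, then
            // arithmetic-shift right by the same amount to replicate it downward.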
   10555 
   10556   switch (VT.getSimpleVT().SimpleTy) {
   10557     default: return SDValue();
   10558     case MVT::v8i32:
   10559     case MVT::v16i16:
   10560       if (!Subtarget->hasAVX())
   10561         return SDValue();
   10562       if (!Subtarget->hasAVX2()) {
   10563         // needs to be split
   10564         int NumElems = VT.getVectorNumElements();
   10565         SDValue Idx0 = DAG.getConstant(0, MVT::i32);
   10566         SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
   10567 
   10568         // Extract the LHS vectors
   10569         SDValue LHS = Op.getOperand(0);
   10570         SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
   10571         SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
   10572 
   10573         MVT EltVT = VT.getVectorElementType().getSimpleVT();
   10574         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   10575 
   10576         EVT ExtraEltVT = ExtraVT.getVectorElementType();
   10577         int ExtraNumElems = ExtraVT.getVectorNumElements();
   10578         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
   10579                                    ExtraNumElems/2);
   10580         SDValue Extra = DAG.getValueType(ExtraVT);
   10581 
   10582         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
   10583         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
   10584 
    10585         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
   10586       }
   10587       // fall through
   10588     case MVT::v4i32:
   10589     case MVT::v8i16: {
   10590       SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
   10591                                          Op.getOperand(0), ShAmt, DAG);
   10592       return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
   10593     }
   10594   }
   10595 }
   10596 
   10597 
   10598 SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
   10599   DebugLoc dl = Op.getDebugLoc();
   10600 
   10601   // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
   10602   // There isn't any reason to disable it if the target processor supports it.
   10603   if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
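              // Without SSE2 (and not on x86-64) there is no mfence, so fall back to
              // a locked read-modify-write of the stack slot, "lock orl $0, (%esp)",
              // which also serves as a full memory barrier.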
   10604     SDValue Chain = Op.getOperand(0);
   10605     SDValue Zero = DAG.getConstant(0, MVT::i32);
   10606     SDValue Ops[] = {
   10607       DAG.getRegister(X86::ESP, MVT::i32), // Base
   10608       DAG.getTargetConstant(1, MVT::i8),   // Scale
   10609       DAG.getRegister(0, MVT::i32),        // Index
   10610       DAG.getTargetConstant(0, MVT::i32),  // Disp
   10611       DAG.getRegister(0, MVT::i32),        // Segment.
   10612       Zero,
   10613       Chain
   10614     };
   10615     SDNode *Res =
   10616       DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
   10617                           array_lengthof(Ops));
   10618     return SDValue(Res, 0);
   10619   }
   10620 
   10621   unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
   10622   if (!isDev)
   10623     return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
   10624 
   10625   unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   10626   unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   10627   unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
   10628   unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
   10629 
   10630   // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
   10631   if (!Op1 && !Op2 && !Op3 && Op4)
   10632     return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));
   10633 
   10634   // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
   10635   if (Op1 && !Op2 && !Op3 && !Op4)
   10636     return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));
   10637 
   10638   // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
   10639   //           (MFENCE)>;
   10640   return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
   10641 }
   10642 
   10643 SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
   10644                                              SelectionDAG &DAG) const {
   10645   DebugLoc dl = Op.getDebugLoc();
   10646   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
   10647     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   10648   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
   10649     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   10650 
   10651   // The only fence that needs an instruction is a sequentially-consistent
   10652   // cross-thread fence.
   10653   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
   10654     // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
   10655     // no-sse2). There isn't any reason to disable it if the target processor
   10656     // supports it.
   10657     if (Subtarget->hasSSE2() || Subtarget->is64Bit())
   10658       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
   10659 
   10660     SDValue Chain = Op.getOperand(0);
   10661     SDValue Zero = DAG.getConstant(0, MVT::i32);
   10662     SDValue Ops[] = {
   10663       DAG.getRegister(X86::ESP, MVT::i32), // Base
   10664       DAG.getTargetConstant(1, MVT::i8),   // Scale
   10665       DAG.getRegister(0, MVT::i32),        // Index
   10666       DAG.getTargetConstant(0, MVT::i32),  // Disp
   10667       DAG.getRegister(0, MVT::i32),        // Segment.
   10668       Zero,
   10669       Chain
   10670     };
   10671     SDNode *Res =
   10672       DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
   10673                          array_lengthof(Ops));
   10674     return SDValue(Res, 0);
   10675   }
   10676 
   10677   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
   10678   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
   10679 }
   10680 
   10681 
   10682 SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
   10683   EVT T = Op.getValueType();
   10684   DebugLoc DL = Op.getDebugLoc();
   10685   unsigned Reg = 0;
   10686   unsigned size = 0;
   10687   switch(T.getSimpleVT().SimpleTy) {
   10688   default: llvm_unreachable("Invalid value type!");
   10689   case MVT::i8:  Reg = X86::AL;  size = 1; break;
   10690   case MVT::i16: Reg = X86::AX;  size = 2; break;
   10691   case MVT::i32: Reg = X86::EAX; size = 4; break;
   10692   case MVT::i64:
   10693     assert(Subtarget->is64Bit() && "Node not type legal!");
   10694     Reg = X86::RAX; size = 8;
   10695     break;
   10696   }
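            // cmpxchg implicitly uses the accumulator (AL/AX/EAX/RAX) for the expected
            // value and leaves the previous memory contents there, hence the copy of
            // operand 2 into Reg here and the copy back out of Reg below.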
   10697   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
   10698                                     Op.getOperand(2), SDValue());
   10699   SDValue Ops[] = { cpIn.getValue(0),
   10700                     Op.getOperand(1),
   10701                     Op.getOperand(3),
   10702                     DAG.getTargetConstant(size, MVT::i8),
   10703                     cpIn.getValue(1) };
   10704   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   10705   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   10706   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
   10707                                            Ops, 5, T, MMO);
   10708   SDValue cpOut =
   10709     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   10710   return cpOut;
   10711 }
   10712 
   10713 SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
   10714                                                  SelectionDAG &DAG) const {
   10715   assert(Subtarget->is64Bit() && "Result not type legalized?");
   10716   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   10717   SDValue TheChain = Op.getOperand(0);
   10718   DebugLoc dl = Op.getDebugLoc();
   10719   SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
   10720   SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
   10721   SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
   10722                                    rax.getValue(2));
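            // RDTSC leaves the counter in EDX:EAX (zero-extended into RDX:RAX here);
            // combine the two halves into a single i64 as (rdx << 32) | rax.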
   10723   SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
   10724                             DAG.getConstant(32, MVT::i8));
   10725   SDValue Ops[] = {
   10726     DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
   10727     rdx.getValue(1)
   10728   };
   10729   return DAG.getMergeValues(Ops, 2, dl);
   10730 }
   10731 
   10732 SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
   10733                                             SelectionDAG &DAG) const {
   10734   EVT SrcVT = Op.getOperand(0).getValueType();
   10735   EVT DstVT = Op.getValueType();
   10736   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
   10737          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   10738   assert((DstVT == MVT::i64 ||
   10739           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
   10740          "Unexpected custom BITCAST");
   10741   // i64 <=> MMX conversions are Legal.
   10742   if (SrcVT==MVT::i64 && DstVT.isVector())
   10743     return Op;
   10744   if (DstVT==MVT::i64 && SrcVT.isVector())
   10745     return Op;
   10746   // MMX <=> MMX conversions are Legal.
   10747   if (SrcVT.isVector() && DstVT.isVector())
   10748     return Op;
   10749   // All other conversions need to be expanded.
   10750   return SDValue();
   10751 }
   10752 
   10753 SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
   10754   SDNode *Node = Op.getNode();
   10755   DebugLoc dl = Node->getDebugLoc();
   10756   EVT T = Node->getValueType(0);
   10757   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
   10758                               DAG.getConstant(0, T), Node->getOperand(2));
   10759   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
   10760                        cast<AtomicSDNode>(Node)->getMemoryVT(),
   10761                        Node->getOperand(0),
   10762                        Node->getOperand(1), negOp,
   10763                        cast<AtomicSDNode>(Node)->getSrcValue(),
   10764                        cast<AtomicSDNode>(Node)->getAlignment(),
   10765                        cast<AtomicSDNode>(Node)->getOrdering(),
   10766                        cast<AtomicSDNode>(Node)->getSynchScope());
   10767 }
   10768 
   10769 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
   10770   SDNode *Node = Op.getNode();
   10771   DebugLoc dl = Node->getDebugLoc();
   10772   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
   10773 
   10774   // Convert seq_cst store -> xchg
   10775   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
   10776   // FIXME: On 32-bit, store -> fist or movq would be more efficient
   10777   //        (The only way to get a 16-byte store is cmpxchg16b)
   10778   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
   10779   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
   10780       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
   10781     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
   10782                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
   10783                                  Node->getOperand(0),
   10784                                  Node->getOperand(1), Node->getOperand(2),
   10785                                  cast<AtomicSDNode>(Node)->getMemOperand(),
   10786                                  cast<AtomicSDNode>(Node)->getOrdering(),
   10787                                  cast<AtomicSDNode>(Node)->getSynchScope());
   10788     return Swap.getValue(1);
   10789   }
   10790   // Other atomic stores have a simple pattern.
   10791   return Op;
   10792 }
   10793 
   10794 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   10795   EVT VT = Op.getNode()->getValueType(0);
   10796 
   10797   // Let legalize expand this if it isn't a legal type yet.
   10798   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   10799     return SDValue();
   10800 
   10801   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   10802 
   10803   unsigned Opc;
   10804   bool ExtraOp = false;
   10805   switch (Op.getOpcode()) {
   10806   default: llvm_unreachable("Invalid code");
   10807   case ISD::ADDC: Opc = X86ISD::ADD; break;
   10808   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
   10809   case ISD::SUBC: Opc = X86ISD::SUB; break;
   10810   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
   10811   }
   10812 
   10813   if (!ExtraOp)
   10814     return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
   10815                        Op.getOperand(1));
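            // Presumably the two slots between the frame pointer and the incoming
            // arguments: the saved frame pointer and the return address.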
   10816   return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
   10817                      Op.getOperand(1), Op.getOperand(2));
   10818 }
   10819 
   10820 /// LowerOperation - Provide custom lowering hooks for some operations.
   10821 ///
   10822 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   10823   switch (Op.getOpcode()) {
   10824   default: llvm_unreachable("Should not custom lower this!");
   10825   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
   10826   case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
   10827   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op,DAG);
   10828   case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
   10829   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   10830   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   10831   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   10832   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   10833   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
   10834   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   10835   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   10836   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);
   10837   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, DAG);
   10838   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
   10839   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
   10840   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   10841   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   10842   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
   10843   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   10844   case ISD::SHL_PARTS:
   10845   case ISD::SRA_PARTS:
   10846   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   10847   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   10848   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   10849   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   10850   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   10851   case ISD::FABS:               return LowerFABS(Op, DAG);
   10852   case ISD::FNEG:               return LowerFNEG(Op, DAG);
   10853   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   10854   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   10855   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   10856   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   10857   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   10858   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   10859   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   10860   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   10861   case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
   10862   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   10863   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   10864   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   10865   case ISD::FRAME_TO_ARGS_OFFSET:
   10866                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
   10867   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   10868   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
   10869   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   10870   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   10871   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   10872   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   10873   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
   10874   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   10875   case ISD::MUL:                return LowerMUL(Op, DAG);
   10876   case ISD::SRA:
   10877   case ISD::SRL:
   10878   case ISD::SHL:                return LowerShift(Op, DAG);
   10879   case ISD::SADDO:
   10880   case ISD::UADDO:
   10881   case ISD::SSUBO:
   10882   case ISD::USUBO:
   10883   case ISD::SMULO:
   10884   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   10885   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
   10886   case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
   10887   case ISD::ADDC:
   10888   case ISD::ADDE:
   10889   case ISD::SUBC:
   10890   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   10891   case ISD::ADD:                return LowerADD(Op, DAG);
   10892   case ISD::SUB:                return LowerSUB(Op, DAG);
   10893   }
   10894 }
   10895 
   10896 static void ReplaceATOMIC_LOAD(SDNode *Node,
   10897                                   SmallVectorImpl<SDValue> &Results,
   10898                                   SelectionDAG &DAG) {
   10899   DebugLoc dl = Node->getDebugLoc();
   10900   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
   10901 
   10902   // Convert wide load -> cmpxchg8b/cmpxchg16b
   10903   // FIXME: On 32-bit, load -> fild or movq would be more efficient
   10904   //        (The only way to get a 16-byte load is cmpxchg16b)
   10905   // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
   10906   SDValue Zero = DAG.getConstant(0, VT);
   10907   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
   10908                                Node->getOperand(0),
   10909                                Node->getOperand(1), Zero, Zero,
   10910                                cast<AtomicSDNode>(Node)->getMemOperand(),
   10911                                cast<AtomicSDNode>(Node)->getOrdering(),
   10912                                cast<AtomicSDNode>(Node)->getSynchScope());
   10913   Results.push_back(Swap.getValue(0));
   10914   Results.push_back(Swap.getValue(1));
   10915 }
   10916 
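         /// ReplaceATOMIC_BINARY_64 - Expand a 64-bit atomic read-modify-write on a
         /// 32-bit target: split the value operand into 32-bit halves and emit the
         /// corresponding X86ISD::ATOM*64_DAG node, which is later expanded into a
         /// cmpxchg8b loop by the custom inserter.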
   10917 void X86TargetLowering::
   10918 ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
   10919                         SelectionDAG &DAG, unsigned NewOp) const {
   10920   DebugLoc dl = Node->getDebugLoc();
   10921   assert(Node->getValueType(0) == MVT::i64 &&
   10922          "Only know how to expand i64 atomics");
   10923 
   10924   SDValue Chain = Node->getOperand(0);
   10925   SDValue In1 = Node->getOperand(1);
   10926   SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
   10927                              Node->getOperand(2), DAG.getIntPtrConstant(0));
   10928   SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
   10929                              Node->getOperand(2), DAG.getIntPtrConstant(1));
   10930   SDValue Ops[] = { Chain, In1, In2L, In2H };
   10931   SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
   10932   SDValue Result =
   10933     DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
   10934                             cast<MemSDNode>(Node)->getMemOperand());
   10935   SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
   10936   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
   10937   Results.push_back(Result.getValue(2));
   10938 }
   10939 
   10940 /// ReplaceNodeResults - Replace a node with an illegal result type
   10941 /// with a new node built out of custom code.
   10942 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   10943                                            SmallVectorImpl<SDValue>&Results,
   10944                                            SelectionDAG &DAG) const {
   10945   DebugLoc dl = N->getDebugLoc();
   10946   switch (N->getOpcode()) {
   10947   default:
   10948     llvm_unreachable("Do not know how to custom type legalize this operation!");
   10949   case ISD::SIGN_EXTEND_INREG:
   10950   case ISD::ADDC:
   10951   case ISD::ADDE:
   10952   case ISD::SUBC:
   10953   case ISD::SUBE:
   10954     // We don't want to expand or promote these.
   10955     return;
   10956   case ISD::FP_TO_SINT:
   10957   case ISD::FP_TO_UINT: {
   10958     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
   10959 
   10960     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
   10961       return;
   10962 
   10963     std::pair<SDValue,SDValue> Vals =
   10964         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
   10965     SDValue FIST = Vals.first, StackSlot = Vals.second;
   10966     if (FIST.getNode() != 0) {
   10967       EVT VT = N->getValueType(0);
   10968       // Return a load from the stack slot.
   10969       if (StackSlot.getNode() != 0)
   10970         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
   10971                                       MachinePointerInfo(),
   10972                                       false, false, false, 0));
   10973       else
   10974         Results.push_back(FIST);
   10975     }
   10976     return;
   10977   }
   10978   case ISD::READCYCLECOUNTER: {
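             // RDTSC returns the 64-bit time-stamp counter in EDX:EAX; read both
             // halves through glued CopyFromReg nodes and pair them into an i64 below.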
   10979     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   10980     SDValue TheChain = N->getOperand(0);
   10981     SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
   10982     SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
   10983                                      rd.getValue(1));
   10984     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
   10985                                      eax.getValue(2));
   10986     // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   10987     SDValue Ops[] = { eax, edx };
   10988     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
   10989     Results.push_back(edx.getValue(1));
   10990     return;
   10991   }
   10992   case ISD::ATOMIC_CMP_SWAP: {
   10993     EVT T = N->getValueType(0);
   10994     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
   10995     bool Regs64bit = T == MVT::i128;
   10996     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
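             // CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and
             // the replacement value in ECX:EBX (RCX:RBX); the old memory value comes
             // back in EDX:EAX (RDX:RAX).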
   10997     SDValue cpInL, cpInH;
   10998     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   10999                         DAG.getConstant(0, HalfT));
   11000     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   11001                         DAG.getConstant(1, HalfT));
   11002     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
   11003                              Regs64bit ? X86::RAX : X86::EAX,
   11004                              cpInL, SDValue());
   11005     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
   11006                              Regs64bit ? X86::RDX : X86::EDX,
   11007                              cpInH, cpInL.getValue(1));
   11008     SDValue swapInL, swapInH;
   11009     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   11010                           DAG.getConstant(0, HalfT));
   11011     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   11012                           DAG.getConstant(1, HalfT));
   11013     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
   11014                                Regs64bit ? X86::RBX : X86::EBX,
   11015                                swapInL, cpInH.getValue(1));
   11016     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
   11017                                Regs64bit ? X86::RCX : X86::ECX,
   11018                                swapInH, swapInL.getValue(1));
   11019     SDValue Ops[] = { swapInH.getValue(0),
   11020                       N->getOperand(1),
   11021                       swapInH.getValue(1) };
   11022     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   11023     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
   11024     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
   11025                                   X86ISD::LCMPXCHG8_DAG;
   11026     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
   11027                                              Ops, 3, T, MMO);
   11028     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
   11029                                         Regs64bit ? X86::RAX : X86::EAX,
   11030                                         HalfT, Result.getValue(1));
   11031     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
   11032                                         Regs64bit ? X86::RDX : X86::EDX,
   11033                                         HalfT, cpOutL.getValue(2));
   11034     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
   11035     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
   11036     Results.push_back(cpOutH.getValue(1));
   11037     return;
   11038   }
   11039   case ISD::ATOMIC_LOAD_ADD:
   11040     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
   11041     return;
   11042   case ISD::ATOMIC_LOAD_AND:
   11043     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
   11044     return;
   11045   case ISD::ATOMIC_LOAD_NAND:
   11046     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
   11047     return;
   11048   case ISD::ATOMIC_LOAD_OR:
   11049     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
   11050     return;
   11051   case ISD::ATOMIC_LOAD_SUB:
   11052     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
   11053     return;
   11054   case ISD::ATOMIC_LOAD_XOR:
   11055     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
   11056     return;
   11057   case ISD::ATOMIC_SWAP:
   11058     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
   11059     return;
   11060   case ISD::ATOMIC_LOAD:
   11061     ReplaceATOMIC_LOAD(N, Results, DAG);
   11062   }
   11063 }
   11064 
   11065 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   11066   switch (Opcode) {
   11067   default: return NULL;
   11068   case X86ISD::BSF:                return "X86ISD::BSF";
   11069   case X86ISD::BSR:                return "X86ISD::BSR";
   11070   case X86ISD::SHLD:               return "X86ISD::SHLD";
   11071   case X86ISD::SHRD:               return "X86ISD::SHRD";
   11072   case X86ISD::FAND:               return "X86ISD::FAND";
   11073   case X86ISD::FOR:                return "X86ISD::FOR";
   11074   case X86ISD::FXOR:               return "X86ISD::FXOR";
   11075   case X86ISD::FSRL:               return "X86ISD::FSRL";
   11076   case X86ISD::FILD:               return "X86ISD::FILD";
   11077   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
   11078   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
   11079   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
   11080   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
   11081   case X86ISD::FLD:                return "X86ISD::FLD";
   11082   case X86ISD::FST:                return "X86ISD::FST";
   11083   case X86ISD::CALL:               return "X86ISD::CALL";
   11084   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
   11085   case X86ISD::BT:                 return "X86ISD::BT";
   11086   case X86ISD::CMP:                return "X86ISD::CMP";
   11087   case X86ISD::COMI:               return "X86ISD::COMI";
   11088   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   11089   case X86ISD::SETCC:              return "X86ISD::SETCC";
   11090   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   11091   case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
   11092   case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
   11093   case X86ISD::CMOV:               return "X86ISD::CMOV";
   11094   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   11095   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
   11096   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
   11097   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
   11098   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   11099   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
   11100   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
   11101   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
   11102   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
   11103   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   11104   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   11105   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
   11106   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   11107   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   11108   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
   11109   case X86ISD::BLENDV:             return "X86ISD::BLENDV";
   11110   case X86ISD::BLENDPW:            return "X86ISD::BLENDPW";
   11111   case X86ISD::BLENDPS:            return "X86ISD::BLENDPS";
   11112   case X86ISD::BLENDPD:            return "X86ISD::BLENDPD";
   11113   case X86ISD::HADD:               return "X86ISD::HADD";
   11114   case X86ISD::HSUB:               return "X86ISD::HSUB";
   11115   case X86ISD::FHADD:              return "X86ISD::FHADD";
   11116   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
   11117   case X86ISD::FMAX:               return "X86ISD::FMAX";
   11118   case X86ISD::FMIN:               return "X86ISD::FMIN";
   11119   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
   11120   case X86ISD::FRCP:               return "X86ISD::FRCP";
   11121   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   11122   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
   11123   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   11124   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   11125   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
   11126   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
   11127   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
   11128   case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
   11129   case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
   11130   case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
   11131   case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
   11132   case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
   11133   case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
   11134   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   11135   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   11136   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   11137   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   11138   case X86ISD::VSHL:               return "X86ISD::VSHL";
   11139   case X86ISD::VSRL:               return "X86ISD::VSRL";
   11140   case X86ISD::VSRA:               return "X86ISD::VSRA";
   11141   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
   11142   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
   11143   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
   11144   case X86ISD::CMPP:               return "X86ISD::CMPP";
   11145   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   11146   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
   11147   case X86ISD::ADD:                return "X86ISD::ADD";
   11148   case X86ISD::SUB:                return "X86ISD::SUB";
   11149   case X86ISD::ADC:                return "X86ISD::ADC";
   11150   case X86ISD::SBB:                return "X86ISD::SBB";
   11151   case X86ISD::SMUL:               return "X86ISD::SMUL";
   11152   case X86ISD::UMUL:               return "X86ISD::UMUL";
   11153   case X86ISD::INC:                return "X86ISD::INC";
   11154   case X86ISD::DEC:                return "X86ISD::DEC";
   11155   case X86ISD::OR:                 return "X86ISD::OR";
   11156   case X86ISD::XOR:                return "X86ISD::XOR";
   11157   case X86ISD::AND:                return "X86ISD::AND";
   11158   case X86ISD::ANDN:               return "X86ISD::ANDN";
   11159   case X86ISD::BLSI:               return "X86ISD::BLSI";
   11160   case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
   11161   case X86ISD::BLSR:               return "X86ISD::BLSR";
   11162   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   11163   case X86ISD::PTEST:              return "X86ISD::PTEST";
   11164   case X86ISD::TESTP:              return "X86ISD::TESTP";
   11165   case X86ISD::PALIGN:             return "X86ISD::PALIGN";
   11166   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   11167   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
   11168   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
   11169   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
   11170   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
   11171   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
   11172   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
   11173   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
   11174   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
   11175   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
   11176   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
   11177   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
   11178   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
   11179   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
   11180   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   11181   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   11182   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   11183   case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
   11184   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   11185   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
   11186   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   11187   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   11188   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   11189   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   11190   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
   11191   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
   11192   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
   11193   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
   11194   }
   11195 }
   11196 
   11197 // isLegalAddressingMode - Return true if the addressing mode represented
   11198 // by AM is legal for this target, for a load/store of the specified type.
   11199 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
   11200                                               Type *Ty) const {
   11201   // X86 supports extremely general addressing modes.
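           // The general form is Base + Scale*Index + Disp32, optionally relative to a
           // global symbol.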
   11202   CodeModel::Model M = getTargetMachine().getCodeModel();
   11203   Reloc::Model R = getTargetMachine().getRelocationModel();
   11204 
   11205   // X86 allows a sign-extended 32-bit immediate field as a displacement.
   11206   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
   11207     return false;
   11208 
   11209   if (AM.BaseGV) {
   11210     unsigned GVFlags =
   11211       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
   11212 
   11213     // If a reference to this global requires an extra load, we can't fold it.
   11214     if (isGlobalStubReference(GVFlags))
   11215       return false;
   11216 
   11217     // If BaseGV requires a register for the PIC base, we cannot also have a
   11218     // BaseReg specified.
   11219     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
   11220       return false;
   11221 
   11222     // If lower 4G is not available, then we must use rip-relative addressing.
   11223     if ((M != CodeModel::Small || R != Reloc::Static) &&
   11224         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
   11225       return false;
   11226   }
   11227 
   11228   switch (AM.Scale) {
   11229   case 0:
   11230   case 1:
   11231   case 2:
   11232   case 4:
   11233   case 8:
   11234     // These scales always work.
   11235     break;
   11236   case 3:
   11237   case 5:
   11238   case 9:
   11239     // These scales are formed with basereg+scalereg.  Only accept if there is
   11240     // no basereg yet.
   11241     if (AM.HasBaseReg)
   11242       return false;
   11243     break;
   11244   default:  // Other stuff never works.
   11245     return false;
   11246   }
   11247 
   11248   return true;
   11249 }
   11250 
   11251 
   11252 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   11253   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   11254     return false;
   11255   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
   11256   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
   11257   if (NumBits1 <= NumBits2)
   11258     return false;
   11259   return true;
   11260 }
   11261 
   11262 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   11263   if (!VT1.isInteger() || !VT2.isInteger())
   11264     return false;
   11265   unsigned NumBits1 = VT1.getSizeInBits();
   11266   unsigned NumBits2 = VT2.getSizeInBits();
   11267   if (NumBits1 <= NumBits2)
   11268     return false;
   11269   return true;
   11270 }
   11271 
   11272 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   11273   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   11274   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
   11275 }
   11276 
   11277 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
   11278   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   11279   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
   11280 }
   11281 
   11282 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
   11283   // i16 instructions are longer (0x66 prefix) and potentially slower.
   11284   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
   11285 }
   11286 
   11287 /// isShuffleMaskLegal - Targets can use this to indicate that they only
   11288 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
   11289 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
   11290 /// are assumed to be legal.
   11291 bool
   11292 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   11293                                       EVT VT) const {
   11294   // Very little shuffling can be done for 64-bit vectors right now.
   11295   if (VT.getSizeInBits() == 64)
   11296     return false;
   11297 
   11298   // FIXME: pshufb, blends, shifts.
   11299   return (VT.getVectorNumElements() == 2 ||
   11300           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
   11301           isMOVLMask(M, VT) ||
   11302           isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
   11303           isPSHUFDMask(M, VT) ||
   11304           isPSHUFHWMask(M, VT) ||
   11305           isPSHUFLWMask(M, VT) ||
   11306           isPALIGNRMask(M, VT, Subtarget) ||
   11307           isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
   11308           isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
   11309           isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
   11310           isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2()));
   11311 }
   11312 
   11313 bool
   11314 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
   11315                                           EVT VT) const {
   11316   unsigned NumElts = VT.getVectorNumElements();
   11317   // FIXME: This collection of masks seems suspect.
   11318   if (NumElts == 2)
   11319     return true;
   11320   if (NumElts == 4 && VT.getSizeInBits() == 128) {
   11321     return (isMOVLMask(Mask, VT)  ||
   11322             isCommutedMOVLMask(Mask, VT, true) ||
   11323             isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
   11324             isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true));
   11325   }
   11326   return false;
   11327 }
   11328 
   11329 //===----------------------------------------------------------------------===//
   11330 //                           X86 Scheduler Hooks
   11331 //===----------------------------------------------------------------------===//
   11332 
   11333 // private utility function
   11334 MachineBasicBlock *
   11335 X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
   11336                                                        MachineBasicBlock *MBB,
   11337                                                        unsigned regOpc,
   11338                                                        unsigned immOpc,
   11339                                                        unsigned LoadOpc,
   11340                                                        unsigned CXchgOpc,
   11341                                                        unsigned notOpc,
   11342                                                        unsigned EAXreg,
   11343                                                  const TargetRegisterClass *RC,
   11344                                                        bool Invert) const {
   11345   // For the atomic bitwise operator, we generate
   11346   //   thisMBB:
   11347   //   newMBB:
   11348   //     ld  t1 = [bitinstr.addr]
   11349   //     op  t2 = t1, [bitinstr.val]
   11350   //     not t3 = t2  (if Invert)
   11351   //     mov EAX = t1
   11352   //     lcs dest = [bitinstr.addr], t3  [EAX is implicit]
   11353   //     bz  newMBB
   11354   //     fallthrough -->nextMBB
   11355   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   11356   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   11357   MachineFunction::iterator MBBIter = MBB;
   11358   ++MBBIter;
   11359 
   11360   // First build the CFG.
   11361   MachineFunction *F = MBB->getParent();
   11362   MachineBasicBlock *thisMBB = MBB;
   11363   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
   11364   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
   11365   F->insert(MBBIter, newMBB);
   11366   F->insert(MBBIter, nextMBB);
   11367 
   11368   // Transfer the remainder of thisMBB and its successor edges to nextMBB.
   11369   nextMBB->splice(nextMBB->begin(), thisMBB,
   11370                   llvm::next(MachineBasicBlock::iterator(bInstr)),
   11371                   thisMBB->end());
   11372   nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   11373 
   11374   // Update thisMBB to fall through to newMBB
   11375   thisMBB->addSuccessor(newMBB);
   11376 
   11377   // newMBB jumps to itself and falls through to nextMBB
   11378   newMBB->addSuccessor(nextMBB);
   11379   newMBB->addSuccessor(newMBB);
   11380 
   11381   // Insert instructions into newMBB based on incoming instruction
   11382   assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
   11383          "unexpected number of operands");
   11384   DebugLoc dl = bInstr->getDebugLoc();
   11385   MachineOperand& destOper = bInstr->getOperand(0);
   11386   MachineOperand* argOpers[2 + X86::AddrNumOperands];
   11387   int numArgs = bInstr->getNumOperands() - 1;
   11388   for (int i=0; i < numArgs; ++i)
   11389     argOpers[i] = &bInstr->getOperand(i+1);
   11390 
   11391   // An x86 address has 5 operands: base, scale, index, displacement, and segment.
   11392   int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
   11393   int valArgIndx = lastAddrIndx + 1;
   11394 
   11395   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
   11396   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
   11397   for (int i=0; i <= lastAddrIndx; ++i)
   11398     (*MIB).addOperand(*argOpers[i]);
   11399 
   11400   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
   11401   assert((argOpers[valArgIndx]->isReg() ||
   11402           argOpers[valArgIndx]->isImm()) &&
   11403          "invalid operand");
   11404   if (argOpers[valArgIndx]->isReg())
   11405     MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
   11406   else
   11407     MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
   11408   MIB.addReg(t1);
   11409   (*MIB).addOperand(*argOpers[valArgIndx]);
   11410 
   11411   unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
   11412   if (Invert) {
   11413     MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
   11414   } else {
   11415     t3 = t2;
   11416   }
   11417 
   11418   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
   11419   MIB.addReg(t1);
   11420 
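           // The locked cmpxchg compares the accumulator (EAXreg) with [addr]; if they
           // are equal it stores t3, otherwise it reloads the current value into the
           // accumulator. ZF is set on success, so the JNE below retries the loop.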
   11421   MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
   11422   for (int i=0; i <= lastAddrIndx; ++i)
   11423     (*MIB).addOperand(*argOpers[i]);
   11424   MIB.addReg(t3);
   11425   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
   11426   (*MIB).setMemRefs(bInstr->memoperands_begin(),
   11427                     bInstr->memoperands_end());
   11428 
   11429   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
   11430   MIB.addReg(EAXreg);
   11431 
   11432   // insert branch
   11433   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
   11434 
   11435   bInstr->eraseFromParent();   // The pseudo instruction is gone now.
   11436   return nextMBB;
   11437 }
   11438 
   11439 // Private utility function: 64-bit atomics on a 32-bit host.
   11440 MachineBasicBlock *
   11441 X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
   11442                                                        MachineBasicBlock *MBB,
   11443                                                        unsigned regOpcL,
   11444                                                        unsigned regOpcH,
   11445                                                        unsigned immOpcL,
   11446                                                        unsigned immOpcH,
   11447                                                        bool Invert) const {
   11448   // For the atomic bitwise operator, we generate
   11449   //   thisMBB (instructions are in pairs, except cmpxchg8b)
   11450   //     ld t1,t2 = [bitinstr.addr]
   11451   //   newMBB:
   11452   //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
   11453   //     op  t5, t6 <- out1, out2, [bitinstr.val]
   11454   //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
   11455   //     not t7, t8 <- t5, t6  (if Invert)
   11456   //     mov ECX, EBX <- t5, t6
   11457   //     mov EAX, EDX <- t1, t2
   11458   //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
   11459   //     mov t3, t4 <- EAX, EDX
   11460   //     bz  newMBB
   11461   //     result in out1, out2
   11462   //     fallthrough -->nextMBB
   11463 
   11464   const TargetRegisterClass *RC = X86::GR32RegisterClass;
   11465   const unsigned LoadOpc = X86::MOV32rm;
   11466   const unsigned NotOpc = X86::NOT32r;
   11467   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   11468   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   11469   MachineFunction::iterator MBBIter = MBB;
   11470   ++MBBIter;
   11471 
   11472   // First build the CFG.
   11473   MachineFunction *F = MBB->getParent();
   11474   MachineBasicBlock *thisMBB = MBB;
   11475   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
   11476   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
   11477   F->insert(MBBIter, newMBB);
   11478   F->insert(MBBIter, nextMBB);
   11479 
   11480   // Transfer the remainder of thisMBB and its successor edges to nextMBB.
   11481   nextMBB->splice(nextMBB->begin(), thisMBB,
   11482                   llvm::next(MachineBasicBlock::iterator(bInstr)),
   11483                   thisMBB->end());
   11484   nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   11485 
   11486   // Update thisMBB to fall through to newMBB
   11487   thisMBB->addSuccessor(newMBB);
   11488 
   11489   // newMBB jumps to itself and falls through to nextMBB
   11490   newMBB->addSuccessor(nextMBB);
   11491   newMBB->addSuccessor(newMBB);
   11492 
   11493   DebugLoc dl = bInstr->getDebugLoc();
   11494   // Insert instructions into newMBB based on incoming instruction
   11495   // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
   11496   assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
   11497          "unexpected number of operands");
   11498   MachineOperand& dest1Oper = bInstr->getOperand(0);
   11499   MachineOperand& dest2Oper = bInstr->getOperand(1);
   11500   MachineOperand* argOpers[2 + X86::AddrNumOperands];
   11501   for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
   11502     argOpers[i] = &bInstr->getOperand(i+2);
   11503 
   11504     // We use some of the operands multiple times, so conservatively just
   11505     // clear any kill flags that might be present.
   11506     if (argOpers[i]->isReg() && argOpers[i]->isUse())
   11507       argOpers[i]->setIsKill(false);
   11508   }
   11509 
   11510   // An x86 address has 5 operands: base, scale, index, displacement, and segment.
   11511   int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
   11512 
   11513   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
   11514   MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
   11515   for (int i=0; i <= lastAddrIndx; ++i)
   11516     (*MIB).addOperand(*argOpers[i]);
   11517   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
   11518   MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
   11519   // The high half lives at [addr+4]: copy the address operands, adding 4 to the displacement.
   11520   for (int i=0; i <= lastAddrIndx-2; ++i)
   11521     (*MIB).addOperand(*argOpers[i]);
   11522   MachineOperand newOp3 = *(argOpers[3]);
   11523   if (newOp3.isImm())
   11524     newOp3.setImm(newOp3.getImm()+4);
   11525   else
   11526     newOp3.setOffset(newOp3.getOffset()+4);
   11527   (*MIB).addOperand(newOp3);
   11528   (*MIB).addOperand(*argOpers[lastAddrIndx]);
   11529 
   11530   // t3/4 are defined later, at the bottom of the loop
   11531   unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
   11532   unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
   11533   BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
   11534     .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
   11535   BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
   11536     .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
   11537 
   11538   // The subsequent operations should be using the destination registers of
   11539   // the PHI instructions.
   11540   t1 = dest1Oper.getReg();
   11541   t2 = dest2Oper.getReg();
   11542 
   11543   int valArgIndx = lastAddrIndx + 1;
   11544   assert((argOpers[valArgIndx]->isReg() ||
   11545           argOpers[valArgIndx]->isImm()) &&
   11546          "invalid operand");
   11547   unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
   11548   unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
   11549   if (argOpers[valArgIndx]->isReg())
   11550     MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
   11551   else
   11552     MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
   11553   if (regOpcL != X86::MOV32rr)
   11554     MIB.addReg(t1);
   11555   (*MIB).addOperand(*argOpers[valArgIndx]);
   11556   assert(argOpers[valArgIndx + 1]->isReg() ==
   11557          argOpers[valArgIndx]->isReg());
   11558   assert(argOpers[valArgIndx + 1]->isImm() ==
   11559          argOpers[valArgIndx]->isImm());
   11560   if (argOpers[valArgIndx + 1]->isReg())
   11561     MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
   11562   else
   11563     MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
   11564   if (regOpcH != X86::MOV32rr)
   11565     MIB.addReg(t2);
   11566   (*MIB).addOperand(*argOpers[valArgIndx + 1]);
   11567 
   11568   unsigned t7, t8;
   11569   if (Invert) {
   11570     t7 = F->getRegInfo().createVirtualRegister(RC);
   11571     t8 = F->getRegInfo().createVirtualRegister(RC);
   11572     MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
   11573     MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
   11574   } else {
   11575     t7 = t5;
   11576     t8 = t6;
   11577   }
   11578 
   11579   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
   11580   MIB.addReg(t1);
   11581   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
   11582   MIB.addReg(t2);
   11583 
   11584   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
   11585   MIB.addReg(t7);
   11586   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
   11587   MIB.addReg(t8);
   11588 
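           // lock cmpxchg8b compares EDX:EAX with the 8-byte memory operand; if they are
           // equal it stores ECX:EBX, otherwise it loads the current value into EDX:EAX.
           // ZF reports whether the exchange happened.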
   11589   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
   11590   for (int i=0; i <= lastAddrIndx; ++i)
   11591     (*MIB).addOperand(*argOpers[i]);
   11592 
   11593   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
   11594   (*MIB).setMemRefs(bInstr->memoperands_begin(),
   11595                     bInstr->memoperands_end());
   11596 
   11597   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
   11598   MIB.addReg(X86::EAX);
   11599   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
   11600   MIB.addReg(X86::EDX);
   11601 
   11602   // insert branch
   11603   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
   11604 
   11605   bInstr->eraseFromParent();   // The pseudo instruction is gone now.
   11606   return nextMBB;
   11607 }
   11608 
   11609 // private utility function
   11610 MachineBasicBlock *
   11611 X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
   11612                                                       MachineBasicBlock *MBB,
   11613                                                       unsigned cmovOpc) const {
   11614   // For the atomic min/max operator, we generate
   11615   //   thisMBB:
   11616   //   newMBB:
   11617   //     ld t1 = [min/max.addr]
   11618   //     mov t2 = [min/max.val]
   11619   //     cmp  t1, t2
   11620   //     cmov[cond] t2 = t1
   11621   //     mov EAX = t1
   11622   //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
   11623   //     bz   newMBB
   11624   //     fallthrough -->nextMBB
   11625   //
   11626   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   11627   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   11628   MachineFunction::iterator MBBIter = MBB;
   11629   ++MBBIter;
   11630 
   11631   // First build the CFG.
   11632   MachineFunction *F = MBB->getParent();
   11633   MachineBasicBlock *thisMBB = MBB;
   11634   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
   11635   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
   11636   F->insert(MBBIter, newMBB);
   11637   F->insert(MBBIter, nextMBB);
   11638 
   11639   // Transfer the remainder of thisMBB and its successor edges to nextMBB.
   11640   nextMBB->splice(nextMBB->begin(), thisMBB,
   11641                   llvm::next(MachineBasicBlock::iterator(mInstr)),
   11642                   thisMBB->end());
   11643   nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   11644 
   11645   // Update thisMBB to fall through to newMBB
   11646   thisMBB->addSuccessor(newMBB);
   11647 
   11648   // newMBB jumps to itself and falls through to nextMBB
   11649   newMBB->addSuccessor(nextMBB);
   11650   newMBB->addSuccessor(newMBB);
   11651 
   11652   DebugLoc dl = mInstr->getDebugLoc();
   11653   // Insert instructions into newMBB based on incoming instruction
   11654   assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
   11655          "unexpected number of operands");
   11656   MachineOperand& destOper = mInstr->getOperand(0);
   11657   MachineOperand* argOpers[2 + X86::AddrNumOperands];
   11658   int numArgs = mInstr->getNumOperands() - 1;
   11659   for (int i=0; i < numArgs; ++i)
   11660     argOpers[i] = &mInstr->getOperand(i+1);
   11661 
   11662   // An x86 address has 5 operands: base, scale, index, displacement, and segment.
   11663   int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
   11664   int valArgIndx = lastAddrIndx + 1;
   11665 
   11666   unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
   11667   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
   11668   for (int i=0; i <= lastAddrIndx; ++i)
   11669     (*MIB).addOperand(*argOpers[i]);
   11670 
   11671   // We only support register and immediate values
   11672   assert((argOpers[valArgIndx]->isReg() ||
   11673           argOpers[valArgIndx]->isImm()) &&
   11674          "invalid operand");
   11675 
   11676   unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
   11677   if (argOpers[valArgIndx]->isReg())
   11678     MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
   11679   else
   11680     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
   11681   (*MIB).addOperand(*argOpers[valArgIndx]);
   11682 
   11683   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
   11684   MIB.addReg(t1);
   11685 
   11686   MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
   11687   MIB.addReg(t1);
   11688   MIB.addReg(t2);
   11689 
   11690   // Generate the conditional move (cmov).
   11691   unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
   11692   MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
   11693   MIB.addReg(t2);
   11694   MIB.addReg(t1);
   11695 
   11696   // Compare and exchange if nothing has modified the memory location in the meantime.
   11697   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
   11698   for (int i=0; i <= lastAddrIndx; ++i)
   11699     (*MIB).addOperand(*argOpers[i]);
   11700   MIB.addReg(t3);
   11701   assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
   11702   (*MIB).setMemRefs(mInstr->memoperands_begin(),
   11703                     mInstr->memoperands_end());
   11704 
   11705   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
   11706   MIB.addReg(X86::EAX);
   11707 
   11708   // insert branch
   11709   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
   11710 
   11711   mInstr->eraseFromParent();   // The pseudo instruction is gone now.
   11712   return nextMBB;
   11713 }
   11714 
   11715 // FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8 or
   11716 // XMM0_V32I8 in AVX, all of this code can be replaced with patterns in the
   11717 // .td file.
   11718 MachineBasicBlock *
   11719 X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
   11720                             unsigned numArgs, bool memArg) const {
   11721   assert(Subtarget->hasSSE42() &&
   11722          "Target must have SSE4.2 or AVX features enabled");
   11723 
   11724   DebugLoc dl = MI->getDebugLoc();
   11725   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   11726   unsigned Opc;
   11727   if (!Subtarget->hasAVX()) {
   11728     if (memArg)
   11729       Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
   11730     else
   11731       Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
   11732   } else {
   11733     if (memArg)
   11734       Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
   11735     else
   11736       Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
   11737   }
   11738 
   11739   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   11740   for (unsigned i = 0; i < numArgs; ++i) {
   11741     MachineOperand &Op = MI->getOperand(i+1);
   11742     if (!(Op.isReg() && Op.isImplicit()))
   11743       MIB.addOperand(Op);
   11744   }
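           // Both the implicit- and explicit-length string compare forms write their
           // mask result to XMM0 implicitly; copy it into the destination register.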
   11745   BuildMI(*BB, MI, dl,
   11746     TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
   11747              MI->getOperand(0).getReg())
   11748     .addReg(X86::XMM0);
   11749 
   11750   MI->eraseFromParent();
   11751   return BB;
   11752 }
   11753 
   11754 MachineBasicBlock *
   11755 X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
   11756   DebugLoc dl = MI->getDebugLoc();
   11757   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   11758 
   11759   // Address into RAX/EAX, other two args into ECX, EDX.
   11760   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
   11761   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   11762   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
   11763   for (int i = 0; i < X86::AddrNumOperands; ++i)
   11764     MIB.addOperand(MI->getOperand(i));
   11765 
   11766   unsigned ValOps = X86::AddrNumOperands;
   11767   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
   11768     .addReg(MI->getOperand(ValOps).getReg());
   11769   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
   11770     .addReg(MI->getOperand(ValOps+1).getReg());
   11771 
   11772   // The MONITOR instruction itself takes no explicit operands; it reads the registers implicitly.
   11773   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
   11774 
   11775   MI->eraseFromParent(); // The pseudo is gone now.
   11776   return BB;
   11777 }
   11778 
   11779 MachineBasicBlock *
   11780 X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
   11781   DebugLoc dl = MI->getDebugLoc();
   11782   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   11783 
   11784   // First arg in ECX, the second in EAX.
   11785   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
   11786     .addReg(MI->getOperand(0).getReg());
   11787   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
   11788     .addReg(MI->getOperand(1).getReg());
   11789 
   11790   // The MWAIT instruction itself takes no explicit operands; it reads ECX and EAX implicitly.
   11791   BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));
   11792 
   11793   MI->eraseFromParent(); // The pseudo is gone now.
   11794   return BB;
   11795 }
   11796 
   11797 MachineBasicBlock *
   11798 X86TargetLowering::EmitVAARG64WithCustomInserter(
   11799                    MachineInstr *MI,
   11800                    MachineBasicBlock *MBB) const {
   11801   // Emit va_arg instruction on X86-64.
   11802 
   11803   // Operands to this pseudo-instruction:
   11804   // 0  ) Output        : destination address (reg)
   11805   // 1-5) Input         : va_list address (addr, i64mem)
   11806   // 6  ) ArgSize       : Size (in bytes) of vararg type
   11807   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
   11808   // 8  ) Align         : Alignment of type
   11809   // 9  ) EFLAGS (implicit-def)
   11810 
   11811   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
   11812   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
   11813 
   11814   unsigned DestReg = MI->getOperand(0).getReg();
   11815   MachineOperand &Base = MI->getOperand(1);
   11816   MachineOperand &Scale = MI->getOperand(2);
   11817   MachineOperand &Index = MI->getOperand(3);
   11818   MachineOperand &Disp = MI->getOperand(4);
   11819   MachineOperand &Segment = MI->getOperand(5);
   11820   unsigned ArgSize = MI->getOperand(6).getImm();
   11821   unsigned ArgMode = MI->getOperand(7).getImm();
   11822   unsigned Align = MI->getOperand(8).getImm();
   11823 
   11824   // Memory Reference
   11825   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
   11826   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   11827   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   11828 
   11829   // Machine Information
   11830   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   11831   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   11832   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
   11833   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
   11834   DebugLoc DL = MI->getDebugLoc();
   11835 
   11836   // struct va_list {
   11837   //   i32   gp_offset
   11838   //   i32   fp_offset
   11839   //   i64   overflow_area (address)
   11840   //   i64   reg_save_area (address)
   11841   // }
   11842   // sizeof(va_list) = 24
   11843   // alignment(va_list) = 8
   11844 
   11845   unsigned TotalNumIntRegs = 6;
   11846   unsigned TotalNumXMMRegs = 8;
   11847   bool UseGPOffset = (ArgMode == 1);
   11848   bool UseFPOffset = (ArgMode == 2);
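           // Per the SysV AMD64 ABI, gp_offset runs through the 6*8 = 48 byte integer
           // register save area; fp_offset starts at 48 and runs through the 8*16 = 128
           // byte XMM save area, so the largest in-range offset is 176.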
   11849   unsigned MaxOffset = TotalNumIntRegs * 8 +
   11850                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
   11851 
   11852   // Align ArgSize to a multiple of 8.
   11853   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
   11854   bool NeedsAlign = (Align > 8);
   11855 
   11856   MachineBasicBlock *thisMBB = MBB;
   11857   MachineBasicBlock *overflowMBB;
   11858   MachineBasicBlock *offsetMBB;
   11859   MachineBasicBlock *endMBB;
   11860 
   11861   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
   11862   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
   11863   unsigned OffsetReg = 0;
   11864 
   11865   if (!UseGPOffset && !UseFPOffset) {
   11866     // If we only pull from the overflow region, we don't need a branch and
   11867     // control flow is unchanged.
   11868     OffsetDestReg = 0; // unused
   11869     OverflowDestReg = DestReg;
   11870 
   11871     offsetMBB = NULL;
   11872     overflowMBB = thisMBB;
   11873     endMBB = thisMBB;
   11874   } else {
   11875     // First emit code to check if gp_offset (or fp_offset) is below the bound.
   11876     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
   11877     // If not, pull from overflow_area. (branch to overflowMBB)
   11878     //
   11879     //       thisMBB
   11880     //         |     .
   11881     //         |        .
   11882     //     offsetMBB   overflowMBB
   11883     //         |        .
   11884     //         |     .
   11885     //        endMBB
   11886 
   11887     // Registers for the PHI in endMBB
   11888     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
   11889     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
   11890 
   11891     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   11892     MachineFunction *MF = MBB->getParent();
   11893     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   11894     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   11895     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   11896 
   11897     MachineFunction::iterator MBBIter = MBB;
   11898     ++MBBIter;
   11899 
   11900     // Insert the new basic blocks
   11901     MF->insert(MBBIter, offsetMBB);
   11902     MF->insert(MBBIter, overflowMBB);
   11903     MF->insert(MBBIter, endMBB);
   11904 
   11905     // Transfer the remainder of MBB and its successor edges to endMBB.
   11906     endMBB->splice(endMBB->begin(), thisMBB,
   11907                     llvm::next(MachineBasicBlock::iterator(MI)),
   11908                     thisMBB->end());
   11909     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   11910 
   11911     // Make offsetMBB and overflowMBB successors of thisMBB
   11912     thisMBB->addSuccessor(offsetMBB);
   11913     thisMBB->addSuccessor(overflowMBB);
   11914 
   11915     // endMBB is a successor of both offsetMBB and overflowMBB
   11916     offsetMBB->addSuccessor(endMBB);
   11917     overflowMBB->addSuccessor(endMBB);
   11918 
   11919     // Load the offset value into a register
   11920     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   11921     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
   11922       .addOperand(Base)
   11923       .addOperand(Scale)
   11924       .addOperand(Index)
   11925       .addDisp(Disp, UseFPOffset ? 4 : 0)
   11926       .addOperand(Segment)
   11927       .setMemRefs(MMOBegin, MMOEnd);
   11928 
   11929     // Check if there is enough room left to pull this argument.
   11930     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
   11931       .addReg(OffsetReg)
   11932       .addImm(MaxOffset + 8 - ArgSizeA8);
   11933 
   11934     // Branch to "overflowMBB" if offset >= max
   11935     // Fall through to "offsetMBB" otherwise
   11936     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
   11937       .addMBB(overflowMBB);
   11938   }
   11939 
   11940   // In offsetMBB, emit code to use the reg_save_area.
   11941   if (offsetMBB) {
   11942     assert(OffsetReg != 0);
   11943 
   11944     // Read the reg_save_area address.
   11945     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
   11946     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
   11947       .addOperand(Base)
   11948       .addOperand(Scale)
   11949       .addOperand(Index)
   11950       .addDisp(Disp, 16)
   11951       .addOperand(Segment)
   11952       .setMemRefs(MMOBegin, MMOEnd);
   11953 
   11954     // Zero-extend the offset
   11955     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    11956     BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
    11957       .addImm(0)
    11958       .addReg(OffsetReg)
    11959       .addImm(X86::sub_32bit);
   11960 
   11961     // Add the offset to the reg_save_area to get the final address.
   11962     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
   11963       .addReg(OffsetReg64)
   11964       .addReg(RegSaveReg);
   11965 
   11966     // Compute the offset for the next argument
   11967     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   11968     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
   11969       .addReg(OffsetReg)
   11970       .addImm(UseFPOffset ? 16 : 8);
   11971 
   11972     // Store it back into the va_list.
   11973     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
   11974       .addOperand(Base)
   11975       .addOperand(Scale)
   11976       .addOperand(Index)
   11977       .addDisp(Disp, UseFPOffset ? 4 : 0)
   11978       .addOperand(Segment)
   11979       .addReg(NextOffsetReg)
   11980       .setMemRefs(MMOBegin, MMOEnd);
   11981 
   11982     // Jump to endMBB
   11983     BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
   11984       .addMBB(endMBB);
   11985   }
   11986 
   11987   //
   11988   // Emit code to use overflow area
   11989   //
   11990 
   11991   // Load the overflow_area address into a register.
   11992   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
   11993   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
   11994     .addOperand(Base)
   11995     .addOperand(Scale)
   11996     .addOperand(Index)
   11997     .addDisp(Disp, 8)
   11998     .addOperand(Segment)
   11999     .setMemRefs(MMOBegin, MMOEnd);
   12000 
   12001   // If we need to align it, do so. Otherwise, just copy the address
   12002   // to OverflowDestReg.
   12003   if (NeedsAlign) {
   12004     // Align the overflow address
   12005     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
   12006     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
   12007 
   12008     // aligned_addr = (addr + (align-1)) & ~(align-1)
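              // For example (illustrative values only): with Align = 16 and
              // addr = 0x1007, aligned_addr = (0x1007 + 15) & ~15 = 0x1010.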
   12009     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
   12010       .addReg(OverflowAddrReg)
   12011       .addImm(Align-1);
   12012 
   12013     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
   12014       .addReg(TmpReg)
   12015       .addImm(~(uint64_t)(Align-1));
   12016   } else {
   12017     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
   12018       .addReg(OverflowAddrReg);
   12019   }
   12020 
   12021   // Compute the next overflow address after this argument.
   12022   // (the overflow address should be kept 8-byte aligned)
   12023   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
   12024   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
   12025     .addReg(OverflowDestReg)
   12026     .addImm(ArgSizeA8);
   12027 
   12028   // Store the new overflow address.
   12029   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
   12030     .addOperand(Base)
   12031     .addOperand(Scale)
   12032     .addOperand(Index)
   12033     .addDisp(Disp, 8)
   12034     .addOperand(Segment)
   12035     .addReg(NextAddrReg)
   12036     .setMemRefs(MMOBegin, MMOEnd);
   12037 
   12038   // If we branched, emit the PHI to the front of endMBB.
   12039   if (offsetMBB) {
   12040     BuildMI(*endMBB, endMBB->begin(), DL,
   12041             TII->get(X86::PHI), DestReg)
   12042       .addReg(OffsetDestReg).addMBB(offsetMBB)
   12043       .addReg(OverflowDestReg).addMBB(overflowMBB);
   12044   }
   12045 
   12046   // Erase the pseudo instruction
   12047   MI->eraseFromParent();
   12048 
   12049   return endMBB;
   12050 }
   12051 
   12052 MachineBasicBlock *
   12053 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   12054                                                  MachineInstr *MI,
   12055                                                  MachineBasicBlock *MBB) const {
   12056   // Emit code to save XMM registers to the stack. The ABI says that the
   12057   // number of registers to save is given in %al, so it's theoretically
   12058   // possible to do an indirect jump trick to avoid saving all of them,
   12059   // however this code takes a simpler approach and just executes all
   12060   // of the stores if %al is non-zero. It's less code, and it's probably
   12061   // easier on the hardware branch predictor, and stores aren't all that
   12062   // expensive anyway.
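            //
            // Schematically, the emitted control flow looks like this (a sketch,
            // not the exact machine code; register and offset names are
            // illustrative, and the test/branch is skipped on Win64):
            //
            //     test %al, %al
            //     je   EndMBB                 ; no vector arguments were passed
            //   XMMSaveMBB:
            //     movaps %xmm0, fpoff+0(save_area)
            //     movaps %xmm1, fpoff+16(save_area)
            //     ...
            //   EndMBB: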
   12063 
   12064   // Create the new basic blocks. One block contains all the XMM stores,
   12065   // and one block is the final destination regardless of whether any
   12066   // stores were performed.
   12067   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   12068   MachineFunction *F = MBB->getParent();
   12069   MachineFunction::iterator MBBIter = MBB;
   12070   ++MBBIter;
   12071   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
   12072   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
   12073   F->insert(MBBIter, XMMSaveMBB);
   12074   F->insert(MBBIter, EndMBB);
   12075 
   12076   // Transfer the remainder of MBB and its successor edges to EndMBB.
   12077   EndMBB->splice(EndMBB->begin(), MBB,
   12078                  llvm::next(MachineBasicBlock::iterator(MI)),
   12079                  MBB->end());
   12080   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
   12081 
   12082   // The original block will now fall through to the XMM save block.
   12083   MBB->addSuccessor(XMMSaveMBB);
   12084   // The XMMSaveMBB will fall through to the end block.
   12085   XMMSaveMBB->addSuccessor(EndMBB);
   12086 
   12087   // Now add the instructions.
   12088   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   12089   DebugLoc DL = MI->getDebugLoc();
   12090 
   12091   unsigned CountReg = MI->getOperand(0).getReg();
   12092   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
   12093   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
   12094 
   12095   if (!Subtarget->isTargetWin64()) {
   12096     // If %al is 0, branch around the XMM save block.
   12097     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
   12098     BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
   12099     MBB->addSuccessor(EndMBB);
   12100   }
   12101 
   12102   unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
   12103   // In the XMM save block, save all the XMM argument registers.
   12104   for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
   12105     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
   12106     MachineMemOperand *MMO =
   12107       F->getMachineMemOperand(
   12108           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
    12109           MachineMemOperand::MOStore,
    12110           /*Size=*/16, /*Align=*/16);
   12111     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
   12112       .addFrameIndex(RegSaveFrameIndex)
   12113       .addImm(/*Scale=*/1)
   12114       .addReg(/*IndexReg=*/0)
   12115       .addImm(/*Disp=*/Offset)
   12116       .addReg(/*Segment=*/0)
   12117       .addReg(MI->getOperand(i).getReg())
   12118       .addMemOperand(MMO);
   12119   }
   12120 
   12121   MI->eraseFromParent();   // The pseudo instruction is gone now.
   12122 
   12123   return EndMBB;
   12124 }
   12125 
   12126 // The EFLAGS operand of SelectItr might be missing a kill marker
   12127 // because there were multiple uses of EFLAGS, and ISel didn't know
   12128 // which to mark. Figure out whether SelectItr should have had a
   12129 // kill marker, and set it if it should. Returns the correct kill
   12130 // marker value.
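          //
          // Schematic example (not exact MIR syntax; instruction names are
          // illustrative):
          //
          //   CMP32rr  a, b          ; defines EFLAGS
          //   CMOV     ...           ; SelectItr, reads EFLAGS
          //   CMOV     ...           ; a later reader of EFLAGS -> no kill flag
          //
          // If instead the scan hits a redefinition of EFLAGS, or reaches the end
          // of the block with EFLAGS not live into any successor, the kill flag is
          // added to SelectItr here.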
   12131 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
   12132                                      MachineBasicBlock* BB,
   12133                                      const TargetRegisterInfo* TRI) {
   12134   // Scan forward through BB for a use/def of EFLAGS.
   12135   MachineBasicBlock::iterator miI(llvm::next(SelectItr));
   12136   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
   12137     const MachineInstr& mi = *miI;
   12138     if (mi.readsRegister(X86::EFLAGS))
   12139       return false;
   12140     if (mi.definesRegister(X86::EFLAGS))
   12141       break; // Should have kill-flag - update below.
   12142   }
   12143 
   12144   // If we hit the end of the block, check whether EFLAGS is live into a
   12145   // successor.
   12146   if (miI == BB->end()) {
   12147     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
   12148                                           sEnd = BB->succ_end();
   12149          sItr != sEnd; ++sItr) {
   12150       MachineBasicBlock* succ = *sItr;
   12151       if (succ->isLiveIn(X86::EFLAGS))
   12152         return false;
   12153     }
   12154   }
   12155 
   12156   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
   12157   // out. SelectMI should have a kill flag on EFLAGS.
   12158   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
   12159   return true;
   12160 }
   12161 
   12162 MachineBasicBlock *
   12163 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   12164                                      MachineBasicBlock *BB) const {
   12165   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   12166   DebugLoc DL = MI->getDebugLoc();
   12167 
   12168   // To "insert" a SELECT_CC instruction, we actually have to insert the
   12169   // diamond control-flow pattern.  The incoming instruction knows the
   12170   // destination vreg to set, the condition code register to branch on, the
   12171   // true/false values to select between, and a branch opcode to use.
   12172   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   12173   MachineFunction::iterator It = BB;
   12174   ++It;
   12175 
   12176   //  thisMBB:
   12177   //  ...
   12178   //   TrueVal = ...
   12179   //   cmpTY ccX, r1, r2
   12180   //   bCC copy1MBB
   12181   //   fallthrough --> copy0MBB
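            //
            // The resulting diamond (a sketch; the PHI built below merges the two
            // values in sinkMBB):
            //
            //      thisMBB
            //       |    \
            //       |   copy0MBB
            //       |    /
            //      sinkMBB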
   12182   MachineBasicBlock *thisMBB = BB;
   12183   MachineFunction *F = BB->getParent();
   12184   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
   12185   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
   12186   F->insert(It, copy0MBB);
   12187   F->insert(It, sinkMBB);
   12188 
   12189   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   12190   // live into the sink and copy blocks.
   12191   const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
   12192   if (!MI->killsRegister(X86::EFLAGS) &&
   12193       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
   12194     copy0MBB->addLiveIn(X86::EFLAGS);
   12195     sinkMBB->addLiveIn(X86::EFLAGS);
   12196   }
   12197 
   12198   // Transfer the remainder of BB and its successor edges to sinkMBB.
   12199   sinkMBB->splice(sinkMBB->begin(), BB,
   12200                   llvm::next(MachineBasicBlock::iterator(MI)),
   12201                   BB->end());
   12202   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
   12203 
   12204   // Add the true and fallthrough blocks as its successors.
   12205   BB->addSuccessor(copy0MBB);
   12206   BB->addSuccessor(sinkMBB);
   12207 
   12208   // Create the conditional branch instruction.
   12209   unsigned Opc =
   12210     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
   12211   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
   12212 
   12213   //  copy0MBB:
   12214   //   %FalseValue = ...
   12215   //   # fallthrough to sinkMBB
   12216   copy0MBB->addSuccessor(sinkMBB);
   12217 
   12218   //  sinkMBB:
   12219   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
   12220   //  ...
   12221   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   12222           TII->get(X86::PHI), MI->getOperand(0).getReg())
   12223     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
   12224     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
   12225 
   12226   MI->eraseFromParent();   // The pseudo instruction is gone now.
   12227   return sinkMBB;
   12228 }
   12229 
   12230 MachineBasicBlock *
   12231 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
   12232                                         bool Is64Bit) const {
   12233   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   12234   DebugLoc DL = MI->getDebugLoc();
   12235   MachineFunction *MF = BB->getParent();
   12236   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   12237 
   12238   assert(getTargetMachine().Options.EnableSegmentedStacks);
   12239 
   12240   unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
   12241   unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
   12242 
   12243   // BB:
   12244   //  ... [Till the alloca]
   12245   // If stacklet is not large enough, jump to mallocMBB
   12246   //
   12247   // bumpMBB:
   12248   //  Allocate by subtracting from RSP
   12249   //  Jump to continueMBB
   12250   //
   12251   // mallocMBB:
   12252   //  Allocate by call to runtime
   12253   //
   12254   // continueMBB:
   12255   //  ...
   12256   //  [rest of original BB]
   12257   //
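            // A rough C-level sketch of what is wired up below (assuming, as the
            // segmented-stack scheme does, that the thread-local slot at TlsOffset
            // holds the current stacklet's limit):
            //
            //   newSP = SP - size;
            //   if (stacklet_limit > newSP)                      // JG to mallocMBB
            //     ptr = __morestack_allocate_stack_space(size);  // mallocMBB
            //   else
            //     SP = ptr = newSP;                              // bumpMBB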
   12258 
   12259   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   12260   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   12261   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   12262 
   12263   MachineRegisterInfo &MRI = MF->getRegInfo();
   12264   const TargetRegisterClass *AddrRegClass =
   12265     getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
   12266 
   12267   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   12268     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   12269     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
   12270     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
   12271     sizeVReg = MI->getOperand(1).getReg(),
   12272     physSPReg = Is64Bit ? X86::RSP : X86::ESP;
   12273 
   12274   MachineFunction::iterator MBBIter = BB;
   12275   ++MBBIter;
   12276 
   12277   MF->insert(MBBIter, bumpMBB);
   12278   MF->insert(MBBIter, mallocMBB);
   12279   MF->insert(MBBIter, continueMBB);
   12280 
    12281   continueMBB->splice(continueMBB->begin(), BB,
    12282                       llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
   12283   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
   12284 
   12285   // Add code to the main basic block to check if the stack limit has been hit,
   12286   // and if so, jump to mallocMBB otherwise to bumpMBB.
   12287   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
   12288   BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
   12289     .addReg(tmpSPVReg).addReg(sizeVReg);
   12290   BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
   12291     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
   12292     .addReg(SPLimitVReg);
   12293   BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
   12294 
   12295   // bumpMBB simply decreases the stack pointer, since we know the current
   12296   // stacklet has enough space.
   12297   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
   12298     .addReg(SPLimitVReg);
   12299   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
   12300     .addReg(SPLimitVReg);
   12301   BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
   12302 
   12303   // Calls into a routine in libgcc to allocate more space from the heap.
   12304   const uint32_t *RegMask =
   12305     getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
   12306   if (Is64Bit) {
   12307     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
   12308       .addReg(sizeVReg);
   12309     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   12310       .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
   12311       .addRegMask(RegMask)
   12312       .addReg(X86::RAX, RegState::ImplicitDefine);
   12313   } else {
   12314     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
   12315       .addImm(12);
   12316     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
   12317     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
   12318       .addExternalSymbol("__morestack_allocate_stack_space")
   12319       .addRegMask(RegMask)
   12320       .addReg(X86::EAX, RegState::ImplicitDefine);
   12321   }
   12322 
   12323   if (!Is64Bit)
   12324     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
   12325       .addImm(16);
   12326 
   12327   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
   12328     .addReg(Is64Bit ? X86::RAX : X86::EAX);
   12329   BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
   12330 
   12331   // Set up the CFG correctly.
   12332   BB->addSuccessor(bumpMBB);
   12333   BB->addSuccessor(mallocMBB);
   12334   mallocMBB->addSuccessor(continueMBB);
   12335   bumpMBB->addSuccessor(continueMBB);
   12336 
   12337   // Take care of the PHI nodes.
   12338   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
   12339           MI->getOperand(0).getReg())
   12340     .addReg(mallocPtrVReg).addMBB(mallocMBB)
   12341     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
   12342 
   12343   // Delete the original pseudo instruction.
   12344   MI->eraseFromParent();
   12345 
   12346   // And we're done.
   12347   return continueMBB;
   12348 }
   12349 
   12350 MachineBasicBlock *
   12351 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
   12352                                           MachineBasicBlock *BB) const {
   12353   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   12354   DebugLoc DL = MI->getDebugLoc();
   12355 
   12356   assert(!Subtarget->isTargetEnvMacho());
   12357 
   12358   // The lowering is pretty easy: we're just emitting the call to _alloca.  The
   12359   // non-trivial part is impdef of ESP.
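            //
            // For the MSVCRT __chkstk path this amounts to, schematically (mnemonics
            // illustrative; the allocation size is assumed to already be in RAX):
            //
            //   call __chkstk        ; probes the new pages, leaves RSP untouched
            //   sub  rsp, rax        ; so the stack pointer is adjusted here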
   12360 
   12361   if (Subtarget->isTargetWin64()) {
   12362     if (Subtarget->isTargetCygMing()) {
   12363       // ___chkstk(Mingw64):
   12364       // Clobbers R10, R11, RAX and EFLAGS.
   12365       // Updates RSP.
   12366       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
   12367         .addExternalSymbol("___chkstk")
   12368         .addReg(X86::RAX, RegState::Implicit)
   12369         .addReg(X86::RSP, RegState::Implicit)
   12370         .addReg(X86::RAX, RegState::Define | RegState::Implicit)
   12371         .addReg(X86::RSP, RegState::Define | RegState::Implicit)
   12372         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
   12373     } else {
   12374       // __chkstk(MSVCRT): does not update stack pointer.
   12375       // Clobbers R10, R11 and EFLAGS.
   12376       // FIXME: RAX(allocated size) might be reused and not killed.
   12377       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
   12378         .addExternalSymbol("__chkstk")
   12379         .addReg(X86::RAX, RegState::Implicit)
   12380         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
    12381       // RAX holds the offset to be subtracted from RSP.
   12382       BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
   12383         .addReg(X86::RSP)
   12384         .addReg(X86::RAX);
   12385     }
   12386   } else {
   12387     const char *StackProbeSymbol =
   12388       Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
   12389 
   12390     BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
   12391       .addExternalSymbol(StackProbeSymbol)
   12392       .addReg(X86::EAX, RegState::Implicit)
   12393       .addReg(X86::ESP, RegState::Implicit)
   12394       .addReg(X86::EAX, RegState::Define | RegState::Implicit)
   12395       .addReg(X86::ESP, RegState::Define | RegState::Implicit)
   12396       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
   12397   }
   12398 
   12399   MI->eraseFromParent();   // The pseudo instruction is gone now.
   12400   return BB;
   12401 }
   12402 
   12403 MachineBasicBlock *
   12404 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
   12405                                       MachineBasicBlock *BB) const {
   12406   // This is pretty easy.  We're taking the value that we received from
   12407   // our load from the relocation, sticking it in either RDI (x86-64)
   12408   // or EAX and doing an indirect call.  The return value will then
   12409   // be in the normal return register.
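            //
            // Schematically, for the 64-bit case below (a sketch; the symbol name and
            // syntax are only illustrative of Darwin's TLV calling convention):
            //
            //   movq  _var@TLVP(%rip), %rdi
            //   callq *(%rdi)              ; the variable's address comes back in %rax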
   12410   const X86InstrInfo *TII
   12411     = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
   12412   DebugLoc DL = MI->getDebugLoc();
   12413   MachineFunction *F = BB->getParent();
   12414 
   12415   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
   12416   assert(MI->getOperand(3).isGlobal() && "This should be a global");
   12417 
   12418   // Get a register mask for the lowered call.
   12419   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   12420   // proper register mask.
   12421   const uint32_t *RegMask =
   12422     getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
   12423   if (Subtarget->is64Bit()) {
   12424     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   12425                                       TII->get(X86::MOV64rm), X86::RDI)
   12426     .addReg(X86::RIP)
   12427     .addImm(0).addReg(0)
   12428     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   12429                       MI->getOperand(3).getTargetFlags())
   12430     .addReg(0);
   12431     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
   12432     addDirectMem(MIB, X86::RDI);
   12433     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
   12434   } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
   12435     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   12436                                       TII->get(X86::MOV32rm), X86::EAX)
   12437     .addReg(0)
   12438     .addImm(0).addReg(0)
   12439     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   12440                       MI->getOperand(3).getTargetFlags())
   12441     .addReg(0);
   12442     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   12443     addDirectMem(MIB, X86::EAX);
   12444     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   12445   } else {
   12446     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   12447                                       TII->get(X86::MOV32rm), X86::EAX)
   12448     .addReg(TII->getGlobalBaseReg(F))
   12449     .addImm(0).addReg(0)
   12450     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   12451                       MI->getOperand(3).getTargetFlags())
   12452     .addReg(0);
   12453     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   12454     addDirectMem(MIB, X86::EAX);
   12455     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   12456   }
   12457 
   12458   MI->eraseFromParent(); // The pseudo instruction is gone now.
   12459   return BB;
   12460 }
   12461 
   12462 MachineBasicBlock *
   12463 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   12464                                                MachineBasicBlock *BB) const {
   12465   switch (MI->getOpcode()) {
   12466   default: llvm_unreachable("Unexpected instr type to insert");
   12467   case X86::TAILJMPd64:
   12468   case X86::TAILJMPr64:
   12469   case X86::TAILJMPm64:
   12470     llvm_unreachable("TAILJMP64 would not be touched here.");
   12471   case X86::TCRETURNdi64:
   12472   case X86::TCRETURNri64:
   12473   case X86::TCRETURNmi64:
   12474     return BB;
   12475   case X86::WIN_ALLOCA:
   12476     return EmitLoweredWinAlloca(MI, BB);
   12477   case X86::SEG_ALLOCA_32:
   12478     return EmitLoweredSegAlloca(MI, BB, false);
   12479   case X86::SEG_ALLOCA_64:
   12480     return EmitLoweredSegAlloca(MI, BB, true);
   12481   case X86::TLSCall_32:
   12482   case X86::TLSCall_64:
   12483     return EmitLoweredTLSCall(MI, BB);
   12484   case X86::CMOV_GR8:
   12485   case X86::CMOV_FR32:
   12486   case X86::CMOV_FR64:
   12487   case X86::CMOV_V4F32:
   12488   case X86::CMOV_V2F64:
   12489   case X86::CMOV_V2I64:
   12490   case X86::CMOV_V8F32:
   12491   case X86::CMOV_V4F64:
   12492   case X86::CMOV_V4I64:
   12493   case X86::CMOV_GR16:
   12494   case X86::CMOV_GR32:
   12495   case X86::CMOV_RFP32:
   12496   case X86::CMOV_RFP64:
   12497   case X86::CMOV_RFP80:
   12498     return EmitLoweredSelect(MI, BB);
   12499 
   12500   case X86::FP32_TO_INT16_IN_MEM:
   12501   case X86::FP32_TO_INT32_IN_MEM:
   12502   case X86::FP32_TO_INT64_IN_MEM:
   12503   case X86::FP64_TO_INT16_IN_MEM:
   12504   case X86::FP64_TO_INT32_IN_MEM:
   12505   case X86::FP64_TO_INT64_IN_MEM:
   12506   case X86::FP80_TO_INT16_IN_MEM:
   12507   case X86::FP80_TO_INT32_IN_MEM:
   12508   case X86::FP80_TO_INT64_IN_MEM: {
   12509     const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   12510     DebugLoc DL = MI->getDebugLoc();
   12511 
   12512     // Change the floating point control register to use "round towards zero"
   12513     // mode when truncating to an integer value.
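              //
              // The sequence built below is, schematically (mnemonics illustrative):
              //
              //   fnstcw  [slot]             ; save the current control word
              //   movw    ax, [slot]         ; remember it in OldCW
              //   movw    [slot], 0xC7F      ; RC = 11b, i.e. round toward zero
              //   fldcw   [slot]             ; switch the FPU to truncation mode
              //   movw    [slot], ax         ; restore the saved image in memory
              //   fistp   <dest>             ; the actual conversion/store (IST_Fp*)
              //   fldcw   [slot]             ; reload the original rounding mode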
   12514     MachineFunction *F = BB->getParent();
   12515     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
   12516     addFrameReference(BuildMI(*BB, MI, DL,
   12517                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
   12518 
   12519     // Load the old value of the high byte of the control word...
   12520     unsigned OldCW =
   12521       F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
   12522     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
   12523                       CWFrameIdx);
   12524 
   12525     // Set the high part to be round to zero...
   12526     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
   12527       .addImm(0xC7F);
   12528 
   12529     // Reload the modified control word now...
   12530     addFrameReference(BuildMI(*BB, MI, DL,
   12531                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   12532 
   12533     // Restore the memory image of control word to original value
   12534     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
   12535       .addReg(OldCW);
   12536 
   12537     // Get the X86 opcode to use.
   12538     unsigned Opc;
   12539     switch (MI->getOpcode()) {
   12540     default: llvm_unreachable("illegal opcode!");
   12541     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
   12542     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
   12543     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
   12544     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
   12545     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
   12546     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
   12547     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
   12548     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
   12549     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
   12550     }
   12551 
   12552     X86AddressMode AM;
   12553     MachineOperand &Op = MI->getOperand(0);
   12554     if (Op.isReg()) {
   12555       AM.BaseType = X86AddressMode::RegBase;
   12556       AM.Base.Reg = Op.getReg();
   12557     } else {
   12558       AM.BaseType = X86AddressMode::FrameIndexBase;
   12559       AM.Base.FrameIndex = Op.getIndex();
   12560     }
   12561     Op = MI->getOperand(1);
   12562     if (Op.isImm())
   12563       AM.Scale = Op.getImm();
   12564     Op = MI->getOperand(2);
   12565     if (Op.isImm())
   12566       AM.IndexReg = Op.getImm();
   12567     Op = MI->getOperand(3);
   12568     if (Op.isGlobal()) {
   12569       AM.GV = Op.getGlobal();
   12570     } else {
   12571       AM.Disp = Op.getImm();
   12572     }
   12573     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
   12574                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
   12575 
   12576     // Reload the original control word now.
   12577     addFrameReference(BuildMI(*BB, MI, DL,
   12578                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   12579 
   12580     MI->eraseFromParent();   // The pseudo instruction is gone now.
   12581     return BB;
   12582   }
   12583     // String/text processing lowering.
   12584   case X86::PCMPISTRM128REG:
   12585   case X86::VPCMPISTRM128REG:
   12586     return EmitPCMP(MI, BB, 3, false /* in-mem */);
   12587   case X86::PCMPISTRM128MEM:
   12588   case X86::VPCMPISTRM128MEM:
   12589     return EmitPCMP(MI, BB, 3, true /* in-mem */);
   12590   case X86::PCMPESTRM128REG:
   12591   case X86::VPCMPESTRM128REG:
    12592     return EmitPCMP(MI, BB, 5, false /* in-mem */);
    12593   case X86::PCMPESTRM128MEM:
    12594   case X86::VPCMPESTRM128MEM:
    12595     return EmitPCMP(MI, BB, 5, true /* in-mem */);
   12596 
   12597     // Thread synchronization.
   12598   case X86::MONITOR:
   12599     return EmitMonitor(MI, BB);
   12600   case X86::MWAIT:
   12601     return EmitMwait(MI, BB);
   12602 
   12603     // Atomic Lowering.
   12604   case X86::ATOMAND32:
   12605     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
   12606                                                X86::AND32ri, X86::MOV32rm,
   12607                                                X86::LCMPXCHG32,
   12608                                                X86::NOT32r, X86::EAX,
   12609                                                X86::GR32RegisterClass);
   12610   case X86::ATOMOR32:
   12611     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
   12612                                                X86::OR32ri, X86::MOV32rm,
   12613                                                X86::LCMPXCHG32,
   12614                                                X86::NOT32r, X86::EAX,
   12615                                                X86::GR32RegisterClass);
   12616   case X86::ATOMXOR32:
   12617     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
   12618                                                X86::XOR32ri, X86::MOV32rm,
   12619                                                X86::LCMPXCHG32,
   12620                                                X86::NOT32r, X86::EAX,
   12621                                                X86::GR32RegisterClass);
   12622   case X86::ATOMNAND32:
   12623     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
   12624                                                X86::AND32ri, X86::MOV32rm,
   12625                                                X86::LCMPXCHG32,
   12626                                                X86::NOT32r, X86::EAX,
   12627                                                X86::GR32RegisterClass, true);
   12628   case X86::ATOMMIN32:
   12629     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
   12630   case X86::ATOMMAX32:
   12631     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
   12632   case X86::ATOMUMIN32:
   12633     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
   12634   case X86::ATOMUMAX32:
   12635     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
   12636 
   12637   case X86::ATOMAND16:
   12638     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
   12639                                                X86::AND16ri, X86::MOV16rm,
   12640                                                X86::LCMPXCHG16,
   12641                                                X86::NOT16r, X86::AX,
   12642                                                X86::GR16RegisterClass);
   12643   case X86::ATOMOR16:
   12644     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
   12645                                                X86::OR16ri, X86::MOV16rm,
   12646                                                X86::LCMPXCHG16,
   12647                                                X86::NOT16r, X86::AX,
   12648                                                X86::GR16RegisterClass);
   12649   case X86::ATOMXOR16:
   12650     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
   12651                                                X86::XOR16ri, X86::MOV16rm,
   12652                                                X86::LCMPXCHG16,
   12653                                                X86::NOT16r, X86::AX,
   12654                                                X86::GR16RegisterClass);
   12655   case X86::ATOMNAND16:
   12656     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
   12657                                                X86::AND16ri, X86::MOV16rm,
   12658                                                X86::LCMPXCHG16,
   12659                                                X86::NOT16r, X86::AX,
   12660                                                X86::GR16RegisterClass, true);
   12661   case X86::ATOMMIN16:
   12662     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
   12663   case X86::ATOMMAX16:
   12664     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
   12665   case X86::ATOMUMIN16:
   12666     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
   12667   case X86::ATOMUMAX16:
   12668     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
   12669 
   12670   case X86::ATOMAND8:
   12671     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
   12672                                                X86::AND8ri, X86::MOV8rm,
   12673                                                X86::LCMPXCHG8,
   12674                                                X86::NOT8r, X86::AL,
   12675                                                X86::GR8RegisterClass);
   12676   case X86::ATOMOR8:
   12677     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
   12678                                                X86::OR8ri, X86::MOV8rm,
   12679                                                X86::LCMPXCHG8,
   12680                                                X86::NOT8r, X86::AL,
   12681                                                X86::GR8RegisterClass);
   12682   case X86::ATOMXOR8:
   12683     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
   12684                                                X86::XOR8ri, X86::MOV8rm,
   12685                                                X86::LCMPXCHG8,
   12686                                                X86::NOT8r, X86::AL,
   12687                                                X86::GR8RegisterClass);
   12688   case X86::ATOMNAND8:
   12689     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
   12690                                                X86::AND8ri, X86::MOV8rm,
   12691                                                X86::LCMPXCHG8,
   12692                                                X86::NOT8r, X86::AL,
   12693                                                X86::GR8RegisterClass, true);
   12694   // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
    12695   // This group is for 64-bit hosts.
   12696   case X86::ATOMAND64:
   12697     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
   12698                                                X86::AND64ri32, X86::MOV64rm,
   12699                                                X86::LCMPXCHG64,
   12700                                                X86::NOT64r, X86::RAX,
   12701                                                X86::GR64RegisterClass);
   12702   case X86::ATOMOR64:
   12703     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
   12704                                                X86::OR64ri32, X86::MOV64rm,
   12705                                                X86::LCMPXCHG64,
   12706                                                X86::NOT64r, X86::RAX,
   12707                                                X86::GR64RegisterClass);
   12708   case X86::ATOMXOR64:
   12709     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
   12710                                                X86::XOR64ri32, X86::MOV64rm,
   12711                                                X86::LCMPXCHG64,
   12712                                                X86::NOT64r, X86::RAX,
   12713                                                X86::GR64RegisterClass);
   12714   case X86::ATOMNAND64:
   12715     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
   12716                                                X86::AND64ri32, X86::MOV64rm,
   12717                                                X86::LCMPXCHG64,
   12718                                                X86::NOT64r, X86::RAX,
   12719                                                X86::GR64RegisterClass, true);
   12720   case X86::ATOMMIN64:
   12721     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
   12722   case X86::ATOMMAX64:
   12723     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
   12724   case X86::ATOMUMIN64:
   12725     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
   12726   case X86::ATOMUMAX64:
   12727     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
   12728 
   12729   // This group does 64-bit operations on a 32-bit host.
   12730   case X86::ATOMAND6432:
   12731     return EmitAtomicBit6432WithCustomInserter(MI, BB,
   12732                                                X86::AND32rr, X86::AND32rr,
   12733                                                X86::AND32ri, X86::AND32ri,
   12734                                                false);
   12735   case X86::ATOMOR6432:
   12736     return EmitAtomicBit6432WithCustomInserter(MI, BB,
   12737                                                X86::OR32rr, X86::OR32rr,
   12738                                                X86::OR32ri, X86::OR32ri,
   12739                                                false);
   12740   case X86::ATOMXOR6432:
   12741     return EmitAtomicBit6432WithCustomInserter(MI, BB,
   12742                                                X86::XOR32rr, X86::XOR32rr,
   12743                                                X86::XOR32ri, X86::XOR32ri,
   12744                                                false);
   12745   case X86::ATOMNAND6432:
   12746     return EmitAtomicBit6432WithCustomInserter(MI, BB,
   12747                                                X86::AND32rr, X86::AND32rr,
   12748                                                X86::AND32ri, X86::AND32ri,
   12749                                                true);
   12750   case X86::ATOMADD6432:
   12751     return EmitAtomicBit6432WithCustomInserter(MI, BB,
   12752                                                X86::ADD32rr, X86::ADC32rr,
   12753                                                X86::ADD32ri, X86::ADC32ri,
   12754                                                false);
   12755   case X86::ATOMSUB6432:
   12756     return EmitAtomicBit6432WithCustomInserter(MI, BB,
   12757                                                X86::SUB32rr, X86::SBB32rr,
   12758                                                X86::SUB32ri, X86::SBB32ri,
   12759                                                false);
   12760   case X86::ATOMSWAP6432:
   12761     return EmitAtomicBit6432WithCustomInserter(MI, BB,
   12762                                                X86::MOV32rr, X86::MOV32rr,
   12763                                                X86::MOV32ri, X86::MOV32ri,
   12764                                                false);
   12765   case X86::VASTART_SAVE_XMM_REGS:
   12766     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
   12767 
   12768   case X86::VAARG_64:
   12769     return EmitVAARG64WithCustomInserter(MI, BB);
   12770   }
   12771 }
   12772 
   12773 //===----------------------------------------------------------------------===//
   12774 //                           X86 Optimization Hooks
   12775 //===----------------------------------------------------------------------===//
   12776 
   12777 void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   12778                                                        APInt &KnownZero,
   12779                                                        APInt &KnownOne,
   12780                                                        const SelectionDAG &DAG,
   12781                                                        unsigned Depth) const {
   12782   unsigned BitWidth = KnownZero.getBitWidth();
   12783   unsigned Opc = Op.getOpcode();
   12784   assert((Opc >= ISD::BUILTIN_OP_END ||
   12785           Opc == ISD::INTRINSIC_WO_CHAIN ||
   12786           Opc == ISD::INTRINSIC_W_CHAIN ||
   12787           Opc == ISD::INTRINSIC_VOID) &&
   12788          "Should use MaskedValueIsZero if you don't know whether Op"
   12789          " is a target node!");
   12790 
   12791   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
   12792   switch (Opc) {
   12793   default: break;
   12794   case X86ISD::ADD:
   12795   case X86ISD::SUB:
   12796   case X86ISD::ADC:
   12797   case X86ISD::SBB:
   12798   case X86ISD::SMUL:
   12799   case X86ISD::UMUL:
   12800   case X86ISD::INC:
   12801   case X86ISD::DEC:
   12802   case X86ISD::OR:
   12803   case X86ISD::XOR:
   12804   case X86ISD::AND:
   12805     // These nodes' second result is a boolean.
   12806     if (Op.getResNo() == 0)
   12807       break;
   12808     // Fallthrough
   12809   case X86ISD::SETCC:
   12810     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
   12811     break;
   12812   case ISD::INTRINSIC_WO_CHAIN: {
   12813     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   12814     unsigned NumLoBits = 0;
   12815     switch (IntId) {
   12816     default: break;
   12817     case Intrinsic::x86_sse_movmsk_ps:
   12818     case Intrinsic::x86_avx_movmsk_ps_256:
   12819     case Intrinsic::x86_sse2_movmsk_pd:
   12820     case Intrinsic::x86_avx_movmsk_pd_256:
   12821     case Intrinsic::x86_mmx_pmovmskb:
   12822     case Intrinsic::x86_sse2_pmovmskb_128:
   12823     case Intrinsic::x86_avx2_pmovmskb: {
   12824       // High bits of movmskp{s|d}, pmovmskb are known zero.
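                // For example, movmskps writes only a 4-bit mask into its 32-bit
                // result, so bits 4..31 are guaranteed zero; 128-bit pmovmskb
                // similarly leaves bits 16..31 zero.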
   12825       switch (IntId) {
   12826         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   12827         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
   12828         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
   12829         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
   12830         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
   12831         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
   12832         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
   12833         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
   12834       }
   12835       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
   12836       break;
   12837     }
   12838     }
   12839     break;
   12840   }
   12841   }
   12842 }
   12843 
   12844 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
   12845                                                          unsigned Depth) const {
   12846   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
   12847   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
   12848     return Op.getValueType().getScalarType().getSizeInBits();
   12849 
   12850   // Fallback case.
   12851   return 1;
   12852 }
   12853 
   12854 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
   12855 /// node is a GlobalAddress + offset.
   12856 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   12857                                        const GlobalValue* &GA,
   12858                                        int64_t &Offset) const {
   12859   if (N->getOpcode() == X86ISD::Wrapper) {
   12860     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
   12861       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
   12862       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
   12863       return true;
   12864     }
   12865   }
   12866   return TargetLowering::isGAPlusOffset(N, GA, Offset);
   12867 }
   12868 
   12869 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
    12870 /// same as extracting the high 128-bit part of a 256-bit vector and then
    12871 /// inserting the result into the low part of a new 256-bit vector.
   12872 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
   12873   EVT VT = SVOp->getValueType(0);
   12874   int NumElems = VT.getVectorNumElements();
   12875 
   12876   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   12877   for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
   12878     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
   12879         SVOp->getMaskElt(j) >= 0)
   12880       return false;
   12881 
   12882   return true;
   12883 }
   12884 
   12885 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
    12886 /// same as extracting the low 128-bit part of a 256-bit vector and then
    12887 /// inserting the result into the high part of a new 256-bit vector.
   12888 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
   12889   EVT VT = SVOp->getValueType(0);
   12890   int NumElems = VT.getVectorNumElements();
   12891 
   12892   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   12893   for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
   12894     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
   12895         SVOp->getMaskElt(j) >= 0)
   12896       return false;
   12897 
   12898   return true;
   12899 }
   12900 
   12901 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
   12902 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
   12903                                         TargetLowering::DAGCombinerInfo &DCI,
   12904                                         const X86Subtarget* Subtarget) {
   12905   DebugLoc dl = N->getDebugLoc();
   12906   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   12907   SDValue V1 = SVOp->getOperand(0);
   12908   SDValue V2 = SVOp->getOperand(1);
   12909   EVT VT = SVOp->getValueType(0);
   12910   int NumElems = VT.getVectorNumElements();
   12911 
   12912   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
   12913       V2.getOpcode() == ISD::CONCAT_VECTORS) {
   12914     //
   12915     //                   0,0,0,...
   12916     //                      |
   12917     //    V      UNDEF    BUILD_VECTOR    UNDEF
   12918     //     \      /           \           /
   12919     //  CONCAT_VECTOR         CONCAT_VECTOR
   12920     //         \                  /
   12921     //          \                /
   12922     //          RESULT: V + zero extended
   12923     //
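              // Concretely, for v8i32 (an illustrative instance of the pattern above):
              // shuffling (concat X, undef) with (concat zeros, undef) using mask
              // <0,1,2,3,8,8,8,8> is just X zero-extended to 256 bits.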
   12924     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
   12925         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
   12926         V1.getOperand(1).getOpcode() != ISD::UNDEF)
   12927       return SDValue();
   12928 
   12929     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
   12930       return SDValue();
   12931 
    12932     // To match the shuffle mask, the first half of the mask must select the
    12933     // elements of the first vector in order, and the second half must all
    12934     // select the first element of the second vector (i.e. a splat of it).
   12935     for (int i = 0; i < NumElems/2; ++i)
   12936       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
   12937           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
   12938         return SDValue();
   12939 
   12940     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
   12941     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
   12942       SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
   12943       SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
   12944       SDValue ResNode =
   12945         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
   12946                                 Ld->getMemoryVT(),
   12947                                 Ld->getPointerInfo(),
   12948                                 Ld->getAlignment(),
   12949                                 false/*isVolatile*/, true/*ReadMem*/,
   12950                                 false/*WriteMem*/);
   12951       return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
   12952     }
   12953 
   12954     // Emit a zeroed vector and insert the desired subvector on its
   12955     // first half.
   12956     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   12957     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
   12958                          DAG.getConstant(0, MVT::i32), DAG, dl);
   12959     return DCI.CombineTo(N, InsV);
   12960   }
   12961 
   12962   //===--------------------------------------------------------------------===//
   12963   // Combine some shuffles into subvector extracts and inserts:
   12964   //
   12965 
   12966   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   12967   if (isShuffleHigh128VectorInsertLow(SVOp)) {
   12968     SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
   12969                                     DAG, dl);
   12970     SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
   12971                                       V, DAG.getConstant(0, MVT::i32), DAG, dl);
   12972     return DCI.CombineTo(N, InsV);
   12973   }
   12974 
   12975   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   12976   if (isShuffleLow128VectorInsertHigh(SVOp)) {
   12977     SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
   12978     SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
   12979                              V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
   12980     return DCI.CombineTo(N, InsV);
   12981   }
   12982 
   12983   return SDValue();
   12984 }
   12985 
   12986 /// PerformShuffleCombine - Performs several different shuffle combines.
   12987 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   12988                                      TargetLowering::DAGCombinerInfo &DCI,
   12989                                      const X86Subtarget *Subtarget) {
   12990   DebugLoc dl = N->getDebugLoc();
   12991   EVT VT = N->getValueType(0);
   12992 
   12993   // Don't create instructions with illegal types after legalize types has run.
   12994   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   12995   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
   12996     return SDValue();
   12997 
   12998   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
   12999   if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
   13000       N->getOpcode() == ISD::VECTOR_SHUFFLE)
   13001     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
   13002 
    13003   // Only handle 128-bit wide vectors from here on.
   13004   if (VT.getSizeInBits() != 128)
   13005     return SDValue();
   13006 
   13007   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
   13008   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
   13009   // consecutive, non-overlapping, and in the right order.
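            // For example (a sketch): four scalar loads from p, p+4, p+8 and p+12
            // feeding a v4i32 shuffle with mask <0, 1, 2, 3> can be replaced by a
            // single 16-byte load from p.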
   13010   SmallVector<SDValue, 16> Elts;
   13011   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
   13012     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
   13013 
   13014   return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
   13015 }
   13016 
   13017 
    13018 /// PerformTruncateCombine - Converts a truncate operation into
    13019 /// a sequence of vector shuffle operations.
    13020 /// This is possible when we truncate a 256-bit vector to a 128-bit vector.
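          /// For example (a sketch of the v4i64 -> v4i32 path below): each v2i64 half
          /// is bitcast to v4i32 and shuffled so its two truncated elements land in
          /// the low dwords (PSHUFD), and the two halves are then merged with a
          /// <0, 1, 4, 5> shuffle (MOVLHPS).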
   13021 
   13022 SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
   13023                                                   DAGCombinerInfo &DCI) const {
   13024   if (!DCI.isBeforeLegalizeOps())
   13025     return SDValue();
   13026 
   13027   if (!Subtarget->hasAVX()) return SDValue();
   13028 
   13029   EVT VT = N->getValueType(0);
   13030   SDValue Op = N->getOperand(0);
   13031   EVT OpVT = Op.getValueType();
   13032   DebugLoc dl = N->getDebugLoc();
   13033 
   13034   if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
   13035 
   13036     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
   13037                           DAG.getIntPtrConstant(0));
   13038 
   13039     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
   13040                           DAG.getIntPtrConstant(2));
   13041 
   13042     OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
   13043     OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
   13044 
   13045     // PSHUFD
   13046     int ShufMask1[] = {0, 2, 0, 0};
   13047 
   13048     OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
   13049                                 ShufMask1);
   13050     OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
   13051                                 ShufMask1);
   13052 
   13053     // MOVLHPS
   13054     int ShufMask2[] = {0, 1, 4, 5};
   13055 
   13056     return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
   13057   }
   13058   if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
   13059 
   13060     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
   13061                           DAG.getIntPtrConstant(0));
   13062 
   13063     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
   13064                           DAG.getIntPtrConstant(4));
   13065 
   13066     OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
   13067     OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
   13068 
   13069     // PSHUFB
   13070     int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
   13071                       -1, -1, -1, -1, -1, -1, -1, -1};
   13072 
   13073     OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
   13074                                 DAG.getUNDEF(MVT::v16i8),
   13075                                 ShufMask1);
   13076     OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
   13077                                 DAG.getUNDEF(MVT::v16i8),
   13078                                 ShufMask1);
   13079 
   13080     OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
   13081     OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
   13082 
   13083     // MOVLHPS
   13084     int ShufMask2[] = {0, 1, 4, 5};
   13085 
   13086     SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
   13087     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
   13088   }
   13089 
   13090   return SDValue();
   13091 }
   13092 
   13093 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
   13094 /// specific shuffle of a load can be folded into a single element load.
   13095 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
    13096 /// shuffles have been custom lowered, so we need to handle those here.
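          /// For example (a sketch): extractelement (X86-specific shuffle of load L), C
          /// is transformed back into a plain vector_shuffle of L so the generic
          /// combiner can turn it into a scalar load of element C, provided L has no
          /// other uses and any bitcast keeps the load type and alignment legal.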
   13097 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   13098                                          TargetLowering::DAGCombinerInfo &DCI) {
   13099   if (DCI.isBeforeLegalizeOps())
   13100     return SDValue();
   13101 
   13102   SDValue InVec = N->getOperand(0);
   13103   SDValue EltNo = N->getOperand(1);
   13104 
   13105   if (!isa<ConstantSDNode>(EltNo))
   13106     return SDValue();
   13107 
   13108   EVT VT = InVec.getValueType();
   13109 
   13110   bool HasShuffleIntoBitcast = false;
   13111   if (InVec.getOpcode() == ISD::BITCAST) {
   13112     // Don't duplicate a load with other uses.
   13113     if (!InVec.hasOneUse())
   13114       return SDValue();
   13115     EVT BCVT = InVec.getOperand(0).getValueType();
   13116     if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
   13117       return SDValue();
   13118     InVec = InVec.getOperand(0);
   13119     HasShuffleIntoBitcast = true;
   13120   }
   13121 
   13122   if (!isTargetShuffle(InVec.getOpcode()))
   13123     return SDValue();
   13124 
   13125   // Don't duplicate a load with other uses.
   13126   if (!InVec.hasOneUse())
   13127     return SDValue();
   13128 
   13129   SmallVector<int, 16> ShuffleMask;
   13130   bool UnaryShuffle;
   13131   if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle))
   13132     return SDValue();
   13133 
    13134   // Select the input vector, guarding against an out-of-range extract index.
   13135   unsigned NumElems = VT.getVectorNumElements();
   13136   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   13137   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
   13138   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
   13139                                          : InVec.getOperand(1);
   13140 
   13141   // If inputs to shuffle are the same for both ops, then allow 2 uses
   13142   unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
   13143 
   13144   if (LdNode.getOpcode() == ISD::BITCAST) {
   13145     // Don't duplicate a load with other uses.
   13146     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
   13147       return SDValue();
   13148 
   13149     AllowedUses = 1; // only allow 1 load use if we have a bitcast
   13150     LdNode = LdNode.getOperand(0);
   13151   }
   13152 
   13153   if (!ISD::isNormalLoad(LdNode.getNode()))
   13154     return SDValue();
   13155 
   13156   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
   13157 
    13158   if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
   13159     return SDValue();
   13160 
   13161   if (HasShuffleIntoBitcast) {
    13162     // If there's a bitcast before the shuffle, check whether the load's
    13163     // type and alignment are valid.
   13164     unsigned Align = LN0->getAlignment();
   13165     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13166     unsigned NewAlign = TLI.getTargetData()->
   13167       getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
   13168 
   13169     if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
   13170       return SDValue();
   13171   }
   13172 
    13173   // All checks passed, so transform back to a vector_shuffle so that the
    13174   // DAG combiner can finish the job.
   13175   DebugLoc dl = N->getDebugLoc();
   13176 
    13177   // Create a shuffle node, taking into account that it may be a unary shuffle.
   13178   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
   13179   Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
   13180                                  InVec.getOperand(0), Shuffle,
   13181                                  &ShuffleMask[0]);
   13182   Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
   13183   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
   13184                      EltNo);
   13185 }
   13186 
    13187 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
    13188 /// generation and convert it from a sequence of shuffles and extracts into a
    13189 /// single store plus scalar loads that extract the elements.
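          /// For example, four sign- or zero-extended extracts of a v4i32 value become
          /// a single store to a stack slot followed by four scalar i32 loads at
          /// offsets 0, 4, 8 and 12 from that slot.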
   13190 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   13191                                          TargetLowering::DAGCombinerInfo &DCI) {
   13192   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
   13193   if (NewOp.getNode())
   13194     return NewOp;
   13195 
   13196   SDValue InputVector = N->getOperand(0);
   13197 
    13198   // Only operate on vectors of 4 elements, where the alternative shuffling
    13199   // becomes more expensive.
   13200   if (InputVector.getValueType() != MVT::v4i32)
   13201     return SDValue();
   13202 
   13203   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
   13204   // single use which is a sign-extend or zero-extend, and all elements are
   13205   // used.
   13206   SmallVector<SDNode *, 4> Uses;
   13207   unsigned ExtractedElements = 0;
   13208   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
   13209        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
   13210     if (UI.getUse().getResNo() != InputVector.getResNo())
   13211       return SDValue();
   13212 
   13213     SDNode *Extract = *UI;
   13214     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   13215       return SDValue();
   13216 
   13217     if (Extract->getValueType(0) != MVT::i32)
   13218       return SDValue();
   13219     if (!Extract->hasOneUse())
   13220       return SDValue();
   13221     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
   13222         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
   13223       return SDValue();
   13224     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
   13225       return SDValue();
   13226 
   13227     // Record which element was extracted.
   13228     ExtractedElements |=
   13229       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
   13230 
   13231     Uses.push_back(Extract);
   13232   }
   13233 
   13234   // If not all the elements were used, this may not be worthwhile.
   13235   if (ExtractedElements != 15)
   13236     return SDValue();
   13237 
   13238   // Ok, we've now decided to do the transformation.
   13239   DebugLoc dl = InputVector.getDebugLoc();
   13240 
   13241   // Store the value to a temporary stack slot.
   13242   SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
   13243   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
   13244                             MachinePointerInfo(), false, false, 0);
   13245 
   13246   // Replace each use (extract) with a load of the appropriate element.
   13247   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
   13248        UE = Uses.end(); UI != UE; ++UI) {
   13249     SDNode *Extract = *UI;
   13250 
    13251     // Compute the element's address.
   13252     SDValue Idx = Extract->getOperand(1);
   13253     unsigned EltSize =
   13254         InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
   13255     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
   13256     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13257     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
   13258 
   13259     SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
   13260                                      StackPtr, OffsetVal);
   13261 
   13262     // Load the scalar.
   13263     SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
   13264                                      ScalarAddr, MachinePointerInfo(),
   13265                                      false, false, false, 0);
   13266 
    13267     // Replace the extract with the load.
   13268     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
   13269   }
   13270 
   13271   // The replacement was made in place; don't return anything.
   13272   return SDValue();
   13273 }
   13274 
   13275 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
   13276 /// nodes.
   13277 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   13278                                     TargetLowering::DAGCombinerInfo &DCI,
   13279                                     const X86Subtarget *Subtarget) {
   13280 
   13281 
   13282   DebugLoc DL = N->getDebugLoc();
   13283   SDValue Cond = N->getOperand(0);
   13284   // Get the LHS/RHS of the select.
   13285   SDValue LHS = N->getOperand(1);
   13286   SDValue RHS = N->getOperand(2);
   13287   EVT VT = LHS.getValueType();
   13288 
   13289   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   13290   // instructions match the semantics of the common C idiom x<y?x:y but not
   13291   // x<=y?x:y, because of how they handle negative zero (which can be
   13292   // ignored in unsafe-math mode).
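            // For example, (select (setcc olt x, y), x, y) on f32 operands becomes
            // X86ISD::FMIN, which matches MINSS; the unordered and non-strict variants
            // below need the NaN / negative-zero checks first.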
   13293   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
   13294       VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
   13295       (Subtarget->hasSSE2() ||
   13296        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
   13297     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   13298 
   13299     unsigned Opcode = 0;
   13300     // Check for x CC y ? x : y.
   13301     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   13302         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   13303       switch (CC) {
   13304       default: break;
   13305       case ISD::SETULT:
   13306         // Converting this to a min would handle NaNs incorrectly, and swapping
   13307         // the operands would cause it to handle comparisons between positive
   13308         // and negative zero incorrectly.
   13309         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   13310           if (!DAG.getTarget().Options.UnsafeFPMath &&
   13311               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   13312             break;
   13313           std::swap(LHS, RHS);
   13314         }
   13315         Opcode = X86ISD::FMIN;
   13316         break;
   13317       case ISD::SETOLE:
   13318         // Converting this to a min would handle comparisons between positive
   13319         // and negative zero incorrectly.
   13320         if (!DAG.getTarget().Options.UnsafeFPMath &&
   13321             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   13322           break;
   13323         Opcode = X86ISD::FMIN;
   13324         break;
   13325       case ISD::SETULE:
   13326         // Converting this to a min would handle both negative zeros and NaNs
   13327         // incorrectly, but we can swap the operands to fix both.
   13328         std::swap(LHS, RHS);
   13329       case ISD::SETOLT:
   13330       case ISD::SETLT:
   13331       case ISD::SETLE:
   13332         Opcode = X86ISD::FMIN;
   13333         break;
   13334 
   13335       case ISD::SETOGE:
   13336         // Converting this to a max would handle comparisons between positive
   13337         // and negative zero incorrectly.
   13338         if (!DAG.getTarget().Options.UnsafeFPMath &&
   13339             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   13340           break;
   13341         Opcode = X86ISD::FMAX;
   13342         break;
   13343       case ISD::SETUGT:
   13344         // Converting this to a max would handle NaNs incorrectly, and swapping
   13345         // the operands would cause it to handle comparisons between positive
   13346         // and negative zero incorrectly.
   13347         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   13348           if (!DAG.getTarget().Options.UnsafeFPMath &&
   13349               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   13350             break;
   13351           std::swap(LHS, RHS);
   13352         }
   13353         Opcode = X86ISD::FMAX;
   13354         break;
   13355       case ISD::SETUGE:
   13356         // Converting this to a max would handle both negative zeros and NaNs
   13357         // incorrectly, but we can swap the operands to fix both.
   13358         std::swap(LHS, RHS);
   13359       case ISD::SETOGT:
   13360       case ISD::SETGT:
   13361       case ISD::SETGE:
   13362         Opcode = X86ISD::FMAX;
   13363         break;
   13364       }
   13365     // Check for x CC y ? y : x -- a min/max with reversed arms.
   13366     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
   13367                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
   13368       switch (CC) {
   13369       default: break;
   13370       case ISD::SETOGE:
   13371         // Converting this to a min would handle comparisons between positive
   13372         // and negative zero incorrectly, and swapping the operands would
   13373         // cause it to handle NaNs incorrectly.
   13374         if (!DAG.getTarget().Options.UnsafeFPMath &&
   13375             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
   13376           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   13377             break;
   13378           std::swap(LHS, RHS);
   13379         }
   13380         Opcode = X86ISD::FMIN;
   13381         break;
   13382       case ISD::SETUGT:
   13383         // Converting this to a min would handle NaNs incorrectly.
   13384         if (!DAG.getTarget().Options.UnsafeFPMath &&
   13385             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
   13386           break;
   13387         Opcode = X86ISD::FMIN;
   13388         break;
   13389       case ISD::SETUGE:
   13390         // Converting this to a min would handle both negative zeros and NaNs
   13391         // incorrectly, but we can swap the operands to fix both.
   13392         std::swap(LHS, RHS);
   13393       case ISD::SETOGT:
   13394       case ISD::SETGT:
   13395       case ISD::SETGE:
   13396         Opcode = X86ISD::FMIN;
   13397         break;
   13398 
   13399       case ISD::SETULT:
   13400         // Converting this to a max would handle NaNs incorrectly.
   13401         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   13402           break;
   13403         Opcode = X86ISD::FMAX;
   13404         break;
   13405       case ISD::SETOLE:
   13406         // Converting this to a max would handle comparisons between positive
   13407         // and negative zero incorrectly, and swapping the operands would
   13408         // cause it to handle NaNs incorrectly.
   13409         if (!DAG.getTarget().Options.UnsafeFPMath &&
   13410             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
   13411           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   13412             break;
   13413           std::swap(LHS, RHS);
   13414         }
   13415         Opcode = X86ISD::FMAX;
   13416         break;
   13417       case ISD::SETULE:
   13418         // Converting this to a max would handle both negative zeros and NaNs
   13419         // incorrectly, but we can swap the operands to fix both.
   13420         std::swap(LHS, RHS);
   13421       case ISD::SETOLT:
   13422       case ISD::SETLT:
   13423       case ISD::SETLE:
   13424         Opcode = X86ISD::FMAX;
   13425         break;
   13426       }
   13427     }
   13428 
   13429     if (Opcode)
   13430       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   13431   }
   13432 
   13433   // If this is a select between two integer constants, try to do some
   13434   // optimizations.
   13435   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
   13436     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
    13437       // Don't do this for integer types that aren't legal.
   13438       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
   13439         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
   13440         // so that TrueC (the true value) is larger than FalseC.
   13441         bool NeedsCondInvert = false;
   13442 
   13443         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
   13444             // Efficiently invertible.
   13445             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
   13446              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
   13447               isa<ConstantSDNode>(Cond.getOperand(1))))) {
   13448           NeedsCondInvert = true;
   13449           std::swap(TrueC, FalseC);
   13450         }
   13451 
   13452         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
   13453         if (FalseC->getAPIntValue() == 0 &&
   13454             TrueC->getAPIntValue().isPowerOf2()) {
   13455           if (NeedsCondInvert) // Invert the condition if needed.
   13456             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   13457                                DAG.getConstant(1, Cond.getValueType()));
   13458 
   13459           // Zero extend the condition if needed.
   13460           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
   13461 
   13462           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   13463           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
   13464                              DAG.getConstant(ShAmt, MVT::i8));
   13465         }
   13466 
    13467         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
   13468         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   13469           if (NeedsCondInvert) // Invert the condition if needed.
   13470             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   13471                                DAG.getConstant(1, Cond.getValueType()));
   13472 
   13473           // Zero extend the condition if needed.
   13474           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   13475                              FalseC->getValueType(0), Cond);
   13476           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   13477                              SDValue(FalseC, 0));
   13478         }
   13479 
   13480         // Optimize cases that will turn into an LEA instruction.  This requires
   13481         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   13482         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   13483           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   13484           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   13485 
   13486           bool isFastMultiplier = false;
   13487           if (Diff < 10) {
   13488             switch ((unsigned char)Diff) {
   13489               default: break;
   13490               case 1:  // result = add base, cond
   13491               case 2:  // result = lea base(    , cond*2)
   13492               case 3:  // result = lea base(cond, cond*2)
   13493               case 4:  // result = lea base(    , cond*4)
   13494               case 5:  // result = lea base(cond, cond*4)
   13495               case 8:  // result = lea base(    , cond*8)
   13496               case 9:  // result = lea base(cond, cond*8)
   13497                 isFastMultiplier = true;
   13498                 break;
   13499             }
   13500           }
   13501 
   13502           if (isFastMultiplier) {
   13503             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   13504             if (NeedsCondInvert) // Invert the condition if needed.
   13505               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   13506                                  DAG.getConstant(1, Cond.getValueType()));
   13507 
   13508             // Zero extend the condition if needed.
   13509             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   13510                                Cond);
   13511             // Scale the condition by the difference.
   13512             if (Diff != 1)
   13513               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   13514                                  DAG.getConstant(Diff, Cond.getValueType()));
   13515 
   13516             // Add the base if non-zero.
   13517             if (FalseC->getAPIntValue() != 0)
   13518               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   13519                                  SDValue(FalseC, 0));
   13520             return Cond;
   13521           }
   13522         }
   13523       }
   13524   }
   13525 
   13526   // Canonicalize max and min:
   13527   // (x > y) ? x : y -> (x >= y) ? x : y
   13528   // (x < y) ? x : y -> (x <= y) ? x : y
   13529   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
   13530   // the need for an extra compare
   13531   // against zero. e.g.
    13532   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
   13533   // subl   %esi, %edi
   13534   // testl  %edi, %edi
   13535   // movl   $0, %eax
   13536   // cmovgl %edi, %eax
   13537   // =>
   13538   // xorl   %eax, %eax
    13539   // subl   %esi, %edi
   13540   // cmovsl %eax, %edi
   13541   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
   13542       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   13543       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   13544     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   13545     switch (CC) {
   13546     default: break;
   13547     case ISD::SETLT:
   13548     case ISD::SETGT: {
   13549       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
   13550       Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
   13551                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
   13552       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
   13553     }
   13554     }
   13555   }
   13556 
   13557   // If we know that this node is legal then we know that it is going to be
   13558   // matched by one of the SSE/AVX BLEND instructions. These instructions only
   13559   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
   13560   // to simplify previous instructions.
   13561   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13562   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
   13563       !DCI.isBeforeLegalize() &&
   13564       TLI.isOperationLegal(ISD::VSELECT, VT)) {
   13565     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
   13566     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
   13567     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
   13568 
   13569     APInt KnownZero, KnownOne;
   13570     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
   13571                                           DCI.isBeforeLegalizeOps());
   13572     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
   13573         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
   13574       DCI.CommitTargetLoweringOpt(TLO);
   13575   }
   13576 
   13577   return SDValue();
   13578 }
   13579 
   13580 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
   13581 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   13582                                   TargetLowering::DAGCombinerInfo &DCI) {
   13583   DebugLoc DL = N->getDebugLoc();
   13584 
   13585   // If the flag operand isn't dead, don't touch this CMOV.
   13586   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
   13587     return SDValue();
   13588 
   13589   SDValue FalseOp = N->getOperand(0);
   13590   SDValue TrueOp = N->getOperand(1);
   13591   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
   13592   SDValue Cond = N->getOperand(3);
   13593   if (CC == X86::COND_E || CC == X86::COND_NE) {
   13594     switch (Cond.getOpcode()) {
   13595     default: break;
   13596     case X86ISD::BSR:
   13597     case X86ISD::BSF:
    13598       // If the operand of BSR / BSF is proven never zero, ZF cannot be set.
   13599       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
   13600         return (CC == X86::COND_E) ? FalseOp : TrueOp;
   13601     }
   13602   }
   13603 
   13604   // If this is a select between two integer constants, try to do some
   13605   // optimizations.  Note that the operands are ordered the opposite of SELECT
   13606   // operands.
   13607   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
   13608     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
   13609       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
   13610       // larger than FalseC (the false value).
   13611       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
   13612         CC = X86::GetOppositeBranchCondition(CC);
   13613         std::swap(TrueC, FalseC);
   13614       }
   13615 
   13616       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
   13617       // This is efficient for any integer data type (including i8/i16) and
   13618       // shift amount.
   13619       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
   13620         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   13621                            DAG.getConstant(CC, MVT::i8), Cond);
   13622 
   13623         // Zero extend the condition if needed.
   13624         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
   13625 
   13626         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   13627         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
   13628                            DAG.getConstant(ShAmt, MVT::i8));
   13629         if (N->getNumValues() == 2)  // Dead flag value?
   13630           return DCI.CombineTo(N, Cond, SDValue());
   13631         return Cond;
   13632       }
   13633 
    13634       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
   13635       // for any integer data type, including i8/i16.
   13636       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   13637         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   13638                            DAG.getConstant(CC, MVT::i8), Cond);
   13639 
   13640         // Zero extend the condition if needed.
   13641         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   13642                            FalseC->getValueType(0), Cond);
   13643         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   13644                            SDValue(FalseC, 0));
   13645 
   13646         if (N->getNumValues() == 2)  // Dead flag value?
   13647           return DCI.CombineTo(N, Cond, SDValue());
   13648         return Cond;
   13649       }
   13650 
   13651       // Optimize cases that will turn into an LEA instruction.  This requires
   13652       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   13653       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   13654         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   13655         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   13656 
   13657         bool isFastMultiplier = false;
   13658         if (Diff < 10) {
   13659           switch ((unsigned char)Diff) {
   13660           default: break;
   13661           case 1:  // result = add base, cond
   13662           case 2:  // result = lea base(    , cond*2)
   13663           case 3:  // result = lea base(cond, cond*2)
   13664           case 4:  // result = lea base(    , cond*4)
   13665           case 5:  // result = lea base(cond, cond*4)
   13666           case 8:  // result = lea base(    , cond*8)
   13667           case 9:  // result = lea base(cond, cond*8)
   13668             isFastMultiplier = true;
   13669             break;
   13670           }
   13671         }
   13672 
   13673         if (isFastMultiplier) {
   13674           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   13675           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   13676                              DAG.getConstant(CC, MVT::i8), Cond);
   13677           // Zero extend the condition if needed.
   13678           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   13679                              Cond);
   13680           // Scale the condition by the difference.
   13681           if (Diff != 1)
   13682             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   13683                                DAG.getConstant(Diff, Cond.getValueType()));
   13684 
   13685           // Add the base if non-zero.
   13686           if (FalseC->getAPIntValue() != 0)
   13687             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   13688                                SDValue(FalseC, 0));
   13689           if (N->getNumValues() == 2)  // Dead flag value?
   13690             return DCI.CombineTo(N, Cond, SDValue());
   13691           return Cond;
   13692         }
   13693       }
   13694     }
   13695   }
   13696   return SDValue();
   13697 }
   13698 
   13699 
    13700 /// PerformMulCombine - Optimize a single multiply by a constant into two
    13701 /// multiplies in order to implement it with two cheaper instructions, e.g.
    13702 /// LEA + SHL, LEA + LEA.
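          /// For example, a multiply by 24 becomes a multiply by 3 (an LEA) and a
          /// shift left by 3, rather than a single, more expensive multiply.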
   13703 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
   13704                                  TargetLowering::DAGCombinerInfo &DCI) {
   13705   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
   13706     return SDValue();
   13707 
   13708   EVT VT = N->getValueType(0);
   13709   if (VT != MVT::i64)
   13710     return SDValue();
   13711 
   13712   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   13713   if (!C)
   13714     return SDValue();
   13715   uint64_t MulAmt = C->getZExtValue();
   13716   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
   13717     return SDValue();
   13718 
   13719   uint64_t MulAmt1 = 0;
   13720   uint64_t MulAmt2 = 0;
   13721   if ((MulAmt % 9) == 0) {
   13722     MulAmt1 = 9;
   13723     MulAmt2 = MulAmt / 9;
   13724   } else if ((MulAmt % 5) == 0) {
   13725     MulAmt1 = 5;
   13726     MulAmt2 = MulAmt / 5;
   13727   } else if ((MulAmt % 3) == 0) {
   13728     MulAmt1 = 3;
   13729     MulAmt2 = MulAmt / 3;
   13730   }
   13731   if (MulAmt2 &&
   13732       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
   13733     DebugLoc DL = N->getDebugLoc();
   13734 
   13735     if (isPowerOf2_64(MulAmt2) &&
   13736         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
    13737       // If the second multiplier is pow2, issue it first. We want the multiply by
   13738       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
   13739       // is an add.
   13740       std::swap(MulAmt1, MulAmt2);
   13741 
   13742     SDValue NewMul;
   13743     if (isPowerOf2_64(MulAmt1))
   13744       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   13745                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
   13746     else
   13747       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   13748                            DAG.getConstant(MulAmt1, VT));
   13749 
   13750     if (isPowerOf2_64(MulAmt2))
   13751       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
   13752                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
   13753     else
   13754       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
   13755                            DAG.getConstant(MulAmt2, VT));
   13756 
   13757     // Do not add new nodes to DAG combiner worklist.
   13758     DCI.CombineTo(N, NewMul, false);
   13759   }
   13760   return SDValue();
   13761 }
   13762 
   13763 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
   13764   SDValue N0 = N->getOperand(0);
   13765   SDValue N1 = N->getOperand(1);
   13766   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   13767   EVT VT = N0.getValueType();
   13768 
   13769   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
    13770   // since the result of setcc_c is all zeros or all ones.
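            // Because setcc_c is all zeros or all ones, (setcc_c & c1) << c2 equals
            // setcc_c & (c1 << c2), so the shift can be folded into the mask constant.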
   13771   if (VT.isInteger() && !VT.isVector() &&
   13772       N1C && N0.getOpcode() == ISD::AND &&
   13773       N0.getOperand(1).getOpcode() == ISD::Constant) {
   13774     SDValue N00 = N0.getOperand(0);
   13775     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
   13776         ((N00.getOpcode() == ISD::ANY_EXTEND ||
   13777           N00.getOpcode() == ISD::ZERO_EXTEND) &&
   13778          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
   13779       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
   13780       APInt ShAmt = N1C->getAPIntValue();
   13781       Mask = Mask.shl(ShAmt);
   13782       if (Mask != 0)
   13783         return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
   13784                            N00, DAG.getConstant(Mask, VT));
   13785     }
   13786   }
   13787 
   13788 
    13789   // Hardware support for vector shifts is sparse, which makes us scalarize
    13790   // the vector operations in many cases. Also, on Sandy Bridge ADD is faster
    13791   // than SHL.
   13792   // (shl V, 1) -> add V,V
   13793   if (isSplatVector(N1.getNode())) {
   13794     assert(N0.getValueType().isVector() && "Invalid vector shift type");
   13795     ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
   13796     // We shift all of the values by one. In many cases we do not have
   13797     // hardware support for this operation. This is better expressed as an ADD
   13798     // of two values.
   13799     if (N1C && (1 == N1C->getZExtValue())) {
   13800       return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
   13801     }
   13802   }
   13803 
   13804   return SDValue();
   13805 }
   13806 
   13807 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
   13808 ///                       when possible.
   13809 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   13810                                    TargetLowering::DAGCombinerInfo &DCI,
   13811                                    const X86Subtarget *Subtarget) {
   13812   EVT VT = N->getValueType(0);
   13813   if (N->getOpcode() == ISD::SHL) {
   13814     SDValue V = PerformSHLCombine(N, DAG);
   13815     if (V.getNode()) return V;
   13816   }
   13817 
   13818   // On X86 with SSE2 support, we can transform this to a vector shift if
   13819   // all elements are shifted by the same amount.  We can't do this in legalize
    13820   // because a constant vector is typically transformed into a constant pool
   13821   // so we have no knowledge of the shift amount.
   13822   if (!Subtarget->hasSSE2())
   13823     return SDValue();
   13824 
   13825   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
   13826       (!Subtarget->hasAVX2() ||
   13827        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
   13828     return SDValue();
   13829 
   13830   SDValue ShAmtOp = N->getOperand(1);
   13831   EVT EltVT = VT.getVectorElementType();
   13832   DebugLoc DL = N->getDebugLoc();
   13833   SDValue BaseShAmt = SDValue();
   13834   if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
   13835     unsigned NumElts = VT.getVectorNumElements();
   13836     unsigned i = 0;
   13837     for (; i != NumElts; ++i) {
   13838       SDValue Arg = ShAmtOp.getOperand(i);
   13839       if (Arg.getOpcode() == ISD::UNDEF) continue;
   13840       BaseShAmt = Arg;
   13841       break;
   13842     }
   13843     // Handle the case where the build_vector is all undef
   13844     // FIXME: Should DAG allow this?
   13845     if (i == NumElts)
   13846       return SDValue();
   13847 
   13848     for (; i != NumElts; ++i) {
   13849       SDValue Arg = ShAmtOp.getOperand(i);
   13850       if (Arg.getOpcode() == ISD::UNDEF) continue;
   13851       if (Arg != BaseShAmt) {
   13852         return SDValue();
   13853       }
   13854     }
   13855   } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
   13856              cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
   13857     SDValue InVec = ShAmtOp.getOperand(0);
   13858     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   13859       unsigned NumElts = InVec.getValueType().getVectorNumElements();
   13860       unsigned i = 0;
   13861       for (; i != NumElts; ++i) {
   13862         SDValue Arg = InVec.getOperand(i);
   13863         if (Arg.getOpcode() == ISD::UNDEF) continue;
   13864         BaseShAmt = Arg;
   13865         break;
   13866       }
   13867     } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
   13868        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
   13869          unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
   13870          if (C->getZExtValue() == SplatIdx)
   13871            BaseShAmt = InVec.getOperand(1);
   13872        }
   13873     }
   13874     if (BaseShAmt.getNode() == 0) {
    13875       // Don't create instructions with illegal types after type
    13876       // legalization has run.
   13877       if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
   13878           !DCI.isBeforeLegalize())
   13879         return SDValue();
   13880 
   13881       BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
   13882                               DAG.getIntPtrConstant(0));
   13883     }
   13884   } else
   13885     return SDValue();
   13886 
   13887   // The shift amount is an i32.
   13888   if (EltVT.bitsGT(MVT::i32))
   13889     BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
   13890   else if (EltVT.bitsLT(MVT::i32))
   13891     BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
   13892 
   13893   // The shift amount is identical so we can do a vector shift.
   13894   SDValue  ValOp = N->getOperand(0);
   13895   switch (N->getOpcode()) {
   13896   default:
   13897     llvm_unreachable("Unknown shift opcode!");
   13898   case ISD::SHL:
   13899     switch (VT.getSimpleVT().SimpleTy) {
   13900     default: return SDValue();
   13901     case MVT::v2i64:
   13902     case MVT::v4i32:
   13903     case MVT::v8i16:
   13904     case MVT::v4i64:
   13905     case MVT::v8i32:
   13906     case MVT::v16i16:
   13907       return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
   13908     }
   13909   case ISD::SRA:
   13910     switch (VT.getSimpleVT().SimpleTy) {
   13911     default: return SDValue();
   13912     case MVT::v4i32:
   13913     case MVT::v8i16:
   13914     case MVT::v8i32:
   13915     case MVT::v16i16:
   13916       return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
   13917     }
   13918   case ISD::SRL:
   13919     switch (VT.getSimpleVT().SimpleTy) {
   13920     default: return SDValue();
   13921     case MVT::v2i64:
   13922     case MVT::v4i32:
   13923     case MVT::v8i16:
   13924     case MVT::v4i64:
   13925     case MVT::v8i32:
   13926     case MVT::v16i16:
   13927       return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
   13928     }
   13929   }
   13930 }
   13931 
   13932 
   13933 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
   13934 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
   13935 // and friends.  Likewise for OR -> CMPNEQSS.
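          // For example, with an f32 compare, (and (setcc COND_E), (setcc COND_NP))
          // becomes an FSETCCss (CMPEQSS), a bitcast to i32, an AND with 1, and a
          // truncate to i8.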
   13936 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
   13937                             TargetLowering::DAGCombinerInfo &DCI,
   13938                             const X86Subtarget *Subtarget) {
   13939   unsigned opcode;
   13940 
   13941   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
   13942   // we're requiring SSE2 for both.
   13943   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
   13944     SDValue N0 = N->getOperand(0);
   13945     SDValue N1 = N->getOperand(1);
   13946     SDValue CMP0 = N0->getOperand(1);
   13947     SDValue CMP1 = N1->getOperand(1);
   13948     DebugLoc DL = N->getDebugLoc();
   13949 
   13950     // The SETCCs should both refer to the same CMP.
   13951     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
   13952       return SDValue();
   13953 
   13954     SDValue CMP00 = CMP0->getOperand(0);
   13955     SDValue CMP01 = CMP0->getOperand(1);
   13956     EVT     VT    = CMP00.getValueType();
   13957 
   13958     if (VT == MVT::f32 || VT == MVT::f64) {
   13959       bool ExpectingFlags = false;
   13960       // Check for any users that want flags:
   13961       for (SDNode::use_iterator UI = N->use_begin(),
   13962              UE = N->use_end();
   13963            !ExpectingFlags && UI != UE; ++UI)
   13964         switch (UI->getOpcode()) {
   13965         default:
   13966         case ISD::BR_CC:
   13967         case ISD::BRCOND:
   13968         case ISD::SELECT:
   13969           ExpectingFlags = true;
   13970           break;
   13971         case ISD::CopyToReg:
   13972         case ISD::SIGN_EXTEND:
   13973         case ISD::ZERO_EXTEND:
   13974         case ISD::ANY_EXTEND:
   13975           break;
   13976         }
   13977 
   13978       if (!ExpectingFlags) {
   13979         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
   13980         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
   13981 
   13982         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
   13983           X86::CondCode tmp = cc0;
   13984           cc0 = cc1;
   13985           cc1 = tmp;
   13986         }
   13987 
   13988         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
   13989             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
   13990           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
   13991           X86ISD::NodeType NTOperator = is64BitFP ?
   13992             X86ISD::FSETCCsd : X86ISD::FSETCCss;
   13993           // FIXME: need symbolic constants for these magic numbers.
   13994           // See X86ATTInstPrinter.cpp:printSSECC().
   13995           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
   13996           SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
   13997                                               DAG.getConstant(x86cc, MVT::i8));
   13998           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
   13999                                               OnesOrZeroesF);
   14000           SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
   14001                                       DAG.getConstant(1, MVT::i32));
   14002           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
   14003           return OneBitOfTruth;
   14004         }
   14005       }
   14006     }
   14007   }
   14008   return SDValue();
   14009 }
   14010 
    14011 /// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones
    14012 /// vector so it can be folded inside ANDNP.
   14013 static bool CanFoldXORWithAllOnes(const SDNode *N) {
   14014   EVT VT = N->getValueType(0);
   14015 
   14016   // Match direct AllOnes for 128 and 256-bit vectors
   14017   if (ISD::isBuildVectorAllOnes(N))
   14018     return true;
   14019 
   14020   // Look through a bit convert.
   14021   if (N->getOpcode() == ISD::BITCAST)
   14022     N = N->getOperand(0).getNode();
   14023 
    14024   // Sometimes the operand may come from an insert_subvector building a
    14025   // 256-bit all-ones vector.
   14026   if (VT.getSizeInBits() == 256 &&
   14027       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
   14028     SDValue V1 = N->getOperand(0);
   14029     SDValue V2 = N->getOperand(1);
   14030 
   14031     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
   14032         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
   14033         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
   14034         ISD::isBuildVectorAllOnes(V2.getNode()))
   14035       return true;
   14036   }
   14037 
   14038   return false;
   14039 }
   14040 
   14041 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   14042                                  TargetLowering::DAGCombinerInfo &DCI,
   14043                                  const X86Subtarget *Subtarget) {
   14044   if (DCI.isBeforeLegalizeOps())
   14045     return SDValue();
   14046 
   14047   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
   14048   if (R.getNode())
   14049     return R;
   14050 
   14051   EVT VT = N->getValueType(0);
   14052 
   14053   // Create ANDN, BLSI, and BLSR instructions
   14054   // BLSI is X & (-X)
   14055   // BLSR is X & (X-1)
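            // BLSI isolates the lowest set bit of X; BLSR clears the lowest set bit.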
   14056   if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
   14057     SDValue N0 = N->getOperand(0);
   14058     SDValue N1 = N->getOperand(1);
   14059     DebugLoc DL = N->getDebugLoc();
   14060 
   14061     // Check LHS for not
   14062     if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
   14063       return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
   14064     // Check RHS for not
   14065     if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
   14066       return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
   14067 
   14068     // Check LHS for neg
   14069     if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
   14070         isZero(N0.getOperand(0)))
   14071       return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
   14072 
   14073     // Check RHS for neg
   14074     if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
   14075         isZero(N1.getOperand(0)))
   14076       return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
   14077 
   14078     // Check LHS for X-1
   14079     if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
   14080         isAllOnes(N0.getOperand(1)))
   14081       return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
   14082 
   14083     // Check RHS for X-1
   14084     if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
   14085         isAllOnes(N1.getOperand(1)))
   14086       return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
   14087 
   14088     return SDValue();
   14089   }
   14090 
   14091   // Want to form ANDNP nodes:
   14092   // 1) In the hopes of then easily combining them with OR and AND nodes
   14093   //    to form PBLEND/PSIGN.
   14094   // 2) To match ANDN packed intrinsics
   14095   if (VT != MVT::v2i64 && VT != MVT::v4i64)
   14096     return SDValue();
   14097 
   14098   SDValue N0 = N->getOperand(0);
   14099   SDValue N1 = N->getOperand(1);
   14100   DebugLoc DL = N->getDebugLoc();
   14101 
   14102   // Check LHS for vnot
   14103   if (N0.getOpcode() == ISD::XOR &&
   14104       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
   14105       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
   14106     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
   14107 
   14108   // Check RHS for vnot
   14109   if (N1.getOpcode() == ISD::XOR &&
   14110       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
   14111       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
   14112     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
   14113 
   14114   return SDValue();
   14115 }
   14116 
   14117 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   14118                                 TargetLowering::DAGCombinerInfo &DCI,
   14119                                 const X86Subtarget *Subtarget) {
   14120   if (DCI.isBeforeLegalizeOps())
   14121     return SDValue();
   14122 
   14123   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
   14124   if (R.getNode())
   14125     return R;
   14126 
   14127   EVT VT = N->getValueType(0);
   14128 
   14129   SDValue N0 = N->getOperand(0);
   14130   SDValue N1 = N->getOperand(1);
   14131 
   14132   // look for psign/blend
   14133   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
   14134     if (!Subtarget->hasSSSE3() ||
   14135         (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
   14136       return SDValue();
   14137 
   14138     // Canonicalize pandn to RHS
   14139     if (N0.getOpcode() == X86ISD::ANDNP)
   14140       std::swap(N0, N1);
   14141     // or (and (m, y), (pandn m, x))
   14142     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
   14143       SDValue Mask = N1.getOperand(0);
   14144       SDValue X    = N1.getOperand(1);
   14145       SDValue Y;
   14146       if (N0.getOperand(0) == Mask)
   14147         Y = N0.getOperand(1);
   14148       if (N0.getOperand(1) == Mask)
   14149         Y = N0.getOperand(0);
   14150 
    14151       // Check to see if the mask appeared in both the AND and the ANDNP.
   14152       if (!Y.getNode())
   14153         return SDValue();
   14154 
   14155       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
   14156       // Look through mask bitcast.
   14157       if (Mask.getOpcode() == ISD::BITCAST)
   14158         Mask = Mask.getOperand(0);
   14159       if (X.getOpcode() == ISD::BITCAST)
   14160         X = X.getOperand(0);
   14161       if (Y.getOpcode() == ISD::BITCAST)
   14162         Y = Y.getOperand(0);
   14163 
   14164       EVT MaskVT = Mask.getValueType();
   14165 
   14166       // Validate that the Mask operand is a vector sra node.
   14167       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
   14168       // there is no psrai.b
   14169       if (Mask.getOpcode() != X86ISD::VSRAI)
   14170         return SDValue();
   14171 
   14172       // Check that the SRA is all signbits.
   14173       SDValue SraC = Mask.getOperand(1);
   14174       unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
   14175       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
   14176       if ((SraAmt + 1) != EltBits)
   14177         return SDValue();
   14178 
   14179       DebugLoc DL = N->getDebugLoc();
   14180 
    14181       // Now we know we at least have a pblendvb with the mask val.  See if
   14182       // we can form a psignb/w/d.
   14183       // psign = x.type == y.type == mask.type && y = sub(0, x);
   14184       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
   14185           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
   14186           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
   14187         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
   14188                "Unsupported VT for PSIGN");
   14189         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
   14190         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
   14191       }
    14192       // PBLENDVB is only available on SSE 4.1.
   14193       if (!Subtarget->hasSSE41())
   14194         return SDValue();
   14195 
   14196       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
   14197 
   14198       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
   14199       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
   14200       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
   14201       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
   14202       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
   14203     }
   14204   }
   14205 
   14206   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
   14207     return SDValue();
   14208 
   14209   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
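            // The same rewrite applies when both shift amounts are constants that sum
            // to the bit width; that case is handled at the end of this function.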
   14210   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
   14211     std::swap(N0, N1);
   14212   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
   14213     return SDValue();
   14214   if (!N0.hasOneUse() || !N1.hasOneUse())
   14215     return SDValue();
   14216 
   14217   SDValue ShAmt0 = N0.getOperand(1);
   14218   if (ShAmt0.getValueType() != MVT::i8)
   14219     return SDValue();
   14220   SDValue ShAmt1 = N1.getOperand(1);
   14221   if (ShAmt1.getValueType() != MVT::i8)
   14222     return SDValue();
   14223   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
   14224     ShAmt0 = ShAmt0.getOperand(0);
   14225   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
   14226     ShAmt1 = ShAmt1.getOperand(0);
   14227 
   14228   DebugLoc DL = N->getDebugLoc();
   14229   unsigned Opc = X86ISD::SHLD;
   14230   SDValue Op0 = N0.getOperand(0);
   14231   SDValue Op1 = N1.getOperand(0);
   14232   if (ShAmt0.getOpcode() == ISD::SUB) {
   14233     Opc = X86ISD::SHRD;
   14234     std::swap(Op0, Op1);
   14235     std::swap(ShAmt0, ShAmt1);
   14236   }
   14237 
   14238   unsigned Bits = VT.getSizeInBits();
   14239   if (ShAmt1.getOpcode() == ISD::SUB) {
   14240     SDValue Sum = ShAmt1.getOperand(0);
   14241     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
   14242       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
   14243       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
   14244         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
   14245       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
   14246         return DAG.getNode(Opc, DL, VT,
   14247                            Op0, Op1,
   14248                            DAG.getNode(ISD::TRUNCATE, DL,
   14249                                        MVT::i8, ShAmt0));
   14250     }
   14251   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
   14252     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
   14253     if (ShAmt0C &&
   14254         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
   14255       return DAG.getNode(Opc, DL, VT,
   14256                          N0.getOperand(0), N1.getOperand(0),
   14257                          DAG.getNode(ISD::TRUNCATE, DL,
   14258                                        MVT::i8, ShAmt0));
   14259   }
   14260 
   14261   return SDValue();
   14262 }
   14263 
   14264 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
   14265 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
   14266                                  TargetLowering::DAGCombinerInfo &DCI,
   14267                                  const X86Subtarget *Subtarget) {
   14268   if (DCI.isBeforeLegalizeOps())
   14269     return SDValue();
   14270 
   14271   EVT VT = N->getValueType(0);
   14272 
   14273   if (VT != MVT::i32 && VT != MVT::i64)
   14274     return SDValue();
   14275 
   14276   assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
   14277 
   14278   // Create BLSMSK instructions by finding X ^ (X-1)
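            // BLSMSK produces a mask of all bits up to and including the lowest set
            // bit of X, which is exactly X ^ (X-1).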
   14279   SDValue N0 = N->getOperand(0);
   14280   SDValue N1 = N->getOperand(1);
   14281   DebugLoc DL = N->getDebugLoc();
   14282 
   14283   if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
   14284       isAllOnes(N0.getOperand(1)))
   14285     return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
   14286 
   14287   if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
   14288       isAllOnes(N1.getOperand(1)))
   14289     return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
   14290 
   14291   return SDValue();
   14292 }
   14293 
   14294 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
   14295 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   14296                                    const X86Subtarget *Subtarget) {
   14297   LoadSDNode *Ld = cast<LoadSDNode>(N);
   14298   EVT RegVT = Ld->getValueType(0);
   14299   EVT MemVT = Ld->getMemoryVT();
   14300   DebugLoc dl = Ld->getDebugLoc();
   14301   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   14302 
   14303   ISD::LoadExtType Ext = Ld->getExtensionType();
   14304 
   14305   // If this is a vector EXT Load then attempt to optimize it using a
   14306   // shuffle. We need SSE4 for the shuffles.
   14307   // TODO: It is possible to support ZExt by zeroing the undef values
   14308   // during the shuffle phase or after the shuffle.
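            // For example, a v4i8 extload used as v4i32 becomes one scalar i32 load,
            // a SCALAR_TO_VECTOR insert, and a byte shuffle that spreads the four
            // loaded bytes to byte offsets 0, 4, 8 and 12 of the wide vector.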
   14309   if (RegVT.isVector() && RegVT.isInteger() &&
   14310       Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
   14311     assert(MemVT != RegVT && "Cannot extend to the same type");
   14312     assert(MemVT.isVector() && "Must load a vector from memory");
   14313 
   14314     unsigned NumElems = RegVT.getVectorNumElements();
   14315     unsigned RegSz = RegVT.getSizeInBits();
   14316     unsigned MemSz = MemVT.getSizeInBits();
   14317     assert(RegSz > MemSz && "Register size must be greater than the mem size");
   14318     // All sizes must be a power of two
   14319     if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
   14320 
   14321     // Attempt to load the original value using a single load op.
   14322     // Find a scalar type which is equal to the loaded word size.
   14323     MVT SclrLoadTy = MVT::i8;
   14324     for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
   14325          tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
   14326       MVT Tp = (MVT::SimpleValueType)tp;
   14327       if (TLI.isTypeLegal(Tp) &&  Tp.getSizeInBits() == MemSz) {
   14328         SclrLoadTy = Tp;
   14329         break;
   14330       }
   14331     }
   14332 
   14333     // Proceed if a load word is found.
   14334     if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
   14335 
   14336     EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
   14337       RegSz/SclrLoadTy.getSizeInBits());
   14338 
   14339     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   14340                                   RegSz/MemVT.getScalarType().getSizeInBits());
   14341     // Can't shuffle using an illegal type.
   14342     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
   14343 
   14344     // Perform a single load.
   14345     SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
   14346                                   Ld->getBasePtr(),
   14347                                   Ld->getPointerInfo(), Ld->isVolatile(),
   14348                                   Ld->isNonTemporal(), Ld->isInvariant(),
   14349                                   Ld->getAlignment());
   14350 
   14351     // Insert the word loaded into a vector.
   14352     SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   14353       LoadUnitVecVT, ScalarLoad);
   14354 
   14355     // Bitcast the loaded value to a vector of the original element type, in
   14356     // the size of the target vector type.
   14357     SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
   14358                                     ScalarInVector);
   14359     unsigned SizeRatio = RegSz/MemSz;
   14360 
   14361     // Redistribute the loaded elements into the different locations.
   14362     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   14363     for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
   14364 
   14365     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
   14366                                 DAG.getUNDEF(SlicedVec.getValueType()),
   14367                                 ShuffleVec.data());
   14368 
   14369     // Bitcast to the requested type.
   14370     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
   14371     // Replace the original load with the new sequence
   14372     // and return the new chain.
   14373     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
   14374     return SDValue(ScalarLoad.getNode(), 1);
   14375   }
   14376 
   14377   return SDValue();
   14378 }
   14379 
   14380 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
   14381 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   14382                                    const X86Subtarget *Subtarget) {
   14383   StoreSDNode *St = cast<StoreSDNode>(N);
   14384   EVT VT = St->getValue().getValueType();
   14385   EVT StVT = St->getMemoryVT();
   14386   DebugLoc dl = St->getDebugLoc();
   14387   SDValue StoredVal = St->getOperand(1);
   14388   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   14389 
   14390   // If we are saving a concatenation of two XMM registers, perform two stores.
    14391   // This is better on Sandy Bridge because one 256-bit mem op is done via two
    14392   // 128-bit ones. If in the future a 256-bit store costs only a single memory
    14393   // access, the unsplit store would be the better choice.
   14394   if (VT.getSizeInBits() == 256 &&
   14395     StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
   14396     StoredVal.getNumOperands() == 2) {
   14397 
   14398     SDValue Value0 = StoredVal.getOperand(0);
   14399     SDValue Value1 = StoredVal.getOperand(1);
   14400 
   14401     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
   14402     SDValue Ptr0 = St->getBasePtr();
   14403     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
   14404 
   14405     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
   14406                                 St->getPointerInfo(), St->isVolatile(),
   14407                                 St->isNonTemporal(), St->getAlignment());
   14408     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
   14409                                 St->getPointerInfo(), St->isVolatile(),
   14410                                 St->isNonTemporal(), St->getAlignment());
   14411     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   14412   }
   14413 
   14414   // Optimize trunc store (of multiple scalars) to shuffle and store.
   14415   // First, pack all of the elements in one place. Next, store to memory
   14416   // in fewer chunks.
   14417   if (St->isTruncatingStore() && VT.isVector()) {
   14418     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   14419     unsigned NumElems = VT.getVectorNumElements();
   14420     assert(StVT != VT && "Cannot truncate to the same type");
   14421     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
   14422     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
   14423 
   14424     // From, To sizes and ElemCount must be pow of two
   14425     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
   14426     // We are going to use the original vector elt for storing.
   14427     // Accumulated smaller vector elements must be a multiple of the store size.
   14428     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
   14429 
   14430     unsigned SizeRatio  = FromSz / ToSz;
   14431 
   14432     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   14433 
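              // For example, a truncating store of v8i16 to v8i8 has FromSz = 16,
              // ToSz = 8 and SizeRatio = 2: the value is bitcast to v16i8, the shuffle
              // mask <0,2,4,6,8,10,12,14, -1, ...> packs the eight low (truncated)
              // bytes into the first 64 bits, and on a 64-bit target a single i64
              // store then writes them out.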
   14434     // Create a type on which we perform the shuffle
   14435     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   14436             StVT.getScalarType(), NumElems*SizeRatio);
   14437 
   14438     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   14439 
   14440     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
   14441     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   14442     for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
   14443 
   14444     // Can't shuffle using an illegal type
   14445     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
   14446 
   14447     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   14448                                 DAG.getUNDEF(WideVec.getValueType()),
   14449                                 ShuffleVec.data());
   14450     // At this point all of the data is stored at the bottom of the
   14451     // register. We now need to save it to mem.
   14452 
   14453     // Find the largest store unit
   14454     MVT StoreType = MVT::i8;
   14455     for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
   14456          tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
   14457       MVT Tp = (MVT::SimpleValueType)tp;
   14458       if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
   14459         StoreType = Tp;
   14460     }
   14461 
   14462     // Bitcast the original vector into a vector of store-size units
   14463     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
   14464             StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
   14465     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
   14466     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
   14467     SmallVector<SDValue, 8> Chains;
   14468     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
   14469                                         TLI.getPointerTy());
   14470     SDValue Ptr = St->getBasePtr();
   14471 
   14472     // Perform one or more big stores into memory.
   14473     for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
   14474       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   14475                                    StoreType, ShuffWide,
   14476                                    DAG.getIntPtrConstant(i));
   14477       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
   14478                                 St->getPointerInfo(), St->isVolatile(),
   14479                                 St->isNonTemporal(), St->getAlignment());
   14480       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   14481       Chains.push_back(Ch);
   14482     }
   14483 
   14484     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
   14485                                Chains.size());
   14486   }
   14487 
   14488 
   14489   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   14490   // the FP state in cases where an emms may be missing.
   14491   // A preferable solution to the general problem is to figure out the right
   14492   // places to insert EMMS.  This qualifies as a quick hack.
   14493 
   14494   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
   14495   if (VT.getSizeInBits() != 64)
   14496     return SDValue();
   14497 
   14498   const Function *F = DAG.getMachineFunction().getFunction();
   14499   bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
   14500   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
   14501                      && Subtarget->hasSSE2();
   14502   if ((VT.isVector() ||
   14503        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
   14504       isa<LoadSDNode>(St->getValue()) &&
   14505       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
   14506       St->getChain().hasOneUse() && !St->isVolatile()) {
   14507     SDNode* LdVal = St->getValue().getNode();
   14508     LoadSDNode *Ld = 0;
   14509     int TokenFactorIndex = -1;
   14510     SmallVector<SDValue, 8> Ops;
   14511     SDNode* ChainVal = St->getChain().getNode();
   14512     // Must be a store of a load.  We currently handle two cases:  the load
   14513     // is a direct child, and it's under an intervening TokenFactor.  It is
   14514     // possible to dig deeper under nested TokenFactors.
   14515     if (ChainVal == LdVal)
   14516       Ld = cast<LoadSDNode>(St->getChain());
   14517     else if (St->getValue().hasOneUse() &&
   14518              ChainVal->getOpcode() == ISD::TokenFactor) {
   14519       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
   14520         if (ChainVal->getOperand(i).getNode() == LdVal) {
   14521           TokenFactorIndex = i;
   14522           Ld = cast<LoadSDNode>(St->getValue());
   14523         } else
   14524           Ops.push_back(ChainVal->getOperand(i));
   14525       }
   14526     }
   14527 
   14528     if (!Ld || !ISD::isNormalLoad(Ld))
   14529       return SDValue();
   14530 
   14531     // If this is not the MMX case, i.e. we are just turning i64 load/store
   14532     // into f64 load/store, avoid the transformation if there are multiple
   14533     // uses of the loaded value.
   14534     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
   14535       return SDValue();
   14536 
   14537     DebugLoc LdDL = Ld->getDebugLoc();
   14538     DebugLoc StDL = N->getDebugLoc();
   14539     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
   14540     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
   14541     // pair instead.
   14542     if (Subtarget->is64Bit() || F64IsLegal) {
   14543       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
   14544       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
   14545                                   Ld->getPointerInfo(), Ld->isVolatile(),
   14546                                   Ld->isNonTemporal(), Ld->isInvariant(),
   14547                                   Ld->getAlignment());
   14548       SDValue NewChain = NewLd.getValue(1);
   14549       if (TokenFactorIndex != -1) {
   14550         Ops.push_back(NewChain);
   14551         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
   14552                                Ops.size());
   14553       }
   14554       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
   14555                           St->getPointerInfo(),
   14556                           St->isVolatile(), St->isNonTemporal(),
   14557                           St->getAlignment());
   14558     }
   14559 
   14560     // Otherwise, lower to two pairs of 32-bit loads / stores.
   14561     SDValue LoAddr = Ld->getBasePtr();
   14562     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
   14563                                  DAG.getConstant(4, MVT::i32));
   14564 
   14565     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
   14566                                Ld->getPointerInfo(),
   14567                                Ld->isVolatile(), Ld->isNonTemporal(),
   14568                                Ld->isInvariant(), Ld->getAlignment());
   14569     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
   14570                                Ld->getPointerInfo().getWithOffset(4),
   14571                                Ld->isVolatile(), Ld->isNonTemporal(),
   14572                                Ld->isInvariant(),
   14573                                MinAlign(Ld->getAlignment(), 4));
   14574 
   14575     SDValue NewChain = LoLd.getValue(1);
   14576     if (TokenFactorIndex != -1) {
   14577       Ops.push_back(LoLd);
   14578       Ops.push_back(HiLd);
   14579       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
   14580                              Ops.size());
   14581     }
   14582 
   14583     LoAddr = St->getBasePtr();
   14584     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
   14585                          DAG.getConstant(4, MVT::i32));
   14586 
   14587     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
   14588                                 St->getPointerInfo(),
   14589                                 St->isVolatile(), St->isNonTemporal(),
   14590                                 St->getAlignment());
   14591     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
   14592                                 St->getPointerInfo().getWithOffset(4),
   14593                                 St->isVolatile(),
   14594                                 St->isNonTemporal(),
   14595                                 MinAlign(St->getAlignment(), 4));
   14596     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   14597   }
   14598   return SDValue();
   14599 }
   14600 
   14601 /// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
   14602 /// and return the operands for the horizontal operation in LHS and RHS.  A
   14603 /// horizontal operation performs the binary operation on successive elements
   14604 /// of its first operand, then on successive elements of its second operand,
   14605 /// returning the resulting values in a vector.  For example, if
   14606 ///   A = < float a0, float a1, float a2, float a3 >
   14607 /// and
   14608 ///   B = < float b0, float b1, float b2, float b3 >
   14609 /// then the result of doing a horizontal operation on A and B is
   14610 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
   14611 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
   14612 /// A horizontal-op B, for some already available A and B, and if so then LHS is
   14613 /// set to A, RHS to B, and the routine returns 'true'.
   14614 /// Note that the binary operation should have the property that if one of the
   14615 /// operands is UNDEF then the result is UNDEF.
   14616 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   14617   // Look for the following pattern: if
   14618   //   A = < float a0, float a1, float a2, float a3 >
   14619   //   B = < float b0, float b1, float b2, float b3 >
   14620   // and
   14621   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
   14622   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
   14623   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   14624   // which is A horizontal-op B.
   14625 
   14626   // At least one of the operands should be a vector shuffle.
   14627   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
   14628       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
   14629     return false;
   14630 
   14631   EVT VT = LHS.getValueType();
   14632 
   14633   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   14634          "Unsupported vector type for horizontal add/sub");
   14635 
   14636   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
   14637   // operate independently on 128-bit lanes.
   14638   unsigned NumElts = VT.getVectorNumElements();
   14639   unsigned NumLanes = VT.getSizeInBits()/128;
   14640   unsigned NumLaneElts = NumElts / NumLanes;
   14641   assert((NumLaneElts % 2 == 0) &&
   14642          "Vector type should have an even number of elements in each lane");
   14643   unsigned HalfLaneElts = NumLaneElts/2;
   14644 
   14645   // View LHS in the form
   14646   //   LHS = VECTOR_SHUFFLE A, B, LMask
   14647   // If LHS is not a shuffle then pretend it is the shuffle
   14648   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
   14649   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
   14650   // type VT.
   14651   SDValue A, B;
   14652   SmallVector<int, 16> LMask(NumElts);
   14653   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   14654     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
   14655       A = LHS.getOperand(0);
   14656     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
   14657       B = LHS.getOperand(1);
   14658     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
   14659     std::copy(Mask.begin(), Mask.end(), LMask.begin());
   14660   } else {
   14661     if (LHS.getOpcode() != ISD::UNDEF)
   14662       A = LHS;
   14663     for (unsigned i = 0; i != NumElts; ++i)
   14664       LMask[i] = i;
   14665   }
   14666 
   14667   // Likewise, view RHS in the form
   14668   //   RHS = VECTOR_SHUFFLE C, D, RMask
   14669   SDValue C, D;
   14670   SmallVector<int, 16> RMask(NumElts);
   14671   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   14672     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
   14673       C = RHS.getOperand(0);
   14674     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
   14675       D = RHS.getOperand(1);
   14676     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
   14677     std::copy(Mask.begin(), Mask.end(), RMask.begin());
   14678   } else {
   14679     if (RHS.getOpcode() != ISD::UNDEF)
   14680       C = RHS;
   14681     for (unsigned i = 0; i != NumElts; ++i)
   14682       RMask[i] = i;
   14683   }
   14684 
   14685   // Check that the shuffles are both shuffling the same vectors.
   14686   if (!(A == C && B == D) && !(A == D && B == C))
   14687     return false;
   14688 
   14689   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
   14690   if (!A.getNode() && !B.getNode())
   14691     return false;
   14692 
   14693   // If A and B occur in reverse order in RHS, then "swap" them (which means
   14694   // rewriting the mask).
   14695   if (A != C)
   14696     CommuteVectorShuffleMask(RMask, NumElts);
   14697 
   14698   // At this point LHS and RHS are equivalent to
   14699   //   LHS = VECTOR_SHUFFLE A, B, LMask
   14700   //   RHS = VECTOR_SHUFFLE A, B, RMask
   14701   // Check that the masks correspond to performing a horizontal operation.
   14702   for (unsigned i = 0; i != NumElts; ++i) {
   14703     int LIdx = LMask[i], RIdx = RMask[i];
   14704 
   14705     // Ignore any UNDEF components.
   14706     if (LIdx < 0 || RIdx < 0 ||
   14707         (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
   14708         (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
   14709       continue;
   14710 
   14711     // Check that successive elements are being operated on.  If not, this is
   14712     // not a horizontal operation.
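              // For example, with v8f32 on AVX (NumLaneElts = 4, HalfLaneElts = 2) the
              // expected index pairs are (0,1) (2,3) (8,9) (10,11) for the low lane and
              // (4,5) (6,7) (12,13) (14,15) for the high lane, matching the per-lane
              // semantics of the horizontal add/sub instructions.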
   14713     unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
   14714     unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
   14715     int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
   14716     if (!(LIdx == Index && RIdx == Index + 1) &&
   14717         !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
   14718       return false;
   14719   }
   14720 
   14721   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   14722   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
   14723   return true;
   14724 }
   14725 
   14726 /// PerformFADDCombine - Do target-specific dag combines on floating point adds.
   14727 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
   14728                                   const X86Subtarget *Subtarget) {
   14729   EVT VT = N->getValueType(0);
   14730   SDValue LHS = N->getOperand(0);
   14731   SDValue RHS = N->getOperand(1);
   14732 
   14733   // Try to synthesize horizontal adds from adds of shuffles.
   14734   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   14735        (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   14736       isHorizontalBinOp(LHS, RHS, true))
   14737     return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
   14738   return SDValue();
   14739 }
   14740 
   14741 /// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
   14742 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
   14743                                   const X86Subtarget *Subtarget) {
   14744   EVT VT = N->getValueType(0);
   14745   SDValue LHS = N->getOperand(0);
   14746   SDValue RHS = N->getOperand(1);
   14747 
   14748   // Try to synthesize horizontal subs from subs of shuffles.
   14749   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   14750        (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   14751       isHorizontalBinOp(LHS, RHS, false))
   14752     return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
   14753   return SDValue();
   14754 }
   14755 
   14756 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
   14757 /// X86ISD::FXOR nodes.
   14758 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
   14759   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
   14760   // F[X]OR(0.0, x) -> x
   14761   // F[X]OR(x, 0.0) -> x
   14762   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   14763     if (C->getValueAPF().isPosZero())
   14764       return N->getOperand(1);
   14765   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   14766     if (C->getValueAPF().isPosZero())
   14767       return N->getOperand(0);
   14768   return SDValue();
   14769 }
   14770 
   14771 /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
   14772 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   14773   // FAND(0.0, x) -> 0.0
   14774   // FAND(x, 0.0) -> 0.0
   14775   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   14776     if (C->getValueAPF().isPosZero())
   14777       return N->getOperand(0);
   14778   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   14779     if (C->getValueAPF().isPosZero())
   14780       return N->getOperand(1);
   14781   return SDValue();
   14782 }
   14783 
   14784 static SDValue PerformBTCombine(SDNode *N,
   14785                                 SelectionDAG &DAG,
   14786                                 TargetLowering::DAGCombinerInfo &DCI) {
   14787   // BT ignores high bits in the bit index operand.
   14788   SDValue Op1 = N->getOperand(1);
   14789   if (Op1.hasOneUse()) {
   14790     unsigned BitWidth = Op1.getValueSizeInBits();
   14791     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
   14792     APInt KnownZero, KnownOne;
   14793     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   14794                                           !DCI.isBeforeLegalizeOps());
   14795     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   14796     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
   14797         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
   14798       DCI.CommitTargetLoweringOpt(TLO);
   14799   }
   14800   return SDValue();
   14801 }
   14802 
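          // PerformVZEXT_MOVLCombine - A VZEXT_MOVL applied (possibly through a bitcast)
          // to a VZEXT_LOAD is redundant when the element widths match, since the load
          // already zeroes the upper elements; fold the whole pattern to a bitcast.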
   14803 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
   14804   SDValue Op = N->getOperand(0);
   14805   if (Op.getOpcode() == ISD::BITCAST)
   14806     Op = Op.getOperand(0);
   14807   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
   14808   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
   14809       VT.getVectorElementType().getSizeInBits() ==
   14810       OpVT.getVectorElementType().getSizeInBits()) {
   14811     return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
   14812   }
   14813   return SDValue();
   14814 }
   14815 
   14816 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
   14817                                   TargetLowering::DAGCombinerInfo &DCI,
   14818                                   const X86Subtarget *Subtarget) {
   14819   if (!DCI.isBeforeLegalizeOps())
   14820     return SDValue();
   14821 
   14822   if (!Subtarget->hasAVX())
   14823     return SDValue();
   14824 
    14825   // Optimize vectors in AVX mode:
    14826   //   Sign extend v8i16 to v8i32 and
    14827   //               v4i32 to v4i64.
    14828   //
    14829   //   Divide the input vector into two parts; for v4i32 the shuffle masks
    14830   //   will be {0, 1, -1, -1} and {2, 3, -1, -1}.
    14831   //   Use a vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
    14832   //   then concat the halves back into the original VT.
   14833 
   14834   EVT VT = N->getValueType(0);
   14835   SDValue Op = N->getOperand(0);
   14836   EVT OpVT = Op.getValueType();
   14837   DebugLoc dl = N->getDebugLoc();
   14838 
   14839   if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
   14840       (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
   14841 
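              // For v8i16 -> v8i32 this produces two v8i16 shuffles with masks
              // <0,1,2,3,-1,-1,-1,-1> and <4,5,6,7,-1,-1,-1,-1>, sign extends each
              // half to v4i32 with a vpmovsxwd-style node, and concatenates the two
              // halves into the final v8i32.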
   14842     unsigned NumElems = OpVT.getVectorNumElements();
   14843     SmallVector<int,8> ShufMask1(NumElems, -1);
   14844     for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
   14845 
   14846     SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
   14847                                         ShufMask1.data());
   14848 
   14849     SmallVector<int,8> ShufMask2(NumElems, -1);
   14850     for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
   14851 
   14852     SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
   14853                                         ShufMask2.data());
   14854 
   14855     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
   14856                                   VT.getVectorNumElements()/2);
   14857 
   14858     OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
   14859     OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
   14860 
   14861     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   14862   }
   14863   return SDValue();
   14864 }
   14865 
   14866 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   14867                                   const X86Subtarget *Subtarget) {
   14868   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
   14869   //           (and (i32 x86isd::setcc_carry), 1)
   14870   // This eliminates the zext. This transformation is necessary because
   14871   // ISD::SETCC is always legalized to i8.
   14872   DebugLoc dl = N->getDebugLoc();
   14873   SDValue N0 = N->getOperand(0);
   14874   EVT VT = N->getValueType(0);
   14875   EVT OpVT = N0.getValueType();
   14876 
   14877   if (N0.getOpcode() == ISD::AND &&
   14878       N0.hasOneUse() &&
   14879       N0.getOperand(0).hasOneUse()) {
   14880     SDValue N00 = N0.getOperand(0);
   14881     if (N00.getOpcode() != X86ISD::SETCC_CARRY)
   14882       return SDValue();
   14883     ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   14884     if (!C || C->getZExtValue() != 1)
   14885       return SDValue();
   14886     return DAG.getNode(ISD::AND, dl, VT,
   14887                        DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   14888                                    N00.getOperand(0), N00.getOperand(1)),
   14889                        DAG.getConstant(1, VT));
   14890   }
   14891   // Optimize vectors in AVX mode:
   14892   //
   14893   //   v8i16 -> v8i32
   14894   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
   14895   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
   14896   //   Concat upper and lower parts.
   14897   //
   14898   //   v4i32 -> v4i64
   14899   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
   14900   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
   14901   //   Concat upper and lower parts.
   14902   //
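            // Unpacking the source with an all-zero vector interleaves a zero above
            // each source element, so each unpacked pair is precisely the zero-extended
            // value; the bitcasts below just reinterpret the lanes at the wider width.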
   14903   if (Subtarget->hasAVX()) {
   14904 
   14905     if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16))  ||
   14906       ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
   14907 
   14908       SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
   14909       SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG);
   14910       SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG);
   14911 
   14912       EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
   14913         VT.getVectorNumElements()/2);
   14914 
   14915       OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
   14916       OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
   14917 
   14918       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   14919     }
   14920   }
   14921 
   14922 
   14923   return SDValue();
   14924 }
   14925 
   14926 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
   14927 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   14928   unsigned X86CC = N->getConstantOperandVal(0);
   14929   SDValue EFLAG = N->getOperand(1);
   14930   DebugLoc DL = N->getDebugLoc();
   14931 
   14932   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
   14933   // a zext and produces an all-ones bit which is more useful than 0/1 in some
   14934   // cases.
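            // SETCC_CARRY materializes as "sbb reg,reg", which broadcasts the carry
            // flag into every bit of the register; the AND with 1 recovers the plain
            // 0/1 value that setb would have produced.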
   14935   if (X86CC == X86::COND_B)
   14936     return DAG.getNode(ISD::AND, DL, MVT::i8,
   14937                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   14938                                    DAG.getConstant(X86CC, MVT::i8), EFLAG),
   14939                        DAG.getConstant(1, MVT::i8));
   14940 
   14941   return SDValue();
   14942 }
   14943 
   14944 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   14945                                         const X86TargetLowering *XTLI) {
   14946   SDValue Op0 = N->getOperand(0);
   14947   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
   14948   // a 32-bit target where SSE doesn't support i64->FP operations.
   14949   if (Op0.getOpcode() == ISD::LOAD) {
   14950     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
   14951     EVT VT = Ld->getValueType(0);
   14952     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
   14953         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
   14954         !XTLI->getSubtarget()->is64Bit() &&
   14955         !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
   14956       SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
   14957                                           Ld->getChain(), Op0, DAG);
   14958       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
   14959       return FILDChain;
   14960     }
   14961   }
   14962   return SDValue();
   14963 }
   14964 
   14965 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
   14966 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
   14967                                  X86TargetLowering::DAGCombinerInfo &DCI) {
   14968   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
   14969   // the result is either zero or one (depending on the input carry bit).
   14970   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
   14971   if (X86::isZeroNode(N->getOperand(0)) &&
   14972       X86::isZeroNode(N->getOperand(1)) &&
   14973       // We don't have a good way to replace an EFLAGS use, so only do this when
   14974       // dead right now.
   14975       SDValue(N, 1).use_empty()) {
   14976     DebugLoc DL = N->getDebugLoc();
   14977     EVT VT = N->getValueType(0);
   14978     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
   14979     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
   14980                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   14981                                            DAG.getConstant(X86::COND_B,MVT::i8),
   14982                                            N->getOperand(2)),
   14983                                DAG.getConstant(1, VT));
   14984     return DCI.CombineTo(N, Res1, CarryOut);
   14985   }
   14986 
   14987   return SDValue();
   14988 }
   14989 
   14990 // fold (add Y, (sete  X, 0)) -> adc  0, Y
   14991 //      (add Y, (setne X, 0)) -> sbb -1, Y
   14992 //      (sub (sete  X, 0), Y) -> sbb  0, Y
   14993 //      (sub (setne X, 0), Y) -> adc -1, Y
   14994 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
   14995   DebugLoc DL = N->getDebugLoc();
   14996 
   14997   // Look through ZExts.
   14998   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
   14999   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
   15000     return SDValue();
   15001 
   15002   SDValue SetCC = Ext.getOperand(0);
   15003   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
   15004     return SDValue();
   15005 
   15006   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
   15007   if (CC != X86::COND_E && CC != X86::COND_NE)
   15008     return SDValue();
   15009 
   15010   SDValue Cmp = SetCC.getOperand(1);
   15011   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
   15012       !X86::isZeroNode(Cmp.getOperand(1)) ||
   15013       !Cmp.getOperand(0).getValueType().isInteger())
   15014     return SDValue();
   15015 
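            // Compare X against 1: the unsigned borrow (carry flag) is set exactly
            // when X == 0, so feeding this CMP into the ADC/SBB below reproduces the
            // value of the original sete/setne without materializing it separately.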
   15016   SDValue CmpOp0 = Cmp.getOperand(0);
   15017   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
   15018                                DAG.getConstant(1, CmpOp0.getValueType()));
   15019 
   15020   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
   15021   if (CC == X86::COND_NE)
   15022     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
   15023                        DL, OtherVal.getValueType(), OtherVal,
   15024                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
   15025   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
   15026                      DL, OtherVal.getValueType(), OtherVal,
   15027                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
   15028 }
   15029 
    15030 /// PerformAddCombine - Do target-specific dag combines on integer adds.
   15031 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
   15032                                  const X86Subtarget *Subtarget) {
   15033   EVT VT = N->getValueType(0);
   15034   SDValue Op0 = N->getOperand(0);
   15035   SDValue Op1 = N->getOperand(1);
   15036 
   15037   // Try to synthesize horizontal adds from adds of shuffles.
   15038   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   15039        (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   15040       isHorizontalBinOp(Op0, Op1, true))
   15041     return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
   15042 
   15043   return OptimizeConditionalInDecrement(N, DAG);
   15044 }
   15045 
   15046 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
   15047                                  const X86Subtarget *Subtarget) {
   15048   SDValue Op0 = N->getOperand(0);
   15049   SDValue Op1 = N->getOperand(1);
   15050 
   15051   // X86 can't encode an immediate LHS of a sub. See if we can push the
   15052   // negation into a preceding instruction.
   15053   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
   15054     // If the RHS of the sub is a XOR with one use and a constant, invert the
   15055     // immediate. Then add one to the LHS of the sub so we can turn
   15056     // X-Y -> X+~Y+1, saving one register.
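              // For instance, (5 - (x ^ 3)) becomes ((x ^ ~3) + 6), since x ^ ~3 is the
              // bitwise complement of x ^ 3 and therefore equals -(x ^ 3) - 1.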
   15057     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
   15058         isa<ConstantSDNode>(Op1.getOperand(1))) {
   15059       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
   15060       EVT VT = Op0.getValueType();
   15061       SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
   15062                                    Op1.getOperand(0),
   15063                                    DAG.getConstant(~XorC, VT));
   15064       return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
   15065                          DAG.getConstant(C->getAPIntValue()+1, VT));
   15066     }
   15067   }
   15068 
    15069   // Try to synthesize horizontal subs from subs of shuffles.
    15070   EVT VT = N->getValueType(0);
    15071   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
    15072        (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
    15073       isHorizontalBinOp(Op0, Op1, false))
   15074     return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
   15075 
   15076   return OptimizeConditionalInDecrement(N, DAG);
   15077 }
   15078 
   15079 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   15080                                              DAGCombinerInfo &DCI) const {
   15081   SelectionDAG &DAG = DCI.DAG;
   15082   switch (N->getOpcode()) {
   15083   default: break;
   15084   case ISD::EXTRACT_VECTOR_ELT:
   15085     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   15086   case ISD::VSELECT:
   15087   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
   15088   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
   15089   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   15090   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
   15091   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
   15092   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
   15093   case ISD::SHL:
   15094   case ISD::SRA:
   15095   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
   15096   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   15097   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   15098   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
   15099   case ISD::LOAD:           return PerformLOADCombine(N, DAG, Subtarget);
   15100   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   15101   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
   15102   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   15103   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
   15104   case X86ISD::FXOR:
   15105   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   15106   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
   15107   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   15108   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   15109   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);
   15110   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
   15111   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
   15112   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
   15113   case X86ISD::SHUFP:       // Handle all target specific shuffles
   15114   case X86ISD::PALIGN:
   15115   case X86ISD::UNPCKH:
   15116   case X86ISD::UNPCKL:
   15117   case X86ISD::MOVHLPS:
   15118   case X86ISD::MOVLHPS:
   15119   case X86ISD::PSHUFD:
   15120   case X86ISD::PSHUFHW:
   15121   case X86ISD::PSHUFLW:
   15122   case X86ISD::MOVSS:
   15123   case X86ISD::MOVSD:
   15124   case X86ISD::VPERMILP:
   15125   case X86ISD::VPERM2X128:
   15126   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   15127   }
   15128 
   15129   return SDValue();
   15130 }
   15131 
   15132 /// isTypeDesirableForOp - Return true if the target has native support for
   15133 /// the specified value type and it is 'desirable' to use the type for the
   15134 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
   15135 /// instruction encodings are longer and some i16 instructions are slow.
   15136 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   15137   if (!isTypeLegal(VT))
   15138     return false;
   15139   if (VT != MVT::i16)
   15140     return true;
   15141 
   15142   switch (Opc) {
   15143   default:
   15144     return true;
   15145   case ISD::LOAD:
   15146   case ISD::SIGN_EXTEND:
   15147   case ISD::ZERO_EXTEND:
   15148   case ISD::ANY_EXTEND:
   15149   case ISD::SHL:
   15150   case ISD::SRL:
   15151   case ISD::SUB:
   15152   case ISD::ADD:
   15153   case ISD::MUL:
   15154   case ISD::AND:
   15155   case ISD::OR:
   15156   case ISD::XOR:
   15157     return false;
   15158   }
   15159 }
   15160 
    15161 /// IsDesirableToPromoteOp - This method queries the target whether it is
    15162 /// beneficial for the dag combiner to promote the specified node. If true, it
   15163 /// should return the desired promotion type by reference.
   15164 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   15165   EVT VT = Op.getValueType();
   15166   if (VT != MVT::i16)
   15167     return false;
   15168 
   15169   bool Promote = false;
   15170   bool Commute = false;
   15171   switch (Op.getOpcode()) {
   15172   default: break;
   15173   case ISD::LOAD: {
   15174     LoadSDNode *LD = cast<LoadSDNode>(Op);
   15175     // If the non-extending load has a single use and it's not live out, then it
   15176     // might be folded.
   15177     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
   15178                                                      Op.hasOneUse()*/) {
   15179       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   15180              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
    15181         // The only case where we'd want to promote LOAD (rather than it being
    15182         // promoted as an operand) is when its only use is live out.
   15183         if (UI->getOpcode() != ISD::CopyToReg)
   15184           return false;
   15185       }
   15186     }
   15187     Promote = true;
   15188     break;
   15189   }
   15190   case ISD::SIGN_EXTEND:
   15191   case ISD::ZERO_EXTEND:
   15192   case ISD::ANY_EXTEND:
   15193     Promote = true;
   15194     break;
   15195   case ISD::SHL:
   15196   case ISD::SRL: {
   15197     SDValue N0 = Op.getOperand(0);
   15198     // Look out for (store (shl (load), x)).
   15199     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
   15200       return false;
   15201     Promote = true;
   15202     break;
   15203   }
   15204   case ISD::ADD:
   15205   case ISD::MUL:
   15206   case ISD::AND:
   15207   case ISD::OR:
   15208   case ISD::XOR:
   15209     Commute = true;
   15210     // fallthrough
   15211   case ISD::SUB: {
   15212     SDValue N0 = Op.getOperand(0);
   15213     SDValue N1 = Op.getOperand(1);
   15214     if (!Commute && MayFoldLoad(N1))
   15215       return false;
   15216     // Avoid disabling potential load folding opportunities.
   15217     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
   15218       return false;
   15219     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
   15220       return false;
   15221     Promote = true;
   15222   }
   15223   }
   15224 
   15225   PVT = MVT::i32;
   15226   return Promote;
   15227 }
   15228 
   15229 //===----------------------------------------------------------------------===//
   15230 //                           X86 Inline Assembly Support
   15231 //===----------------------------------------------------------------------===//
   15232 
   15233 namespace {
    15234   // Helper to match a string against whitespace-separated pieces.
   15235   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
   15236     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
   15237 
   15238     for (unsigned i = 0, e = args.size(); i != e; ++i) {
   15239       StringRef piece(*args[i]);
   15240       if (!s.startswith(piece)) // Check if the piece matches.
   15241         return false;
   15242 
   15243       s = s.substr(piece.size());
   15244       StringRef::size_type pos = s.find_first_not_of(" \t");
   15245       if (pos == 0) // We matched a prefix.
   15246         return false;
   15247 
   15248       s = s.substr(pos);
   15249     }
   15250 
   15251     return s.empty();
   15252   }
   15253   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
   15254 }
   15255 
   15256 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   15257   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
   15258 
   15259   std::string AsmStr = IA->getAsmString();
   15260 
   15261   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
   15262   if (!Ty || Ty->getBitWidth() % 16 != 0)
   15263     return false;
   15264 
   15265   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
   15266   SmallVector<StringRef, 4> AsmPieces;
   15267   SplitString(AsmStr, AsmPieces, ";\n");
   15268 
   15269   switch (AsmPieces.size()) {
   15270   default: return false;
   15271   case 1:
   15272     // FIXME: this should verify that we are targeting a 486 or better.  If not,
   15273     // we will turn this bswap into something that will be lowered to logical
   15274     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
   15275     // lower so don't worry about this.
   15276     // bswap $0
   15277     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
   15278         matchAsm(AsmPieces[0], "bswapl", "$0") ||
   15279         matchAsm(AsmPieces[0], "bswapq", "$0") ||
   15280         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
   15281         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
   15282         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
   15283       // No need to check constraints, nothing other than the equivalent of
   15284       // "=r,0" would be valid here.
   15285       return IntrinsicLowering::LowerToByteSwap(CI);
   15286     }
   15287 
   15288     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
   15289     if (CI->getType()->isIntegerTy(16) &&
   15290         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   15291         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
   15292          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
   15293       AsmPieces.clear();
   15294       const std::string &ConstraintsStr = IA->getConstraintString();
   15295       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   15296       std::sort(AsmPieces.begin(), AsmPieces.end());
   15297       if (AsmPieces.size() == 4 &&
   15298           AsmPieces[0] == "~{cc}" &&
   15299           AsmPieces[1] == "~{dirflag}" &&
   15300           AsmPieces[2] == "~{flags}" &&
   15301           AsmPieces[3] == "~{fpsr}")
   15302       return IntrinsicLowering::LowerToByteSwap(CI);
   15303     }
   15304     break;
   15305   case 3:
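              // rorw $$8, ${0:w} / rorl $$16, $0 / rorw $$8, ${0:w}  -->  llvm.bswap.i32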
   15306     if (CI->getType()->isIntegerTy(32) &&
   15307         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   15308         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
   15309         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
   15310         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
   15311       AsmPieces.clear();
   15312       const std::string &ConstraintsStr = IA->getConstraintString();
   15313       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   15314       std::sort(AsmPieces.begin(), AsmPieces.end());
   15315       if (AsmPieces.size() == 4 &&
   15316           AsmPieces[0] == "~{cc}" &&
   15317           AsmPieces[1] == "~{dirflag}" &&
   15318           AsmPieces[2] == "~{flags}" &&
   15319           AsmPieces[3] == "~{fpsr}")
   15320         return IntrinsicLowering::LowerToByteSwap(CI);
   15321     }
   15322 
   15323     if (CI->getType()->isIntegerTy(64)) {
   15324       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
   15325       if (Constraints.size() >= 2 &&
   15326           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
   15327           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
   15328         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
   15329         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
   15330             matchAsm(AsmPieces[1], "bswap", "%edx") &&
   15331             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
   15332           return IntrinsicLowering::LowerToByteSwap(CI);
   15333       }
   15334     }
   15335     break;
   15336   }
   15337   return false;
   15338 }
   15339 
   15340 
   15341 
   15342 /// getConstraintType - Given a constraint letter, return the type of
   15343 /// constraint it is for this target.
   15344 X86TargetLowering::ConstraintType
   15345 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
   15346   if (Constraint.size() == 1) {
   15347     switch (Constraint[0]) {
   15348     case 'R':
   15349     case 'q':
   15350     case 'Q':
   15351     case 'f':
   15352     case 't':
   15353     case 'u':
   15354     case 'y':
   15355     case 'x':
   15356     case 'Y':
   15357     case 'l':
   15358       return C_RegisterClass;
   15359     case 'a':
   15360     case 'b':
   15361     case 'c':
   15362     case 'd':
   15363     case 'S':
   15364     case 'D':
   15365     case 'A':
   15366       return C_Register;
   15367     case 'I':
   15368     case 'J':
   15369     case 'K':
   15370     case 'L':
   15371     case 'M':
   15372     case 'N':
   15373     case 'G':
   15374     case 'C':
   15375     case 'e':
   15376     case 'Z':
   15377       return C_Other;
   15378     default:
   15379       break;
   15380     }
   15381   }
   15382   return TargetLowering::getConstraintType(Constraint);
   15383 }
   15384 
   15385 /// Examine constraint type and operand type and determine a weight value.
   15386 /// This object must already have been set up with the operand type
   15387 /// and the current alternative constraint selected.
   15388 TargetLowering::ConstraintWeight
   15389   X86TargetLowering::getSingleConstraintMatchWeight(
   15390     AsmOperandInfo &info, const char *constraint) const {
   15391   ConstraintWeight weight = CW_Invalid;
   15392   Value *CallOperandVal = info.CallOperandVal;
   15393     // If we don't have a value, we can't do a match,
   15394     // but allow it at the lowest weight.
   15395   if (CallOperandVal == NULL)
   15396     return CW_Default;
   15397   Type *type = CallOperandVal->getType();
   15398   // Look at the constraint type.
   15399   switch (*constraint) {
   15400   default:
   15401     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
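              // Note: control falls through to the register-constraint cases below,
              // which may override this default weight.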
   15402   case 'R':
   15403   case 'q':
   15404   case 'Q':
   15405   case 'a':
   15406   case 'b':
   15407   case 'c':
   15408   case 'd':
   15409   case 'S':
   15410   case 'D':
   15411   case 'A':
   15412     if (CallOperandVal->getType()->isIntegerTy())
   15413       weight = CW_SpecificReg;
   15414     break;
   15415   case 'f':
   15416   case 't':
   15417   case 'u':
   15418       if (type->isFloatingPointTy())
   15419         weight = CW_SpecificReg;
   15420       break;
   15421   case 'y':
   15422       if (type->isX86_MMXTy() && Subtarget->hasMMX())
   15423         weight = CW_SpecificReg;
   15424       break;
   15425   case 'x':
   15426   case 'Y':
   15427     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
   15428         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
   15429       weight = CW_Register;
   15430     break;
   15431   case 'I':
   15432     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
   15433       if (C->getZExtValue() <= 31)
   15434         weight = CW_Constant;
   15435     }
   15436     break;
   15437   case 'J':
   15438     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   15439       if (C->getZExtValue() <= 63)
   15440         weight = CW_Constant;
   15441     }
   15442     break;
   15443   case 'K':
   15444     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   15445       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
   15446         weight = CW_Constant;
   15447     }
   15448     break;
   15449   case 'L':
   15450     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   15451       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
   15452         weight = CW_Constant;
   15453     }
   15454     break;
   15455   case 'M':
   15456     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   15457       if (C->getZExtValue() <= 3)
   15458         weight = CW_Constant;
   15459     }
   15460     break;
   15461   case 'N':
   15462     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   15463       if (C->getZExtValue() <= 0xff)
   15464         weight = CW_Constant;
   15465     }
   15466     break;
   15467   case 'G':
   15468   case 'C':
   15469     if (dyn_cast<ConstantFP>(CallOperandVal)) {
   15470       weight = CW_Constant;
   15471     }
   15472     break;
   15473   case 'e':
   15474     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   15475       if ((C->getSExtValue() >= -0x80000000LL) &&
   15476           (C->getSExtValue() <= 0x7fffffffLL))
   15477         weight = CW_Constant;
   15478     }
   15479     break;
   15480   case 'Z':
   15481     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   15482       if (C->getZExtValue() <= 0xffffffff)
   15483         weight = CW_Constant;
   15484     }
   15485     break;
   15486   }
   15487   return weight;
   15488 }
   15489 
   15490 /// LowerXConstraint - try to replace an X constraint, which matches anything,
   15491 /// with another that has more specific requirements based on the type of the
   15492 /// corresponding operand.
   15493 const char *X86TargetLowering::
   15494 LowerXConstraint(EVT ConstraintVT) const {
   15495   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   15496   // 'f' like normal targets.
   15497   if (ConstraintVT.isFloatingPoint()) {
   15498     if (Subtarget->hasSSE2())
   15499       return "Y";
   15500     if (Subtarget->hasSSE1())
   15501       return "x";
   15502   }
   15503 
   15504   return TargetLowering::LowerXConstraint(ConstraintVT);
   15505 }
   15506 
   15507 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
   15508 /// vector.  If it is invalid, don't add anything to Ops.
   15509 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   15510                                                      std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
   15512                                                      SelectionDAG &DAG) const {
   15513   SDValue Result(0, 0);
   15514 
   15515   // Only support length 1 constraints for now.
   15516   if (Constraint.length() > 1) return;
   15517 
   15518   char ConstraintLetter = Constraint[0];
   15519   switch (ConstraintLetter) {
   15520   default: break;
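  // 'I', 'J', 'K' and 'N' below accept the same integer ranges that
  // getSingleConstraintMatchWeight checks: [0,31], [0,63], signed 8-bit and
  // [0,255] respectively.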
   15521   case 'I':
   15522     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   15523       if (C->getZExtValue() <= 31) {
   15524         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   15525         break;
   15526       }
   15527     }
   15528     return;
   15529   case 'J':
   15530     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   15531       if (C->getZExtValue() <= 63) {
   15532         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   15533         break;
   15534       }
   15535     }
   15536     return;
   15537   case 'K':
   15538     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   15539       if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
   15540         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   15541         break;
   15542       }
   15543     }
   15544     return;
   15545   case 'N':
   15546     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   15547       if (C->getZExtValue() <= 255) {
   15548         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   15549         break;
   15550       }
   15551     }
   15552     return;
   15553   case 'e': {
   15554     // 32-bit signed value
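    // e.g. "e"(-1) or "e"(0x7fffffff) is accepted, while "e"(0x100000000LL)
    // is rejected because it cannot be represented as a sign-extended i32.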
   15555     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   15556       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   15557                                            C->getSExtValue())) {
   15558         // Widen to 64 bits here to get it sign extended.
   15559         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
   15560         break;
   15561       }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
   15565     return;
   15566   }
   15567   case 'Z': {
   15568     // 32-bit unsigned value
   15569     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   15570       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   15571                                            C->getZExtValue())) {
   15572         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   15573         break;
   15574       }
   15575     }
   15576     // FIXME gcc accepts some relocatable values here too, but only in certain
   15577     // memory models; it's complicated.
   15578     return;
   15579   }
   15580   case 'i': {
   15581     // Literal immediates are always ok.
   15582     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
   15583       // Widen to 64 bits here to get it sign extended.
   15584       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
   15585       break;
   15586     }
   15587 
   15588     // In any sort of PIC mode addresses need to be computed at runtime by
   15589     // adding in a register or some sort of table lookup.  These can't
   15590     // be used as immediates.
   15591     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
   15592       return;
   15593 
   15594     // If we are in non-pic codegen mode, we allow the address of a global (with
   15595     // an optional displacement) to be used with 'i'.
   15596     GlobalAddressSDNode *GA = 0;
   15597     int64_t Offset = 0;
   15598 
   15599     // Match either (GA), (GA+C), (GA+C1+C2), etc.
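    // e.g. "i"(&g + 8) typically arrives here as ISD::ADD(GlobalAddress(g),
    // Constant(8)); the loop below peels adds/subs and folds the constants
    // into a single Offset.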
   15600     while (1) {
   15601       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
   15602         Offset += GA->getOffset();
   15603         break;
   15604       } else if (Op.getOpcode() == ISD::ADD) {
   15605         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   15606           Offset += C->getZExtValue();
   15607           Op = Op.getOperand(0);
   15608           continue;
   15609         }
   15610       } else if (Op.getOpcode() == ISD::SUB) {
   15611         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   15612           Offset += -C->getZExtValue();
   15613           Op = Op.getOperand(0);
   15614           continue;
   15615         }
   15616       }
   15617 
   15618       // Otherwise, this isn't something we can handle, reject it.
   15619       return;
   15620     }
   15621 
   15622     const GlobalValue *GV = GA->getGlobal();
   15623     // If we require an extra load to get this address, as in PIC mode, we
   15624     // can't accept it.
   15625     if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
   15626                                                         getTargetMachine())))
   15627       return;
   15628 
   15629     Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
   15630                                         GA->getValueType(0), Offset);
   15631     break;
   15632   }
   15633   }
   15634 
   15635   if (Result.getNode()) {
   15636     Ops.push_back(Result);
   15637     return;
   15638   }
   15639   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   15640 }
   15641 
   15642 std::pair<unsigned, const TargetRegisterClass*>
   15643 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   15644                                                 EVT VT) const {
   15645   // First, see if this is a constraint that directly corresponds to an LLVM
   15646   // register class.
   15647   if (Constraint.size() == 1) {
   15648     // GCC Constraint Letters
   15649     switch (Constraint[0]) {
   15650     default: break;
   15651       // TODO: Slight differences here in allocation order and leaving
   15652       // RIP in the class. Do they matter any more here than they do
   15653       // in the normal allocation?
   15654     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, X86::GR32RegisterClass);
        else if (VT == MVT::i16)
          return std::make_pair(0U, X86::GR16RegisterClass);
        else if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, X86::GR8RegisterClass);
        else if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, X86::GR64RegisterClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
      else if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
      else if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
      else if (VT == MVT::i64)
        return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
      break;
   15677     case 'r':   // GENERAL_REGS
   15678     case 'l':   // INDEX_REGS
   15679       if (VT == MVT::i8 || VT == MVT::i1)
   15680         return std::make_pair(0U, X86::GR8RegisterClass);
   15681       if (VT == MVT::i16)
   15682         return std::make_pair(0U, X86::GR16RegisterClass);
   15683       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
   15684         return std::make_pair(0U, X86::GR32RegisterClass);
   15685       return std::make_pair(0U, X86::GR64RegisterClass);
   15686     case 'R':   // LEGACY_REGS
   15687       if (VT == MVT::i8 || VT == MVT::i1)
   15688         return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
   15689       if (VT == MVT::i16)
   15690         return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
   15691       if (VT == MVT::i32 || !Subtarget->is64Bit())
   15692         return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
   15693       return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
   15694     case 'f':  // FP Stack registers.
   15695       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
   15696       // value to the correct fpstack register class.
   15697       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
   15698         return std::make_pair(0U, X86::RFP32RegisterClass);
   15699       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
   15700         return std::make_pair(0U, X86::RFP64RegisterClass);
   15701       return std::make_pair(0U, X86::RFP80RegisterClass);
   15702     case 'y':   // MMX_REGS if MMX allowed.
   15703       if (!Subtarget->hasMMX()) break;
   15704       return std::make_pair(0U, X86::VR64RegisterClass);
   15705     case 'Y':   // SSE_REGS if SSE2 allowed
   15706       if (!Subtarget->hasSSE2()) break;
   15707       // FALL THROUGH.
   15708     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
   15709       if (!Subtarget->hasSSE1()) break;
   15710 
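      // Pick the class from the operand type: scalar f32/f64 (and i32/i64)
      // map to FR32/FR64, 128-bit vectors to VR128, and 256-bit AVX vectors
      // to VR256.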
   15711       switch (VT.getSimpleVT().SimpleTy) {
   15712       default: break;
   15713       // Scalar SSE types.
   15714       case MVT::f32:
   15715       case MVT::i32:
   15716         return std::make_pair(0U, X86::FR32RegisterClass);
   15717       case MVT::f64:
   15718       case MVT::i64:
   15719         return std::make_pair(0U, X86::FR64RegisterClass);
   15720       // Vector types.
   15721       case MVT::v16i8:
   15722       case MVT::v8i16:
   15723       case MVT::v4i32:
   15724       case MVT::v2i64:
   15725       case MVT::v4f32:
   15726       case MVT::v2f64:
   15727         return std::make_pair(0U, X86::VR128RegisterClass);
   15728       // AVX types.
   15729       case MVT::v32i8:
   15730       case MVT::v16i16:
   15731       case MVT::v8i32:
   15732       case MVT::v4i64:
   15733       case MVT::v8f32:
   15734       case MVT::v4f64:
   15735         return std::make_pair(0U, X86::VR256RegisterClass);
   15736 
   15737       }
   15738       break;
   15739     }
   15740   }
   15741 
   15742   // Use the default implementation in TargetLowering to convert the register
   15743   // constraint into a member of a register class.
   15744   std::pair<unsigned, const TargetRegisterClass*> Res;
   15745   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
   15746 
   15747   // Not found as a standard register?
   15748   if (Res.second == 0) {
    // Map {st(0)} .. {st(7)} onto ST0 .. ST7.
   15750     if (Constraint.size() == 7 && Constraint[0] == '{' &&
   15751         std::tolower(Constraint[1]) == 's' &&
   15752         std::tolower(Constraint[2]) == 't' &&
   15753         Constraint[3] == '(' &&
   15754         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
   15755         Constraint[5] == ')' &&
   15756         Constraint[6] == '}') {
   15757 
   15758       Res.first = X86::ST0+Constraint[4]-'0';
   15759       Res.second = X86::RFP80RegisterClass;
   15760       return Res;
   15761     }
   15762 
   15763     // GCC allows "st(0)" to be called just plain "st".
   15764     if (StringRef("{st}").equals_lower(Constraint)) {
   15765       Res.first = X86::ST0;
   15766       Res.second = X86::RFP80RegisterClass;
   15767       return Res;
   15768     }
   15769 
   15770     // flags -> EFLAGS
   15771     if (StringRef("{flags}").equals_lower(Constraint)) {
   15772       Res.first = X86::EFLAGS;
   15773       Res.second = X86::CCRRegisterClass;
   15774       return Res;
   15775     }
   15776 
   15777     // 'A' means EAX + EDX.
   15778     if (Constraint == "A") {
   15779       Res.first = X86::EAX;
   15780       Res.second = X86::GR32_ADRegisterClass;
   15781       return Res;
   15782     }
   15783     return Res;
   15784   }
   15785 
   15786   // Otherwise, check to see if this is a register class of the wrong value
   15787   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
   15788   // turn into {ax},{dx}.
   15789   if (Res.second->hasType(VT))
   15790     return Res;   // Correct type already, nothing to do.
   15791 
   15792   // All of the single-register GCC register classes map their values onto
   15793   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
   15794   // really want an 8-bit or 32-bit register, map to the appropriate register
   15795   // class and return the appropriate register.
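  // e.g. an "{ax}" operand of type i32 should be allocated to EAX in GR32,
  // not to AX in GR16; the remapping below handles i8, i32 and i64 requests.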
   15796   if (Res.second == X86::GR16RegisterClass) {
   15797     if (VT == MVT::i8) {
   15798       unsigned DestReg = 0;
   15799       switch (Res.first) {
   15800       default: break;
   15801       case X86::AX: DestReg = X86::AL; break;
   15802       case X86::DX: DestReg = X86::DL; break;
   15803       case X86::CX: DestReg = X86::CL; break;
   15804       case X86::BX: DestReg = X86::BL; break;
   15805       }
   15806       if (DestReg) {
   15807         Res.first = DestReg;
   15808         Res.second = X86::GR8RegisterClass;
   15809       }
   15810     } else if (VT == MVT::i32) {
   15811       unsigned DestReg = 0;
   15812       switch (Res.first) {
   15813       default: break;
   15814       case X86::AX: DestReg = X86::EAX; break;
   15815       case X86::DX: DestReg = X86::EDX; break;
   15816       case X86::CX: DestReg = X86::ECX; break;
   15817       case X86::BX: DestReg = X86::EBX; break;
   15818       case X86::SI: DestReg = X86::ESI; break;
   15819       case X86::DI: DestReg = X86::EDI; break;
   15820       case X86::BP: DestReg = X86::EBP; break;
   15821       case X86::SP: DestReg = X86::ESP; break;
   15822       }
   15823       if (DestReg) {
   15824         Res.first = DestReg;
   15825         Res.second = X86::GR32RegisterClass;
   15826       }
   15827     } else if (VT == MVT::i64) {
   15828       unsigned DestReg = 0;
   15829       switch (Res.first) {
   15830       default: break;
   15831       case X86::AX: DestReg = X86::RAX; break;
   15832       case X86::DX: DestReg = X86::RDX; break;
   15833       case X86::CX: DestReg = X86::RCX; break;
   15834       case X86::BX: DestReg = X86::RBX; break;
   15835       case X86::SI: DestReg = X86::RSI; break;
   15836       case X86::DI: DestReg = X86::RDI; break;
   15837       case X86::BP: DestReg = X86::RBP; break;
   15838       case X86::SP: DestReg = X86::RSP; break;
   15839       }
   15840       if (DestReg) {
   15841         Res.first = DestReg;
   15842         Res.second = X86::GR64RegisterClass;
   15843       }
   15844     }
   15845   } else if (Res.second == X86::FR32RegisterClass ||
   15846              Res.second == X86::FR64RegisterClass ||
   15847              Res.second == X86::VR128RegisterClass) {
   15848     // Handle references to XMM physical registers that got mapped into the
   15849     // wrong class.  This can happen with constraints like {xmm0} where the
   15850     // target independent register mapper will just pick the first match it can
   15851     // find, ignoring the required type.
   15852     if (VT == MVT::f32)
   15853       Res.second = X86::FR32RegisterClass;
   15854     else if (VT == MVT::f64)
   15855       Res.second = X86::FR64RegisterClass;
   15856     else if (X86::VR128RegisterClass->hasType(VT))
   15857       Res.second = X86::VR128RegisterClass;
   15858   }
   15859 
   15860   return Res;
   15861 }
   15862