Home | History | Annotate | Download | only in X86
      1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #define DEBUG_TYPE "x86-isel"
     16 #include "X86ISelLowering.h"
     17 #include "Utils/X86ShuffleDecode.h"
     18 #include "X86.h"
     19 #include "X86InstrBuilder.h"
     20 #include "X86TargetMachine.h"
     21 #include "X86TargetObjectFile.h"
     22 #include "llvm/ADT/SmallSet.h"
     23 #include "llvm/ADT/Statistic.h"
     24 #include "llvm/ADT/StringExtras.h"
     25 #include "llvm/ADT/VariadicFunction.h"
     26 #include "llvm/CodeGen/IntrinsicLowering.h"
     27 #include "llvm/CodeGen/MachineFrameInfo.h"
     28 #include "llvm/CodeGen/MachineFunction.h"
     29 #include "llvm/CodeGen/MachineInstrBuilder.h"
     30 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     31 #include "llvm/CodeGen/MachineModuleInfo.h"
     32 #include "llvm/CodeGen/MachineRegisterInfo.h"
     33 #include "llvm/IR/CallingConv.h"
     34 #include "llvm/IR/Constants.h"
     35 #include "llvm/IR/DerivedTypes.h"
     36 #include "llvm/IR/Function.h"
     37 #include "llvm/IR/GlobalAlias.h"
     38 #include "llvm/IR/GlobalVariable.h"
     39 #include "llvm/IR/Instructions.h"
     40 #include "llvm/IR/Intrinsics.h"
     41 #include "llvm/IR/LLVMContext.h"
     42 #include "llvm/MC/MCAsmInfo.h"
     43 #include "llvm/MC/MCContext.h"
     44 #include "llvm/MC/MCExpr.h"
     45 #include "llvm/MC/MCSymbol.h"
     46 #include "llvm/Support/CallSite.h"
     47 #include "llvm/Support/Debug.h"
     48 #include "llvm/Support/ErrorHandling.h"
     49 #include "llvm/Support/MathExtras.h"
     50 #include "llvm/Target/TargetOptions.h"
     51 #include <bitset>
     52 #include <cctype>
     53 using namespace llvm;
     54 
     55 STATISTIC(NumTailCalls, "Number of tail calls");
     56 
     57 // Forward declarations.
     58 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
     59                        SDValue V2);
     60 
     61 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
     62 /// sets things up to match to an AVX VEXTRACTF128 instruction or a
     63 /// simple subregister reference.  Idx is an index in the 128 bits we
     64 /// want.  It need not be aligned to a 128-bit bounday.  That makes
     65 /// lowering EXTRACT_VECTOR_ELT operations easier.
     66 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
     67                                    SelectionDAG &DAG, DebugLoc dl) {
     68   EVT VT = Vec.getValueType();
     69   assert(VT.is256BitVector() && "Unexpected vector size!");
     70   EVT ElVT = VT.getVectorElementType();
     71   unsigned Factor = VT.getSizeInBits()/128;
     72   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
     73                                   VT.getVectorNumElements()/Factor);
     74 
     75   // Extract from UNDEF is UNDEF.
     76   if (Vec.getOpcode() == ISD::UNDEF)
     77     return DAG.getUNDEF(ResultVT);
     78 
     79   // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
     80   // we can match to VEXTRACTF128.
     81   unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
     82 
     83   // This is the index of the first element of the 128-bit chunk
     84   // we want.
     85   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
     86                                * ElemsPerChunk);
     87 
     88   // If the input is a buildvector just emit a smaller one.
     89   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
     90     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
     91                        Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
     92 
     93   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
     94   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
     95                                VecIdx);
     96 
     97   return Result;
     98 }
     99 
    100 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
    101 /// sets things up to match to an AVX VINSERTF128 instruction or a
    102 /// simple superregister reference.  Idx is an index in the 128 bits
    103 /// we want.  It need not be aligned to a 128-bit bounday.  That makes
    104 /// lowering INSERT_VECTOR_ELT operations easier.
    105 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
    106                                   unsigned IdxVal, SelectionDAG &DAG,
    107                                   DebugLoc dl) {
    108   // Inserting UNDEF is Result
    109   if (Vec.getOpcode() == ISD::UNDEF)
    110     return Result;
    111 
    112   EVT VT = Vec.getValueType();
    113   assert(VT.is128BitVector() && "Unexpected vector size!");
    114 
    115   EVT ElVT = VT.getVectorElementType();
    116   EVT ResultVT = Result.getValueType();
    117 
    118   // Insert the relevant 128 bits.
    119   unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
    120 
    121   // This is the index of the first element of the 128-bit chunk
    122   // we want.
    123   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
    124                                * ElemsPerChunk);
    125 
    126   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
    127   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
    128                      VecIdx);
    129 }
    130 
    131 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
    132 /// instructions. This is used because creating CONCAT_VECTOR nodes of
    133 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
    134 /// large BUILD_VECTORS.
    135 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
    136                                    unsigned NumElems, SelectionDAG &DAG,
    137                                    DebugLoc dl) {
    138   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
    139   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
    140 }
    141 
    142 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
    143   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
    144   bool is64Bit = Subtarget->is64Bit();
    145 
    146   if (Subtarget->isTargetEnvMacho()) {
    147     if (is64Bit)
    148       return new X86_64MachoTargetObjectFile();
    149     return new TargetLoweringObjectFileMachO();
    150   }
    151 
    152   if (Subtarget->isTargetLinux())
    153     return new X86LinuxTargetObjectFile();
    154   if (Subtarget->isTargetELF())
    155     return new TargetLoweringObjectFileELF();
    156   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    157     return new TargetLoweringObjectFileCOFF();
    158   llvm_unreachable("unknown subtarget type");
    159 }
    160 
    161 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    162   : TargetLowering(TM, createTLOF(TM)) {
    163   Subtarget = &TM.getSubtarget<X86Subtarget>();
    164   X86ScalarSSEf64 = Subtarget->hasSSE2();
    165   X86ScalarSSEf32 = Subtarget->hasSSE1();
    166 
    167   RegInfo = TM.getRegisterInfo();
    168   TD = getDataLayout();
    169 
    170   // Set up the TargetLowering object.
    171   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
    172 
    173   // X86 is weird, it always uses i8 for shift amounts and setcc results.
    174   setBooleanContents(ZeroOrOneBooleanContent);
    175   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    176   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
    177 
    178   // For 64-bit since we have so many registers use the ILP scheduler, for
    179   // 32-bit code use the register pressure specific scheduling.
    180   // For Atom, always use ILP scheduling.
    181   if (Subtarget->isAtom())
    182     setSchedulingPreference(Sched::ILP);
    183   else if (Subtarget->is64Bit())
    184     setSchedulingPreference(Sched::ILP);
    185   else
    186     setSchedulingPreference(Sched::RegPressure);
    187   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
    188 
    189   // Bypass expensive divides on Atom when compiling with O2
    190   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
    191     addBypassSlowDiv(32, 8);
    192     if (Subtarget->is64Bit())
    193       addBypassSlowDiv(64, 16);
    194   }
    195 
    196   if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    197     // Setup Windows compiler runtime calls.
    198     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    199     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    200     setLibcallName(RTLIB::SREM_I64, "_allrem");
    201     setLibcallName(RTLIB::UREM_I64, "_aullrem");
    202     setLibcallName(RTLIB::MUL_I64, "_allmul");
    203     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    204     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    205     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    206     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    207     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    208 
    209     // The _ftol2 runtime function has an unusual calling conv, which
    210     // is modeled by a special pseudo-instruction.
    211     setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    212     setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    213     setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    214     setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
    215   }
    216 
    217   if (Subtarget->isTargetDarwin()) {
    218     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    219     setUseUnderscoreSetJmp(false);
    220     setUseUnderscoreLongJmp(false);
    221   } else if (Subtarget->isTargetMingw()) {
    222     // MS runtime is weird: it exports _setjmp, but longjmp!
    223     setUseUnderscoreSetJmp(true);
    224     setUseUnderscoreLongJmp(false);
    225   } else {
    226     setUseUnderscoreSetJmp(true);
    227     setUseUnderscoreLongJmp(true);
    228   }
    229 
    230   // Set up the register classes.
    231   addRegisterClass(MVT::i8, &X86::GR8RegClass);
    232   addRegisterClass(MVT::i16, &X86::GR16RegClass);
    233   addRegisterClass(MVT::i32, &X86::GR32RegClass);
    234   if (Subtarget->is64Bit())
    235     addRegisterClass(MVT::i64, &X86::GR64RegClass);
    236 
    237   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
    238 
    239   // We don't accept any truncstore of integer registers.
    240   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    241   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    242   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    243   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    244   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
    245   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
    246 
    247   // SETOEQ and SETUNE require checking two conditions.
    248   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
    249   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
    250   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
    251   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
    252   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
    253   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
    254 
    255   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    256   // operation.
    257   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
    258   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
    259   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
    260 
    261   if (Subtarget->is64Bit()) {
    262     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    263     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    264   } else if (!TM.Options.UseSoftFloat) {
    265     // We have an algorithm for SSE2->double, and we turn this into a
    266     // 64-bit FILD followed by conditional FADD for other targets.
    267     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    268     // We have an algorithm for SSE2, and we turn this into a 64-bit
    269     // FILD for other targets.
    270     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    271   }
    272 
    273   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    274   // this operation.
    275   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    276   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
    277 
    278   if (!TM.Options.UseSoftFloat) {
    279     // SSE has no i16 to fp conversion, only i32
    280     if (X86ScalarSSEf32) {
    281       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    282       // f32 and f64 cases are Legal, f80 case is not
    283       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    284     } else {
    285       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    286       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    287     }
    288   } else {
    289     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    290     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
    291   }
    292 
    293   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    294   // are Legal, f80 is custom lowered.
    295   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    296   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    297 
    298   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
    299   // this operation.
    300   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
    301   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
    302 
    303   if (X86ScalarSSEf32) {
    304     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    305     // f32 and f64 cases are Legal, f80 case is not
    306     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    307   } else {
    308     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    309     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    310   }
    311 
    312   // Handle FP_TO_UINT by promoting the destination to a larger signed
    313   // conversion.
    314   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
    315   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
    316   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
    317 
    318   if (Subtarget->is64Bit()) {
    319     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    320     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
    321   } else if (!TM.Options.UseSoftFloat) {
    322     // Since AVX is a superset of SSE3, only check for SSE here.
    323     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
    324       // Expand FP_TO_UINT into a select.
    325       // FIXME: We would like to use a Custom expander here eventually to do
    326       // the optimal thing for SSE vs. the default expansion in the legalizer.
    327       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    328     else
    329       // With SSE3 we can use fisttpll to convert to a signed i64; without
    330       // SSE, we're stuck with a fistpll.
    331       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    332   }
    333 
    334   if (isTargetFTOL()) {
    335     // Use the _ftol2 runtime function, which has a pseudo-instruction
    336     // to handle its weird calling convention.
    337     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
    338   }
    339 
    340   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
    341   if (!X86ScalarSSEf64) {
    342     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    343     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    344     if (Subtarget->is64Bit()) {
    345       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
    346       // Without SSE, i64->f64 goes through memory.
    347       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    348     }
    349   }
    350 
    351   // Scalar integer divide and remainder are lowered to use operations that
    352   // produce two results, to match the available instructions. This exposes
    353   // the two-result form to trivial CSE, which is able to combine x/y and x%y
    354   // into a single instruction.
    355   //
    356   // Scalar integer multiply-high is also lowered to use two-result
    357   // operations, to match the available instructions. However, plain multiply
    358   // (low) operations are left as Legal, as there are single-result
    359   // instructions for this in x86. Using the two-result multiply instructions
    360   // when both high and low results are needed must be arranged by dagcombine.
    361   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    362     MVT VT = IntVTs[i];
    363     setOperationAction(ISD::MULHS, VT, Expand);
    364     setOperationAction(ISD::MULHU, VT, Expand);
    365     setOperationAction(ISD::SDIV, VT, Expand);
    366     setOperationAction(ISD::UDIV, VT, Expand);
    367     setOperationAction(ISD::SREM, VT, Expand);
    368     setOperationAction(ISD::UREM, VT, Expand);
    369 
    370     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    371     setOperationAction(ISD::ADDC, VT, Custom);
    372     setOperationAction(ISD::ADDE, VT, Custom);
    373     setOperationAction(ISD::SUBC, VT, Custom);
    374     setOperationAction(ISD::SUBE, VT, Custom);
    375   }
    376 
    377   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
    378   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
    379   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
    380   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
    381   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
    382   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
    383   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
    384   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
    385   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
    386   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
    387   if (Subtarget->is64Bit())
    388     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    389   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
    390   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
    391   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
    392   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
    393   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
    394   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
    395   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    396   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
    397 
    398   // Promote the i8 variants and force them on up to i32 which has a shorter
    399   // encoding.
    400   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
    401   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
    402   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
    403   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
    404   if (Subtarget->hasBMI()) {
    405     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    406     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    407     if (Subtarget->is64Bit())
    408       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
    409   } else {
    410     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    411     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    412     if (Subtarget->is64Bit())
    413       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    414   }
    415 
    416   if (Subtarget->hasLZCNT()) {
    417     // When promoting the i8 variants, force them to i32 for a shorter
    418     // encoding.
    419     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    420     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    421     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    422     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    423     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    424     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    425     if (Subtarget->is64Bit())
    426       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
    427   } else {
    428     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    429     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    430     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    431     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    432     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    433     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    434     if (Subtarget->is64Bit()) {
    435       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
    436       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    437     }
    438   }
    439 
    440   if (Subtarget->hasPOPCNT()) {
    441     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    442   } else {
    443     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    444     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    445     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    446     if (Subtarget->is64Bit())
    447       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    448   }
    449 
    450   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    451   setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
    452 
    453   // These should be promoted to a larger select which is supported.
    454   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
    455   // X86 wants to expand cmov itself.
    456   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
    457   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
    458   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
    459   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
    460   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
    461   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
    462   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
    463   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
    464   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
    465   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
    466   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
    467   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
    468   if (Subtarget->is64Bit()) {
    469     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    470     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
    471   }
    472   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
    473   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intened to support
    474   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
    475   // support continuation, user-level threading, and etc.. As a result, no
    476   // other SjLj exception interfaces are implemented and please don't build
    477   // your own exception handling based on them.
    478   // LLVM/Clang supports zero-cost DWARF exception handling.
    479   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
    480   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
    481 
    482   // Darwin ABI issue.
    483   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
    484   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
    485   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
    486   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
    487   if (Subtarget->is64Bit())
    488     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
    489   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
    490   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
    491   if (Subtarget->is64Bit()) {
    492     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    493     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    494     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    495     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    496     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
    497   }
    498   // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
    499   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
    500   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
    501   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
    502   if (Subtarget->is64Bit()) {
    503     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    504     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    505     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
    506   }
    507 
    508   if (Subtarget->hasSSE1())
    509     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
    510 
    511   setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
    512   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
    513 
    514   // On X86 and X86-64, atomic operations are lowered to locked instructions.
    515   // Locked instructions, in turn, have implicit fence semantics (all memory
    516   // operations are flushed before issuing the locked instruction, and they
    517   // are not buffered), so we can fold away the common pattern of
    518   // fence-atomic-fence.
    519   setShouldFoldAtomicFences(true);
    520 
    521   // Expand certain atomics
    522   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    523     MVT VT = IntVTs[i];
    524     setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    525     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    526     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    527   }
    528 
    529   if (!Subtarget->is64Bit()) {
    530     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    531     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    532     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    533     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    534     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    535     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    536     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    537     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    538     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
    539     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
    540     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
    541     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
    542   }
    543 
    544   if (Subtarget->hasCmpxchg16b()) {
    545     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
    546   }
    547 
    548   // FIXME - use subtarget debug flags
    549   if (!Subtarget->isTargetDarwin() &&
    550       !Subtarget->isTargetELF() &&
    551       !Subtarget->isTargetCygMing()) {
    552     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
    553   }
    554 
    555   setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
    556   setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
    557   setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
    558   setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
    559   if (Subtarget->is64Bit()) {
    560     setExceptionPointerRegister(X86::RAX);
    561     setExceptionSelectorRegister(X86::RDX);
    562   } else {
    563     setExceptionPointerRegister(X86::EAX);
    564     setExceptionSelectorRegister(X86::EDX);
    565   }
    566   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
    567   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
    568 
    569   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
    570   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
    571 
    572   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    573   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
    574 
    575   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    576   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    577   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    578   if (Subtarget->is64Bit()) {
    579     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    580     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
    581   } else {
    582     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    583     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
    584   }
    585 
    586   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    587   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
    588 
    589   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    590     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    591                        MVT::i64 : MVT::i32, Custom);
    592   else if (TM.Options.EnableSegmentedStacks)
    593     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    594                        MVT::i64 : MVT::i32, Custom);
    595   else
    596     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    597                        MVT::i64 : MVT::i32, Expand);
    598 
    599   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    600     // f32 and f64 use SSE.
    601     // Set up the FP register classes.
    602     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    603     addRegisterClass(MVT::f64, &X86::FR64RegClass);
    604 
    605     // Use ANDPD to simulate FABS.
    606     setOperationAction(ISD::FABS , MVT::f64, Custom);
    607     setOperationAction(ISD::FABS , MVT::f32, Custom);
    608 
    609     // Use XORP to simulate FNEG.
    610     setOperationAction(ISD::FNEG , MVT::f64, Custom);
    611     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    612 
    613     // Use ANDPD and ORPD to simulate FCOPYSIGN.
    614     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    615     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    616 
    617     // Lower this to FGETSIGNx86 plus an AND.
    618     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    619     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    620 
    621     // We don't support sin/cos/fmod
    622     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    623     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    624     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    625     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    626     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    627     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    628 
    629     // Expand FP immediates into loads from the stack, except for the special
    630     // cases we handle.
    631     addLegalFPImmediate(APFloat(+0.0)); // xorpd
    632     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    633   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    634     // Use SSE for f32, x87 for f64.
    635     // Set up the FP register classes.
    636     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    637     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    638 
    639     // Use ANDPS to simulate FABS.
    640     setOperationAction(ISD::FABS , MVT::f32, Custom);
    641 
    642     // Use XORP to simulate FNEG.
    643     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    644 
    645     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    646 
    647     // Use ANDPS and ORPS to simulate FCOPYSIGN.
    648     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    649     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    650 
    651     // We don't support sin/cos/fmod
    652     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    653     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    654     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    655 
    656     // Special cases we handle for FP constants.
    657     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    658     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    659     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    660     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    661     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    662 
    663     if (!TM.Options.UnsafeFPMath) {
    664       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    665       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    666       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    667     }
    668   } else if (!TM.Options.UseSoftFloat) {
    669     // f32 and f64 in x87.
    670     // Set up the FP register classes.
    671     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    672     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
    673 
    674     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    675     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    676     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    677     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    678 
    679     if (!TM.Options.UnsafeFPMath) {
    680       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    681       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    682       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    683       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    684       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    685       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    686     }
    687     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    688     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    689     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    690     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    691     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    692     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    693     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    694     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    695   }
    696 
    697   // We don't support FMA.
    698   setOperationAction(ISD::FMA, MVT::f64, Expand);
    699   setOperationAction(ISD::FMA, MVT::f32, Expand);
    700 
    701   // Long double always uses X87.
    702   if (!TM.Options.UseSoftFloat) {
    703     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    704     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    705     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    706     {
    707       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    708       addLegalFPImmediate(TmpFlt);  // FLD0
    709       TmpFlt.changeSign();
    710       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
    711 
    712       bool ignored;
    713       APFloat TmpFlt2(+1.0);
    714       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
    715                       &ignored);
    716       addLegalFPImmediate(TmpFlt2);  // FLD1
    717       TmpFlt2.changeSign();
    718       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    719     }
    720 
    721     if (!TM.Options.UnsafeFPMath) {
    722       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
    723       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
    724       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    725     }
    726 
    727     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    728     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    729     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    730     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    731     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    732     setOperationAction(ISD::FMA, MVT::f80, Expand);
    733   }
    734 
    735   // Always use a library call for pow.
    736   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
    737   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    738   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
    739 
    740   setOperationAction(ISD::FLOG, MVT::f80, Expand);
    741   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
    742   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    743   setOperationAction(ISD::FEXP, MVT::f80, Expand);
    744   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
    745 
    746   // First set operation action for all vector types to either promote
    747   // (for widening) or expand (for scalarization). Then we will selectively
    748   // turn on ones that can be effectively codegen'd.
    749   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
    750            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    751     MVT VT = (MVT::SimpleValueType)i;
    752     setOperationAction(ISD::ADD , VT, Expand);
    753     setOperationAction(ISD::SUB , VT, Expand);
    754     setOperationAction(ISD::FADD, VT, Expand);
    755     setOperationAction(ISD::FNEG, VT, Expand);
    756     setOperationAction(ISD::FSUB, VT, Expand);
    757     setOperationAction(ISD::MUL , VT, Expand);
    758     setOperationAction(ISD::FMUL, VT, Expand);
    759     setOperationAction(ISD::SDIV, VT, Expand);
    760     setOperationAction(ISD::UDIV, VT, Expand);
    761     setOperationAction(ISD::FDIV, VT, Expand);
    762     setOperationAction(ISD::SREM, VT, Expand);
    763     setOperationAction(ISD::UREM, VT, Expand);
    764     setOperationAction(ISD::LOAD, VT, Expand);
    765     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    766     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    767     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    768     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    769     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    770     setOperationAction(ISD::FABS, VT, Expand);
    771     setOperationAction(ISD::FSIN, VT, Expand);
    772     setOperationAction(ISD::FSINCOS, VT, Expand);
    773     setOperationAction(ISD::FCOS, VT, Expand);
    774     setOperationAction(ISD::FSINCOS, VT, Expand);
    775     setOperationAction(ISD::FREM, VT, Expand);
    776     setOperationAction(ISD::FMA,  VT, Expand);
    777     setOperationAction(ISD::FPOWI, VT, Expand);
    778     setOperationAction(ISD::FSQRT, VT, Expand);
    779     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    780     setOperationAction(ISD::FFLOOR, VT, Expand);
    781     setOperationAction(ISD::FCEIL, VT, Expand);
    782     setOperationAction(ISD::FTRUNC, VT, Expand);
    783     setOperationAction(ISD::FRINT, VT, Expand);
    784     setOperationAction(ISD::FNEARBYINT, VT, Expand);
    785     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    786     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    787     setOperationAction(ISD::SDIVREM, VT, Expand);
    788     setOperationAction(ISD::UDIVREM, VT, Expand);
    789     setOperationAction(ISD::FPOW, VT, Expand);
    790     setOperationAction(ISD::CTPOP, VT, Expand);
    791     setOperationAction(ISD::CTTZ, VT, Expand);
    792     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    793     setOperationAction(ISD::CTLZ, VT, Expand);
    794     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    795     setOperationAction(ISD::SHL, VT, Expand);
    796     setOperationAction(ISD::SRA, VT, Expand);
    797     setOperationAction(ISD::SRL, VT, Expand);
    798     setOperationAction(ISD::ROTL, VT, Expand);
    799     setOperationAction(ISD::ROTR, VT, Expand);
    800     setOperationAction(ISD::BSWAP, VT, Expand);
    801     setOperationAction(ISD::SETCC, VT, Expand);
    802     setOperationAction(ISD::FLOG, VT, Expand);
    803     setOperationAction(ISD::FLOG2, VT, Expand);
    804     setOperationAction(ISD::FLOG10, VT, Expand);
    805     setOperationAction(ISD::FEXP, VT, Expand);
    806     setOperationAction(ISD::FEXP2, VT, Expand);
    807     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    808     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    809     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    810     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    811     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    812     setOperationAction(ISD::TRUNCATE, VT, Expand);
    813     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    814     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    815     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    816     setOperationAction(ISD::VSELECT, VT, Expand);
    817     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
    818              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
    819       setTruncStoreAction(VT,
    820                           (MVT::SimpleValueType)InnerVT, Expand);
    821     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    822     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
    823     setLoadExtAction(ISD::EXTLOAD, VT, Expand);
    824   }
    825 
    826   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
    827   // with -msoft-float, disable use of MMX as well.
    828   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    829     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    830     // No operations on x86mmx supported, everything uses intrinsics.
    831   }
    832 
    833   // MMX-sized vectors (other than x86mmx) are expected to be expanded
    834   // into smaller operations.
    835   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
    836   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
    837   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
    838   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
    839   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
    840   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
    841   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
    842   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
    843   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
    844   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
    845   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
    846   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
    847   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
    848   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
    849   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
    850   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
    851   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
    852   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
    853   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
    854   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
    855   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
    856   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
    857   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
    858   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
    859   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
    860   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
    861   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
    862   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
    863   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
    864 
    865   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    866     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
    867 
    868     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    869     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    870     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    871     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    872     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    873     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    874     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    875     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    876     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    877     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    878     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    879     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    880   }
    881 
    882   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    883     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
    884 
    885     // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    886     // registers cannot be used even for integer operations.
    887     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    888     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    889     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    890     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
    891 
    892     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    893     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    894     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    895     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    896     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    897     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    898     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    899     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    900     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    901     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    902     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    903     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    904     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    905     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    906     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    907     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    908     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    909     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    910 
    911     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    912     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    913     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    914     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
    915 
    916     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    917     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    918     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    919     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    920     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    921 
    922     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    923     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
    924       MVT VT = (MVT::SimpleValueType)i;
    925       // Do not attempt to custom lower non-power-of-2 vectors
    926       if (!isPowerOf2_32(VT.getVectorNumElements()))
    927         continue;
    928       // Do not attempt to custom lower non-128-bit vectors
    929       if (!VT.is128BitVector())
    930         continue;
    931       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
    932       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
    933       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    934     }
    935 
    936     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    937     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    938     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    939     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    940     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    941     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
    942 
    943     if (Subtarget->is64Bit()) {
    944       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
    945       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    946     }
    947 
    948     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    949     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
    950       MVT VT = (MVT::SimpleValueType)i;
    951 
    952       // Do not attempt to promote non-128-bit vectors
    953       if (!VT.is128BitVector())
    954         continue;
    955 
    956       setOperationAction(ISD::AND,    VT, Promote);
    957       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
    958       setOperationAction(ISD::OR,     VT, Promote);
    959       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
    960       setOperationAction(ISD::XOR,    VT, Promote);
    961       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
    962       setOperationAction(ISD::LOAD,   VT, Promote);
    963       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
    964       setOperationAction(ISD::SELECT, VT, Promote);
    965       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    966     }
    967 
    968     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    969 
    970     // Custom lower v2i64 and v2f64 selects.
    971     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    972     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    973     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    974     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
    975 
    976     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    977     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    978 
    979     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
    980     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
    981     // As there is no 64-bit GPR available, we need build a special custom
    982     // sequence to convert from v2i32 to v2f32.
    983     if (!Subtarget->is64Bit())
    984       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
    985 
    986     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    987     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
    988 
    989     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
    990   }
    991 
    992   if (Subtarget->hasSSE41()) {
    993     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
    994     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
    995     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
    996     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
    997     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
    998     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
    999     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
   1000     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
   1001     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
   1002     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
   1003 
   1004     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
   1005     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
   1006     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
   1007     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
   1008     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
   1009     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
   1010     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
   1011     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
   1012     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
   1013     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
   1014 
   1015     // FIXME: Do we need to handle scalar-to-vector here?
   1016     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
   1017 
   1018     setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
   1019     setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
   1020     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
   1021     setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
   1022     setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
   1023 
   1024     // i8 and i16 vectors are custom , because the source register and source
   1025     // source memory operand types are not the same width.  f32 vectors are
   1026     // custom since the immediate controlling the insert encodes additional
   1027     // information.
   1028     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
   1029     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
   1030     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
   1031     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
   1032 
   1033     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
   1034     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
   1035     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
   1036     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
   1037 
   1038     // FIXME: these should be Legal but thats only for the case where
   1039     // the index is constant.  For now custom expand to deal with that.
   1040     if (Subtarget->is64Bit()) {
   1041       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
   1042       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
   1043     }
   1044   }
   1045 
   1046   if (Subtarget->hasSSE2()) {
   1047     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
   1048     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
   1049 
   1050     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
   1051     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
   1052 
   1053     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
   1054     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
   1055 
   1056     if (Subtarget->hasInt256()) {
   1057       setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
   1058       setOperationAction(ISD::SRL,             MVT::v4i32, Legal);
   1059 
   1060       setOperationAction(ISD::SHL,             MVT::v2i64, Legal);
   1061       setOperationAction(ISD::SHL,             MVT::v4i32, Legal);
   1062 
   1063       setOperationAction(ISD::SRA,             MVT::v4i32, Legal);
   1064     } else {
   1065       setOperationAction(ISD::SRL,             MVT::v2i64, Custom);
   1066       setOperationAction(ISD::SRL,             MVT::v4i32, Custom);
   1067 
   1068       setOperationAction(ISD::SHL,             MVT::v2i64, Custom);
   1069       setOperationAction(ISD::SHL,             MVT::v4i32, Custom);
   1070 
   1071       setOperationAction(ISD::SRA,             MVT::v4i32, Custom);
   1072     }
   1073     setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
   1074     setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
   1075   }
   1076 
   1077   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
   1078     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
   1079     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
   1080     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
   1081     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
   1082     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
   1083     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
   1084 
   1085     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
   1086     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
   1087     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
   1088 
   1089     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
   1090     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
   1091     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
   1092     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
   1093     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
   1094     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
   1095     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
   1096     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
   1097     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
   1098     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
   1099     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
   1100     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
   1101 
   1102     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
   1103     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
   1104     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
   1105     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
   1106     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
   1107     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
   1108     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
   1109     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
   1110     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
   1111     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
   1112     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
   1113     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
   1114 
   1115     setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
   1116     setOperationAction(ISD::TRUNCATE,           MVT::v4i32, Custom);
   1117 
   1118     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
   1119 
   1120     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
   1121     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1122     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
   1123 
   1124     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
   1125     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
   1126     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
   1127 
   1128     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
   1129 
   1130     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
   1131     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
   1132 
   1133     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
   1134     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
   1135 
   1136     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
   1137     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
   1138 
   1139     setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
   1140 
   1141     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
   1142     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
   1143     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
   1144     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
   1145 
   1146     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1147     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1148     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1149 
   1150     setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
   1151     setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
   1152     setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
   1153     setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
   1154 
   1155     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
   1156     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
   1157     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
   1158     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
   1159     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
   1160     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
   1161 
   1162     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
   1163       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
   1164       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
   1165       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
   1166       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
   1167       setOperationAction(ISD::FMA,             MVT::f32, Legal);
   1168       setOperationAction(ISD::FMA,             MVT::f64, Legal);
   1169     }
   1170 
   1171     if (Subtarget->hasInt256()) {
   1172       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
   1173       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
   1174       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
   1175       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
   1176 
   1177       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
   1178       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
   1179       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
   1180       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
   1181 
   1182       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1183       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
   1184       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
   1185       // Don't lower v32i8 because there is no 128-bit byte mul
   1186 
   1187       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1188 
   1189       setOperationAction(ISD::SRL,             MVT::v4i64, Legal);
   1190       setOperationAction(ISD::SRL,             MVT::v8i32, Legal);
   1191 
   1192       setOperationAction(ISD::SHL,             MVT::v4i64, Legal);
   1193       setOperationAction(ISD::SHL,             MVT::v8i32, Legal);
   1194 
   1195       setOperationAction(ISD::SRA,             MVT::v8i32, Legal);
   1196 
   1197       setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
   1198     } else {
   1199       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
   1200       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
   1201       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
   1202       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
   1203 
   1204       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
   1205       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
   1206       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
   1207       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
   1208 
   1209       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1210       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
   1211       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
   1212       // Don't lower v32i8 because there is no 128-bit byte mul
   1213 
   1214       setOperationAction(ISD::SRL,             MVT::v4i64, Custom);
   1215       setOperationAction(ISD::SRL,             MVT::v8i32, Custom);
   1216 
   1217       setOperationAction(ISD::SHL,             MVT::v4i64, Custom);
   1218       setOperationAction(ISD::SHL,             MVT::v8i32, Custom);
   1219 
   1220       setOperationAction(ISD::SRA,             MVT::v8i32, Custom);
   1221     }
   1222 
   1223     // Custom lower several nodes for 256-bit types.
   1224     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
   1225              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1226       MVT VT = (MVT::SimpleValueType)i;
   1227 
   1228       // Extract subvector is special because the value type
   1229       // (result) is 128-bit but the source is 256-bit wide.
   1230       if (VT.is128BitVector())
   1231         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1232 
   1233       // Do not attempt to custom lower other non-256-bit vectors
   1234       if (!VT.is256BitVector())
   1235         continue;
   1236 
   1237       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1238       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1239       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1240       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1241       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
   1242       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
   1243       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
   1244     }
   1245 
   1246     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
   1247     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
   1248       MVT VT = (MVT::SimpleValueType)i;
   1249 
   1250       // Do not attempt to promote non-256-bit vectors
   1251       if (!VT.is256BitVector())
   1252         continue;
   1253 
   1254       setOperationAction(ISD::AND,    VT, Promote);
   1255       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
   1256       setOperationAction(ISD::OR,     VT, Promote);
   1257       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
   1258       setOperationAction(ISD::XOR,    VT, Promote);
   1259       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
   1260       setOperationAction(ISD::LOAD,   VT, Promote);
   1261       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
   1262       setOperationAction(ISD::SELECT, VT, Promote);
   1263       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
   1264     }
   1265   }
   1266 
   1267   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
   1268   // of this type with custom code.
   1269   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
   1270            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
   1271     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
   1272                        Custom);
   1273   }
   1274 
   1275   // We want to custom lower some of our intrinsics.
   1276   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1277   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   1278 
   1279   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1280   // handle type legalization for these operations here.
   1281   //
   1282   // FIXME: We really should do custom legalization for addition and
   1283   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1284   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1285   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
   1286     // Add/Sub/Mul with overflow operations are custom lowered.
   1287     MVT VT = IntVTs[i];
   1288     setOperationAction(ISD::SADDO, VT, Custom);
   1289     setOperationAction(ISD::UADDO, VT, Custom);
   1290     setOperationAction(ISD::SSUBO, VT, Custom);
   1291     setOperationAction(ISD::USUBO, VT, Custom);
   1292     setOperationAction(ISD::SMULO, VT, Custom);
   1293     setOperationAction(ISD::UMULO, VT, Custom);
   1294   }
   1295 
   1296   // There are no 8-bit 3-address imul/mul instructions
   1297   setOperationAction(ISD::SMULO, MVT::i8, Expand);
   1298   setOperationAction(ISD::UMULO, MVT::i8, Expand);
   1299 
   1300   if (!Subtarget->is64Bit()) {
   1301     // These libcalls are not available in 32-bit.
   1302     setLibcallName(RTLIB::SHL_I128, 0);
   1303     setLibcallName(RTLIB::SRL_I128, 0);
   1304     setLibcallName(RTLIB::SRA_I128, 0);
   1305   }
   1306 
   1307   // Combine sin / cos into one node or libcall if possible.
   1308   if (Subtarget->hasSinCos()) {
   1309     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
   1310     setLibcallName(RTLIB::SINCOS_F64, "sincos");
   1311     if (Subtarget->isTargetDarwin()) {
   1312       // For MacOSX, we don't want to the normal expansion of a libcall to
   1313       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
   1314       // traffic.
   1315       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
   1316       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   1317     }
   1318   }
   1319 
   1320   // We have target-specific dag combine patterns for the following nodes:
   1321   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1322   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1323   setTargetDAGCombine(ISD::VSELECT);
   1324   setTargetDAGCombine(ISD::SELECT);
   1325   setTargetDAGCombine(ISD::SHL);
   1326   setTargetDAGCombine(ISD::SRA);
   1327   setTargetDAGCombine(ISD::SRL);
   1328   setTargetDAGCombine(ISD::OR);
   1329   setTargetDAGCombine(ISD::AND);
   1330   setTargetDAGCombine(ISD::ADD);
   1331   setTargetDAGCombine(ISD::FADD);
   1332   setTargetDAGCombine(ISD::FSUB);
   1333   setTargetDAGCombine(ISD::FMA);
   1334   setTargetDAGCombine(ISD::SUB);
   1335   setTargetDAGCombine(ISD::LOAD);
   1336   setTargetDAGCombine(ISD::STORE);
   1337   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1338   setTargetDAGCombine(ISD::ANY_EXTEND);
   1339   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1340   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   1341   setTargetDAGCombine(ISD::TRUNCATE);
   1342   setTargetDAGCombine(ISD::SINT_TO_FP);
   1343   setTargetDAGCombine(ISD::SETCC);
   1344   if (Subtarget->is64Bit())
   1345     setTargetDAGCombine(ISD::MUL);
   1346   setTargetDAGCombine(ISD::XOR);
   1347 
   1348   computeRegisterProperties();
   1349 
   1350   // On Darwin, -Os means optimize for size without hurting performance,
   1351   // do not reduce the limit.
   1352   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1353   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
   1354   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1355   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1356   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1357   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1358   setPrefLoopAlignment(4); // 2^4 bytes.
   1359   BenefitFromCodePlacementOpt = true;
   1360 
   1361   // Predictable cmov don't hurt on atom because it's in-order.
   1362   PredictableSelectIsExpensive = !Subtarget->isAtom();
   1363 
   1364   setPrefFunctionAlignment(4); // 2^4 bytes.
   1365 }
   1366 
   1367 EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
   1368   if (!VT.isVector()) return MVT::i8;
   1369   return VT.changeVectorElementTypeToInteger();
   1370 }
   1371 
   1372 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
   1373 /// the desired ByVal argument alignment.
   1374 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1375   if (MaxAlign == 16)
   1376     return;
   1377   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1378     if (VTy->getBitWidth() == 128)
   1379       MaxAlign = 16;
   1380   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1381     unsigned EltAlign = 0;
   1382     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1383     if (EltAlign > MaxAlign)
   1384       MaxAlign = EltAlign;
   1385   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1386     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
   1387       unsigned EltAlign = 0;
   1388       getMaxByValAlign(STy->getElementType(i), EltAlign);
   1389       if (EltAlign > MaxAlign)
   1390         MaxAlign = EltAlign;
   1391       if (MaxAlign == 16)
   1392         break;
   1393     }
   1394   }
   1395 }
   1396 
   1397 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
   1398 /// function arguments in the caller parameter area. For X86, aggregates
   1399 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1400 /// are at 4-byte boundaries.
   1401 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   1402   if (Subtarget->is64Bit()) {
   1403     // Max of 8 and alignment of type.
   1404     unsigned TyAlign = TD->getABITypeAlignment(Ty);
   1405     if (TyAlign > 8)
   1406       return TyAlign;
   1407     return 8;
   1408   }
   1409 
   1410   unsigned Align = 4;
   1411   if (Subtarget->hasSSE1())
   1412     getMaxByValAlign(Ty, Align);
   1413   return Align;
   1414 }
   1415 
   1416 /// getOptimalMemOpType - Returns the target specific optimal type for load
   1417 /// and store operations as a result of memset, memcpy, and memmove
   1418 /// lowering. If DstAlign is zero that means it's safe to destination
   1419 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
   1420 /// means there isn't a need to check it against alignment requirement,
   1421 /// probably because the source does not need to be loaded. If 'IsMemset' is
   1422 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
   1423 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
   1424 /// source is constant so it does not need to be loaded.
   1425 /// It returns EVT::Other if the type should be determined using generic
   1426 /// target-independent logic.
   1427 EVT
   1428 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1429                                        unsigned DstAlign, unsigned SrcAlign,
   1430                                        bool IsMemset, bool ZeroMemset,
   1431                                        bool MemcpyStrSrc,
   1432                                        MachineFunction &MF) const {
   1433   const Function *F = MF.getFunction();
   1434   if ((!IsMemset || ZeroMemset) &&
   1435       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
   1436                                        Attribute::NoImplicitFloat)) {
   1437     if (Size >= 16 &&
   1438         (Subtarget->isUnalignedMemAccessFast() ||
   1439          ((DstAlign == 0 || DstAlign >= 16) &&
   1440           (SrcAlign == 0 || SrcAlign >= 16)))) {
   1441       if (Size >= 32) {
   1442         if (Subtarget->hasInt256())
   1443           return MVT::v8i32;
   1444         if (Subtarget->hasFp256())
   1445           return MVT::v8f32;
   1446       }
   1447       if (Subtarget->hasSSE2())
   1448         return MVT::v4i32;
   1449       if (Subtarget->hasSSE1())
   1450         return MVT::v4f32;
   1451     } else if (!MemcpyStrSrc && Size >= 8 &&
   1452                !Subtarget->is64Bit() &&
   1453                Subtarget->hasSSE2()) {
   1454       // Do not use f64 to lower memcpy if source is string constant. It's
   1455       // better to use i32 to avoid the loads.
   1456       return MVT::f64;
   1457     }
   1458   }
   1459   if (Subtarget->is64Bit() && Size >= 8)
   1460     return MVT::i64;
   1461   return MVT::i32;
   1462 }
   1463 
   1464 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   1465   if (VT == MVT::f32)
   1466     return X86ScalarSSEf32;
   1467   else if (VT == MVT::f64)
   1468     return X86ScalarSSEf64;
   1469   return true;
   1470 }
   1471 
   1472 bool
   1473 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
   1474   if (Fast)
   1475     *Fast = Subtarget->isUnalignedMemAccessFast();
   1476   return true;
   1477 }
   1478 
   1479 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
   1480 /// current function.  The returned value is a member of the
   1481 /// MachineJumpTableInfo::JTEntryKind enum.
   1482 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1483   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1484   // symbol.
   1485   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1486       Subtarget->isPICStyleGOT())
   1487     return MachineJumpTableInfo::EK_Custom32;
   1488 
   1489   // Otherwise, use the normal jump table encoding heuristics.
   1490   return TargetLowering::getJumpTableEncoding();
   1491 }
   1492 
   1493 const MCExpr *
   1494 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   1495                                              const MachineBasicBlock *MBB,
   1496                                              unsigned uid,MCContext &Ctx) const{
   1497   assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1498          Subtarget->isPICStyleGOT());
   1499   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   1500   // entries.
   1501   return MCSymbolRefExpr::Create(MBB->getSymbol(),
   1502                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   1503 }
   1504 
   1505 /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
   1506 /// jumptable.
   1507 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1508                                                     SelectionDAG &DAG) const {
   1509   if (!Subtarget->is64Bit())
   1510     // This doesn't have DebugLoc associated with it, but is not really the
   1511     // same as a Register.
   1512     return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
   1513   return Table;
   1514 }
   1515 
   1516 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
   1517 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
   1518 /// MCExpr.
   1519 const MCExpr *X86TargetLowering::
   1520 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   1521                              MCContext &Ctx) const {
   1522   // X86-64 uses RIP relative addressing based on the jump table label.
   1523   if (Subtarget->isPICStyleRIPRel())
   1524     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   1525 
   1526   // Otherwise, the reference is relative to the PIC base.
   1527   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
   1528 }
   1529 
   1530 // FIXME: Why this routine is here? Move to RegInfo!
   1531 std::pair<const TargetRegisterClass*, uint8_t>
   1532 X86TargetLowering::findRepresentativeClass(MVT VT) const{
   1533   const TargetRegisterClass *RRC = 0;
   1534   uint8_t Cost = 1;
   1535   switch (VT.SimpleTy) {
   1536   default:
   1537     return TargetLowering::findRepresentativeClass(VT);
   1538   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   1539     RRC = Subtarget->is64Bit() ?
   1540       (const TargetRegisterClass*)&X86::GR64RegClass :
   1541       (const TargetRegisterClass*)&X86::GR32RegClass;
   1542     break;
   1543   case MVT::x86mmx:
   1544     RRC = &X86::VR64RegClass;
   1545     break;
   1546   case MVT::f32: case MVT::f64:
   1547   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   1548   case MVT::v4f32: case MVT::v2f64:
   1549   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   1550   case MVT::v4f64:
   1551     RRC = &X86::VR128RegClass;
   1552     break;
   1553   }
   1554   return std::make_pair(RRC, Cost);
   1555 }
   1556 
   1557 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   1558                                                unsigned &Offset) const {
   1559   if (!Subtarget->isTargetLinux())
   1560     return false;
   1561 
   1562   if (Subtarget->is64Bit()) {
   1563     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
   1564     Offset = 0x28;
   1565     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   1566       AddressSpace = 256;
   1567     else
   1568       AddressSpace = 257;
   1569   } else {
   1570     // %gs:0x14 on i386
   1571     Offset = 0x14;
   1572     AddressSpace = 256;
   1573   }
   1574   return true;
   1575 }
   1576 
   1577 //===----------------------------------------------------------------------===//
   1578 //               Return Value Calling Convention Implementation
   1579 //===----------------------------------------------------------------------===//
   1580 
   1581 #include "X86GenCallingConv.inc"
   1582 
   1583 bool
   1584 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   1585                                   MachineFunction &MF, bool isVarArg,
   1586                         const SmallVectorImpl<ISD::OutputArg> &Outs,
   1587                         LLVMContext &Context) const {
   1588   SmallVector<CCValAssign, 16> RVLocs;
   1589   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1590                  RVLocs, Context);
   1591   return CCInfo.CheckReturn(Outs, RetCC_X86);
   1592 }
   1593 
   1594 SDValue
   1595 X86TargetLowering::LowerReturn(SDValue Chain,
   1596                                CallingConv::ID CallConv, bool isVarArg,
   1597                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1598                                const SmallVectorImpl<SDValue> &OutVals,
   1599                                DebugLoc dl, SelectionDAG &DAG) const {
   1600   MachineFunction &MF = DAG.getMachineFunction();
   1601   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1602 
   1603   SmallVector<CCValAssign, 16> RVLocs;
   1604   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1605                  RVLocs, *DAG.getContext());
   1606   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   1607 
   1608   SDValue Flag;
   1609   SmallVector<SDValue, 6> RetOps;
   1610   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   1611   // Operand #1 = Bytes To Pop
   1612   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
   1613                    MVT::i16));
   1614 
   1615   // Copy the result values into the output registers.
   1616   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1617     CCValAssign &VA = RVLocs[i];
   1618     assert(VA.isRegLoc() && "Can only return in registers!");
   1619     SDValue ValToCopy = OutVals[i];
   1620     EVT ValVT = ValToCopy.getValueType();
   1621 
   1622     // Promote values to the appropriate types
   1623     if (VA.getLocInfo() == CCValAssign::SExt)
   1624       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1625     else if (VA.getLocInfo() == CCValAssign::ZExt)
   1626       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1627     else if (VA.getLocInfo() == CCValAssign::AExt)
   1628       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1629     else if (VA.getLocInfo() == CCValAssign::BCvt)
   1630       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
   1631 
   1632     // If this is x86-64, and we disabled SSE, we can't return FP values,
   1633     // or SSE or MMX vectors.
   1634     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   1635          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   1636           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
   1637       report_fatal_error("SSE register return with SSE disabled");
   1638     }
   1639     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   1640     // llvm-gcc has never done it right and no one has noticed, so this
   1641     // should be OK for now.
   1642     if (ValVT == MVT::f64 &&
   1643         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
   1644       report_fatal_error("SSE2 register return with SSE2 disabled");
   1645 
   1646     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   1647     // the RET instruction and handled by the FP Stackifier.
   1648     if (VA.getLocReg() == X86::ST0 ||
   1649         VA.getLocReg() == X86::ST1) {
   1650       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   1651       // change the value to the FP stack register class.
   1652       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   1653         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   1654       RetOps.push_back(ValToCopy);
   1655       // Don't emit a copytoreg.
   1656       continue;
   1657     }
   1658 
   1659     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   1660     // which is returned in RAX / RDX.
   1661     if (Subtarget->is64Bit()) {
   1662       if (ValVT == MVT::x86mmx) {
   1663         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   1664           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
   1665           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   1666                                   ValToCopy);
   1667           // If we don't have SSE2 available, convert to v4f32 so the generated
   1668           // register is legal.
   1669           if (!Subtarget->hasSSE2())
   1670             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
   1671         }
   1672       }
   1673     }
   1674 
   1675     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   1676     Flag = Chain.getValue(1);
   1677     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   1678   }
   1679 
   1680   // The x86-64 ABIs require that for returning structs by value we copy
   1681   // the sret argument into %rax/%eax (depending on ABI) for the return.
   1682   // We saved the argument into a virtual register in the entry block,
   1683   // so now we copy the value out and into %rax/%eax.
   1684   if (Subtarget->is64Bit() &&
   1685       DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
   1686     MachineFunction &MF = DAG.getMachineFunction();
   1687     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1688     unsigned Reg = FuncInfo->getSRetReturnReg();
   1689     assert(Reg &&
   1690            "SRetReturnReg should have been set in LowerFormalArguments().");
   1691     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
   1692 
   1693     unsigned RetValReg = Subtarget->isTarget64BitILP32() ? X86::EAX : X86::RAX;
   1694     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
   1695     Flag = Chain.getValue(1);
   1696 
   1697     // RAX/EAX now acts like a return value.
   1698     RetOps.push_back(DAG.getRegister(RetValReg, MVT::i64));
   1699   }
   1700 
   1701   RetOps[0] = Chain;  // Update chain.
   1702 
   1703   // Add the flag if we have it.
   1704   if (Flag.getNode())
   1705     RetOps.push_back(Flag);
   1706 
   1707   return DAG.getNode(X86ISD::RET_FLAG, dl,
   1708                      MVT::Other, &RetOps[0], RetOps.size());
   1709 }
   1710 
   1711 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   1712   if (N->getNumValues() != 1)
   1713     return false;
   1714   if (!N->hasNUsesOfValue(1, 0))
   1715     return false;
   1716 
   1717   SDValue TCChain = Chain;
   1718   SDNode *Copy = *N->use_begin();
   1719   if (Copy->getOpcode() == ISD::CopyToReg) {
   1720     // If the copy has a glue operand, we conservatively assume it isn't safe to
   1721     // perform a tail call.
   1722     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   1723       return false;
   1724     TCChain = Copy->getOperand(0);
   1725   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   1726     return false;
   1727 
   1728   bool HasRet = false;
   1729   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   1730        UI != UE; ++UI) {
   1731     if (UI->getOpcode() != X86ISD::RET_FLAG)
   1732       return false;
   1733     HasRet = true;
   1734   }
   1735 
   1736   if (!HasRet)
   1737     return false;
   1738 
   1739   Chain = TCChain;
   1740   return true;
   1741 }
   1742 
   1743 MVT
   1744 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
   1745                                             ISD::NodeType ExtendKind) const {
   1746   MVT ReturnMVT;
   1747   // TODO: Is this also valid on 32-bit?
   1748   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
   1749     ReturnMVT = MVT::i8;
   1750   else
   1751     ReturnMVT = MVT::i32;
   1752 
   1753   MVT MinVT = getRegisterType(ReturnMVT);
   1754   return VT.bitsLT(MinVT) ? MinVT : VT;
   1755 }
   1756 
   1757 /// LowerCallResult - Lower the result values of a call into the
   1758 /// appropriate copies out of appropriate physical registers.
   1759 ///
   1760 SDValue
   1761 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   1762                                    CallingConv::ID CallConv, bool isVarArg,
   1763                                    const SmallVectorImpl<ISD::InputArg> &Ins,
   1764                                    DebugLoc dl, SelectionDAG &DAG,
   1765                                    SmallVectorImpl<SDValue> &InVals) const {
   1766 
   1767   // Assign locations to each value returned by this call.
   1768   SmallVector<CCValAssign, 16> RVLocs;
   1769   bool Is64Bit = Subtarget->is64Bit();
   1770   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1771                  getTargetMachine(), RVLocs, *DAG.getContext());
   1772   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   1773 
   1774   // Copy all of the result registers out of their specified physreg.
   1775   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   1776     CCValAssign &VA = RVLocs[i];
   1777     EVT CopyVT = VA.getValVT();
   1778 
   1779     // If this is x86-64, and we disabled SSE, we can't return FP values
   1780     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
   1781         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
   1782       report_fatal_error("SSE register return with SSE disabled");
   1783     }
   1784 
   1785     SDValue Val;
   1786 
   1787     // If this is a call to a function that returns an fp value on the floating
   1788     // point stack, we must guarantee the value is popped from the stack, so
   1789     // a CopyFromReg is not good enough - the copy instruction may be eliminated
   1790     // if the return value is not used. We use the FpPOP_RETVAL instruction
   1791     // instead.
   1792     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
   1793       // If we prefer to use the value in xmm registers, copy it out as f80 and
   1794       // use a truncate to move it from fp stack reg to xmm reg.
   1795       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
   1796       SDValue Ops[] = { Chain, InFlag };
   1797       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
   1798                                          MVT::Other, MVT::Glue, Ops, 2), 1);
   1799       Val = Chain.getValue(0);
   1800 
   1801       // Round the f80 to the right size, which also moves it to the appropriate
   1802       // xmm register.
   1803       if (CopyVT != VA.getValVT())
   1804         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   1805                           // This truncation won't change the value.
   1806                           DAG.getIntPtrConstant(1));
   1807     } else {
   1808       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   1809                                  CopyVT, InFlag).getValue(1);
   1810       Val = Chain.getValue(0);
   1811     }
   1812     InFlag = Chain.getValue(2);
   1813     InVals.push_back(Val);
   1814   }
   1815 
   1816   return Chain;
   1817 }
   1818 
   1819 //===----------------------------------------------------------------------===//
   1820 //                C & StdCall & Fast Calling Convention implementation
   1821 //===----------------------------------------------------------------------===//
   1822 //  StdCall calling convention seems to be standard for many Windows' API
   1823 //  routines and around. It differs from C calling convention just a little:
   1824 //  callee should clean up the stack, not caller. Symbols should be also
   1825 //  decorated in some fancy way :) It doesn't support any vector arguments.
   1826 //  For info on fast calling convention see Fast Calling Convention (tail call)
   1827 //  implementation LowerX86_32FastCCCallTo.
   1828 
   1829 /// CallIsStructReturn - Determines whether a call uses struct return
   1830 /// semantics.
   1831 enum StructReturnType {
   1832   NotStructReturn,
   1833   RegStructReturn,
   1834   StackStructReturn
   1835 };
   1836 static StructReturnType
   1837 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   1838   if (Outs.empty())
   1839     return NotStructReturn;
   1840 
   1841   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   1842   if (!Flags.isSRet())
   1843     return NotStructReturn;
   1844   if (Flags.isInReg())
   1845     return RegStructReturn;
   1846   return StackStructReturn;
   1847 }
   1848 
   1849 /// ArgsAreStructReturn - Determines whether a function uses struct
   1850 /// return semantics.
   1851 static StructReturnType
   1852 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   1853   if (Ins.empty())
   1854     return NotStructReturn;
   1855 
   1856   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   1857   if (!Flags.isSRet())
   1858     return NotStructReturn;
   1859   if (Flags.isInReg())
   1860     return RegStructReturn;
   1861   return StackStructReturn;
   1862 }
   1863 
   1864 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
   1865 /// by "Src" to address "Dst" with size and alignment information specified by
   1866 /// the specific parameter attribute. The copy will be passed as a byval
   1867 /// function parameter.
   1868 static SDValue
   1869 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   1870                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
   1871                           DebugLoc dl) {
   1872   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   1873 
   1874   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   1875                        /*isVolatile*/false, /*AlwaysInline=*/true,
   1876                        MachinePointerInfo(), MachinePointerInfo());
   1877 }
   1878 
   1879 /// IsTailCallConvention - Return true if the calling convention is one that
   1880 /// supports tail call optimization.
   1881 static bool IsTailCallConvention(CallingConv::ID CC) {
   1882   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
   1883           CC == CallingConv::HiPE);
   1884 }
   1885 
   1886 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   1887   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
   1888     return false;
   1889 
   1890   CallSite CS(CI);
   1891   CallingConv::ID CalleeCC = CS.getCallingConv();
   1892   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
   1893     return false;
   1894 
   1895   return true;
   1896 }
   1897 
   1898 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
   1899 /// a tailcall target by changing its ABI.
   1900 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
   1901                                    bool GuaranteedTailCallOpt) {
   1902   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
   1903 }
   1904 
   1905 SDValue
   1906 X86TargetLowering::LowerMemArgument(SDValue Chain,
   1907                                     CallingConv::ID CallConv,
   1908                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   1909                                     DebugLoc dl, SelectionDAG &DAG,
   1910                                     const CCValAssign &VA,
   1911                                     MachineFrameInfo *MFI,
   1912                                     unsigned i) const {
   1913   // Create the nodes corresponding to a load from this parameter slot.
   1914   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   1915   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
   1916                               getTargetMachine().Options.GuaranteedTailCallOpt);
   1917   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   1918   EVT ValVT;
   1919 
   1920   // If value is passed by pointer we have address passed instead of the value
   1921   // itself.
   1922   if (VA.getLocInfo() == CCValAssign::Indirect)
   1923     ValVT = VA.getLocVT();
   1924   else
   1925     ValVT = VA.getValVT();
   1926 
   1927   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   1928   // changed with more analysis.
   1929   // In case of tail call optimization mark all arguments mutable. Since they
   1930   // could be overwritten by lowering of arguments in case of a tail call.
   1931   if (Flags.isByVal()) {
   1932     unsigned Bytes = Flags.getByValSize();
   1933     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   1934     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   1935     return DAG.getFrameIndex(FI, getPointerTy());
   1936   } else {
   1937     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   1938                                     VA.getLocMemOffset(), isImmutable);
   1939     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
   1940     return DAG.getLoad(ValVT, dl, Chain, FIN,
   1941                        MachinePointerInfo::getFixedStack(FI),
   1942                        false, false, false, 0);
   1943   }
   1944 }
   1945 
   1946 SDValue
   1947 X86TargetLowering::LowerFormalArguments(SDValue Chain,
   1948                                         CallingConv::ID CallConv,
   1949                                         bool isVarArg,
   1950                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   1951                                         DebugLoc dl,
   1952                                         SelectionDAG &DAG,
   1953                                         SmallVectorImpl<SDValue> &InVals)
   1954                                           const {
   1955   MachineFunction &MF = DAG.getMachineFunction();
   1956   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1957 
   1958   const Function* Fn = MF.getFunction();
   1959   if (Fn->hasExternalLinkage() &&
   1960       Subtarget->isTargetCygMing() &&
   1961       Fn->getName() == "main")
   1962     FuncInfo->setForceFramePointer(true);
   1963 
   1964   MachineFrameInfo *MFI = MF.getFrameInfo();
   1965   bool Is64Bit = Subtarget->is64Bit();
   1966   bool IsWindows = Subtarget->isTargetWindows();
   1967   bool IsWin64 = Subtarget->isTargetWin64();
   1968 
   1969   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   1970          "Var args not supported with calling convention fastcc, ghc or hipe");
   1971 
   1972   // Assign locations to all of the incoming arguments.
   1973   SmallVector<CCValAssign, 16> ArgLocs;
   1974   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   1975                  ArgLocs, *DAG.getContext());
   1976 
   1977   // Allocate shadow area for Win64
   1978   if (IsWin64) {
   1979     CCInfo.AllocateStack(32, 8);
   1980   }
   1981 
   1982   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   1983 
   1984   unsigned LastVal = ~0U;
   1985   SDValue ArgValue;
   1986   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   1987     CCValAssign &VA = ArgLocs[i];
   1988     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   1989     // places.
   1990     assert(VA.getValNo() != LastVal &&
   1991            "Don't support value assigned to multiple locs yet");
   1992     (void)LastVal;
   1993     LastVal = VA.getValNo();
   1994 
   1995     if (VA.isRegLoc()) {
   1996       EVT RegVT = VA.getLocVT();
   1997       const TargetRegisterClass *RC;
   1998       if (RegVT == MVT::i32)
   1999         RC = &X86::GR32RegClass;
   2000       else if (Is64Bit && RegVT == MVT::i64)
   2001         RC = &X86::GR64RegClass;
   2002       else if (RegVT == MVT::f32)
   2003         RC = &X86::FR32RegClass;
   2004       else if (RegVT == MVT::f64)
   2005         RC = &X86::FR64RegClass;
   2006       else if (RegVT.is256BitVector())
   2007         RC = &X86::VR256RegClass;
   2008       else if (RegVT.is128BitVector())
   2009         RC = &X86::VR128RegClass;
   2010       else if (RegVT == MVT::x86mmx)
   2011         RC = &X86::VR64RegClass;
   2012       else
   2013         llvm_unreachable("Unknown argument type!");
   2014 
   2015       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   2016       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   2017 
   2018       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   2019       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   2020       // right size.
   2021       if (VA.getLocInfo() == CCValAssign::SExt)
   2022         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   2023                                DAG.getValueType(VA.getValVT()));
   2024       else if (VA.getLocInfo() == CCValAssign::ZExt)
   2025         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   2026                                DAG.getValueType(VA.getValVT()));
   2027       else if (VA.getLocInfo() == CCValAssign::BCvt)
   2028         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
   2029 
   2030       if (VA.isExtInLoc()) {
   2031         // Handle MMX values passed in XMM regs.
   2032         if (RegVT.isVector())
   2033           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
   2034         else
   2035           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   2036       }
   2037     } else {
   2038       assert(VA.isMemLoc());
   2039       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   2040     }
   2041 
   2042     // If value is passed via pointer - do a load.
   2043     if (VA.getLocInfo() == CCValAssign::Indirect)
   2044       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   2045                              MachinePointerInfo(), false, false, false, 0);
   2046 
   2047     InVals.push_back(ArgValue);
   2048   }
   2049 
   2050   // The x86-64 ABIs require that for returning structs by value we copy
   2051   // the sret argument into %rax/%eax (depending on ABI) for the return.
   2052   // Save the argument into a virtual register so that we can access it
   2053   // from the return points.
   2054   if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
   2055     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2056     unsigned Reg = FuncInfo->getSRetReturnReg();
   2057     if (!Reg) {
   2058       MVT PtrTy = getPointerTy();
   2059       Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
   2060       FuncInfo->setSRetReturnReg(Reg);
   2061     }
   2062     SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
   2063     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   2064   }
   2065 
   2066   unsigned StackSize = CCInfo.getNextStackOffset();
   2067   // Align stack specially for tail calls.
   2068   if (FuncIsMadeTailCallSafe(CallConv,
   2069                              MF.getTarget().Options.GuaranteedTailCallOpt))
   2070     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   2071 
   2072   // If the function takes variable number of arguments, make a frame index for
   2073   // the start of the first vararg value... for expansion of llvm.va_start.
   2074   if (isVarArg) {
   2075     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   2076                     CallConv != CallingConv::X86_ThisCall)) {
   2077       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
   2078     }
   2079     if (Is64Bit) {
   2080       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
   2081 
   2082       // FIXME: We should really autogenerate these arrays
   2083       static const uint16_t GPR64ArgRegsWin64[] = {
   2084         X86::RCX, X86::RDX, X86::R8,  X86::R9
   2085       };
   2086       static const uint16_t GPR64ArgRegs64Bit[] = {
   2087         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   2088       };
   2089       static const uint16_t XMMArgRegs64Bit[] = {
   2090         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2091         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2092       };
   2093       const uint16_t *GPR64ArgRegs;
   2094       unsigned NumXMMRegs = 0;
   2095 
   2096       if (IsWin64) {
   2097         // The XMM registers which might contain var arg parameters are shadowed
   2098         // in their paired GPR.  So we only need to save the GPR to their home
   2099         // slots.
   2100         TotalNumIntRegs = 4;
   2101         GPR64ArgRegs = GPR64ArgRegsWin64;
   2102       } else {
   2103         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
   2104         GPR64ArgRegs = GPR64ArgRegs64Bit;
   2105 
   2106         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
   2107                                                 TotalNumXMMRegs);
   2108       }
   2109       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
   2110                                                        TotalNumIntRegs);
   2111 
   2112       bool NoImplicitFloatOps = Fn->getAttributes().
   2113         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
   2114       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
   2115              "SSE register cannot be used when SSE is disabled!");
   2116       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
   2117                NoImplicitFloatOps) &&
   2118              "SSE register cannot be used when SSE is disabled!");
   2119       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
   2120           !Subtarget->hasSSE1())
   2121         // Kernel mode asks for SSE to be disabled, so don't push them
   2122         // on the stack.
   2123         TotalNumXMMRegs = 0;
   2124 
   2125       if (IsWin64) {
   2126         const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
   2127         // Get to the caller-allocated home save location.  Add 8 to account
   2128         // for the return address.
   2129         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   2130         FuncInfo->setRegSaveFrameIndex(
   2131           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   2132         // Fixup to set vararg frame on shadow area (4 x i64).
   2133         if (NumIntRegs < 4)
   2134           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   2135       } else {
   2136         // For X86-64, if there are vararg parameters that are passed via
   2137         // registers, then we must store them to their spots on the stack so
   2138         // they may be loaded by deferencing the result of va_next.
   2139         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   2140         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
   2141         FuncInfo->setRegSaveFrameIndex(
   2142           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
   2143                                false));
   2144       }
   2145 
   2146       // Store the integer parameter registers.
   2147       SmallVector<SDValue, 8> MemOps;
   2148       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2149                                         getPointerTy());
   2150       unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2151       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
   2152         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
   2153                                   DAG.getIntPtrConstant(Offset));
   2154         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
   2155                                      &X86::GR64RegClass);
   2156         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
   2157         SDValue Store =
   2158           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2159                        MachinePointerInfo::getFixedStack(
   2160                          FuncInfo->getRegSaveFrameIndex(), Offset),
   2161                        false, false, 0);
   2162         MemOps.push_back(Store);
   2163         Offset += 8;
   2164       }
   2165 
   2166       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
   2167         // Now store the XMM (fp + vector) parameter registers.
   2168         SmallVector<SDValue, 11> SaveXMMOps;
   2169         SaveXMMOps.push_back(Chain);
   2170 
   2171         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2172         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
   2173         SaveXMMOps.push_back(ALVal);
   2174 
   2175         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2176                                FuncInfo->getRegSaveFrameIndex()));
   2177         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2178                                FuncInfo->getVarArgsFPOffset()));
   2179 
   2180         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
   2181           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
   2182                                        &X86::VR128RegClass);
   2183           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
   2184           SaveXMMOps.push_back(Val);
   2185         }
   2186         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2187                                      MVT::Other,
   2188                                      &SaveXMMOps[0], SaveXMMOps.size()));
   2189       }
   2190 
   2191       if (!MemOps.empty())
   2192         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2193                             &MemOps[0], MemOps.size());
   2194     }
   2195   }
   2196 
   2197   // Some CCs need callee pop.
   2198   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2199                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2200     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2201   } else {
   2202     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2203     // If this is an sret function, the return should pop the hidden pointer.
   2204     if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
   2205         argsAreStructReturn(Ins) == StackStructReturn)
   2206       FuncInfo->setBytesToPopOnReturn(4);
   2207   }
   2208 
   2209   if (!Is64Bit) {
   2210     // RegSaveFrameIndex is X86-64 only.
   2211     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   2212     if (CallConv == CallingConv::X86_FastCall ||
   2213         CallConv == CallingConv::X86_ThisCall)
   2214       // fastcc functions can't have varargs.
   2215       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2216   }
   2217 
   2218   FuncInfo->setArgumentStackSize(StackSize);
   2219 
   2220   return Chain;
   2221 }
   2222 
   2223 SDValue
   2224 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
   2225                                     SDValue StackPtr, SDValue Arg,
   2226                                     DebugLoc dl, SelectionDAG &DAG,
   2227                                     const CCValAssign &VA,
   2228                                     ISD::ArgFlagsTy Flags) const {
   2229   unsigned LocMemOffset = VA.getLocMemOffset();
   2230   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   2231   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
   2232   if (Flags.isByVal())
   2233     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2234 
   2235   return DAG.getStore(Chain, dl, Arg, PtrOff,
   2236                       MachinePointerInfo::getStack(LocMemOffset),
   2237                       false, false, 0);
   2238 }
   2239 
   2240 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
   2241 /// optimization is performed and it is required.
   2242 SDValue
   2243 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   2244                                            SDValue &OutRetAddr, SDValue Chain,
   2245                                            bool IsTailCall, bool Is64Bit,
   2246                                            int FPDiff, DebugLoc dl) const {
   2247   // Adjust the Return address stack slot.
   2248   EVT VT = getPointerTy();
   2249   OutRetAddr = getReturnAddressFrameIndex(DAG);
   2250 
   2251   // Load the "old" Return address.
   2252   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   2253                            false, false, false, 0);
   2254   return SDValue(OutRetAddr.getNode(), 1);
   2255 }
   2256 
   2257 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
   2258 /// optimization is performed and it is required (FPDiff!=0).
   2259 static SDValue
   2260 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
   2261                          SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
   2262                          unsigned SlotSize, int FPDiff, DebugLoc dl) {
   2263   // Store the return address to the appropriate stack slot.
   2264   if (!FPDiff) return Chain;
   2265   // Calculate the new stack slot for the return address.
   2266   int NewReturnAddrFI =
   2267     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
   2268   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   2269   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   2270                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
   2271                        false, false, 0);
   2272   return Chain;
   2273 }
   2274 
   2275 SDValue
   2276 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   2277                              SmallVectorImpl<SDValue> &InVals) const {
   2278   SelectionDAG &DAG                     = CLI.DAG;
   2279   DebugLoc &dl                          = CLI.DL;
   2280   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
   2281   SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
   2282   SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
   2283   SDValue Chain                         = CLI.Chain;
   2284   SDValue Callee                        = CLI.Callee;
   2285   CallingConv::ID CallConv              = CLI.CallConv;
   2286   bool &isTailCall                      = CLI.IsTailCall;
   2287   bool isVarArg                         = CLI.IsVarArg;
   2288 
   2289   MachineFunction &MF = DAG.getMachineFunction();
   2290   bool Is64Bit        = Subtarget->is64Bit();
   2291   bool IsWin64        = Subtarget->isTargetWin64();
   2292   bool IsWindows      = Subtarget->isTargetWindows();
   2293   StructReturnType SR = callIsStructReturn(Outs);
   2294   bool IsSibcall      = false;
   2295 
   2296   if (MF.getTarget().Options.DisableTailCalls)
   2297     isTailCall = false;
   2298 
   2299   if (isTailCall) {
   2300     // Check if it's really possible to do a tail call.
   2301     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   2302                     isVarArg, SR != NotStructReturn,
   2303                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
   2304                     Outs, OutVals, Ins, DAG);
   2305 
   2306     // Sibcalls are automatically detected tailcalls which do not require
   2307     // ABI changes.
   2308     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   2309       IsSibcall = true;
   2310 
   2311     if (isTailCall)
   2312       ++NumTailCalls;
   2313   }
   2314 
   2315   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2316          "Var args not supported with calling convention fastcc, ghc or hipe");
   2317 
   2318   // Analyze operands of the call, assigning locations to each operand.
   2319   SmallVector<CCValAssign, 16> ArgLocs;
   2320   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
   2321                  ArgLocs, *DAG.getContext());
   2322 
   2323   // Allocate shadow area for Win64
   2324   if (IsWin64) {
   2325     CCInfo.AllocateStack(32, 8);
   2326   }
   2327 
   2328   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2329 
   2330   // Get a count of how many bytes are to be pushed on the stack.
   2331   unsigned NumBytes = CCInfo.getNextStackOffset();
   2332   if (IsSibcall)
   2333     // This is a sibcall. The memory operands are available in caller's
   2334     // own caller's stack.
   2335     NumBytes = 0;
   2336   else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
   2337            IsTailCallConvention(CallConv))
   2338     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   2339 
   2340   int FPDiff = 0;
   2341   if (isTailCall && !IsSibcall) {
   2342     // Lower arguments at fp - stackoffset + fpdiff.
   2343     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   2344     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
   2345 
   2346     FPDiff = NumBytesCallerPushed - NumBytes;
   2347 
   2348     // Set the delta of movement of the returnaddr stackslot.
   2349     // But only set if delta is greater than previous delta.
   2350     if (FPDiff < X86Info->getTCReturnAddrDelta())
   2351       X86Info->setTCReturnAddrDelta(FPDiff);
   2352   }
   2353 
   2354   if (!IsSibcall)
   2355     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
   2356 
   2357   SDValue RetAddrFrIdx;
   2358   // Load return address for tail calls.
   2359   if (isTailCall && FPDiff)
   2360     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   2361                                     Is64Bit, FPDiff, dl);
   2362 
   2363   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2364   SmallVector<SDValue, 8> MemOpChains;
   2365   SDValue StackPtr;
   2366 
   2367   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   2368   // of tail call optimization arguments are handle later.
   2369   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2370     CCValAssign &VA = ArgLocs[i];
   2371     EVT RegVT = VA.getLocVT();
   2372     SDValue Arg = OutVals[i];
   2373     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2374     bool isByVal = Flags.isByVal();
   2375 
   2376     // Promote the value if needed.
   2377     switch (VA.getLocInfo()) {
   2378     default: llvm_unreachable("Unknown loc info!");
   2379     case CCValAssign::Full: break;
   2380     case CCValAssign::SExt:
   2381       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   2382       break;
   2383     case CCValAssign::ZExt:
   2384       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   2385       break;
   2386     case CCValAssign::AExt:
   2387       if (RegVT.is128BitVector()) {
   2388         // Special case: passing MMX values in XMM registers.
   2389         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
   2390         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   2391         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   2392       } else
   2393         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   2394       break;
   2395     case CCValAssign::BCvt:
   2396       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
   2397       break;
   2398     case CCValAssign::Indirect: {
   2399       // Store the argument.
   2400       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   2401       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   2402       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
   2403                            MachinePointerInfo::getFixedStack(FI),
   2404                            false, false, 0);
   2405       Arg = SpillSlot;
   2406       break;
   2407     }
   2408     }
   2409 
   2410     if (VA.isRegLoc()) {
   2411       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   2412       if (isVarArg && IsWin64) {
   2413         // Win64 ABI requires argument XMM reg to be copied to the corresponding
   2414         // shadow reg if callee is a varargs function.
   2415         unsigned ShadowReg = 0;
   2416         switch (VA.getLocReg()) {
   2417         case X86::XMM0: ShadowReg = X86::RCX; break;
   2418         case X86::XMM1: ShadowReg = X86::RDX; break;
   2419         case X86::XMM2: ShadowReg = X86::R8; break;
   2420         case X86::XMM3: ShadowReg = X86::R9; break;
   2421         }
   2422         if (ShadowReg)
   2423           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   2424       }
   2425     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   2426       assert(VA.isMemLoc());
   2427       if (StackPtr.getNode() == 0)
   2428         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   2429                                       getPointerTy());
   2430       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   2431                                              dl, DAG, VA, Flags));
   2432     }
   2433   }
   2434 
   2435   if (!MemOpChains.empty())
   2436     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2437                         &MemOpChains[0], MemOpChains.size());
   2438 
   2439   if (Subtarget->isPICStyleGOT()) {
   2440     // ELF / PIC requires GOT in the EBX register before function calls via PLT
   2441     // GOT pointer.
   2442     if (!isTailCall) {
   2443       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
   2444                DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
   2445     } else {
   2446       // If we are tail calling and generating PIC/GOT style code load the
   2447       // address of the callee into ECX. The value in ecx is used as target of
   2448       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   2449       // for tail calls on PIC/GOT architectures. Normally we would just put the
   2450       // address of GOT into ebx and then call target@PLT. But for tail calls
   2451       // ebx would be restored (since ebx is callee saved) before jumping to the
   2452       // target@PLT.
   2453 
   2454       // Note: The actual moving to ECX is done further down.
   2455       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   2456       if (G && !G->getGlobal()->hasHiddenVisibility() &&
   2457           !G->getGlobal()->hasProtectedVisibility())
   2458         Callee = LowerGlobalAddress(Callee, DAG);
   2459       else if (isa<ExternalSymbolSDNode>(Callee))
   2460         Callee = LowerExternalSymbol(Callee, DAG);
   2461     }
   2462   }
   2463 
   2464   if (Is64Bit && isVarArg && !IsWin64) {
   2465     // From AMD64 ABI document:
   2466     // For calls that may call functions that use varargs or stdargs
   2467     // (prototype-less calls or calls to functions containing ellipsis (...) in
   2468     // the declaration) %al is used as hidden argument to specify the number
   2469     // of SSE registers used. The contents of %al do not need to match exactly
   2470     // the number of registers, but must be an ubound on the number of SSE
   2471     // registers used and is in the range 0 - 8 inclusive.
   2472 
   2473     // Count the number of XMM registers allocated.
   2474     static const uint16_t XMMArgRegs[] = {
   2475       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2476       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2477     };
   2478     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
   2479     assert((Subtarget->hasSSE1() || !NumXMMRegs)
   2480            && "SSE registers cannot be used when SSE is disabled");
   2481 
   2482     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
   2483                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   2484   }
   2485 
   2486   // For tail calls lower the arguments to the 'real' stack slot.
   2487   if (isTailCall) {
   2488     // Force all the incoming stack arguments to be loaded from the stack
   2489     // before any new outgoing arguments are stored to the stack, because the
   2490     // outgoing stack slots may alias the incoming argument stack slots, and
   2491     // the alias isn't otherwise explicit. This is slightly more conservative
   2492     // than necessary, because it means that each store effectively depends
   2493     // on every argument instead of just those arguments it would clobber.
   2494     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   2495 
   2496     SmallVector<SDValue, 8> MemOpChains2;
   2497     SDValue FIN;
   2498     int FI = 0;
   2499     if (getTargetMachine().Options.GuaranteedTailCallOpt) {
   2500       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2501         CCValAssign &VA = ArgLocs[i];
   2502         if (VA.isRegLoc())
   2503           continue;
   2504         assert(VA.isMemLoc());
   2505         SDValue Arg = OutVals[i];
   2506         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2507         // Create frame index.
   2508         int32_t Offset = VA.getLocMemOffset()+FPDiff;
   2509         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   2510         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   2511         FIN = DAG.getFrameIndex(FI, getPointerTy());
   2512 
   2513         if (Flags.isByVal()) {
   2514           // Copy relative to framepointer.
   2515           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
   2516           if (StackPtr.getNode() == 0)
   2517             StackPtr = DAG.getCopyFromReg(Chain, dl,
   2518                                           RegInfo->getStackRegister(),
   2519                                           getPointerTy());
   2520           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
   2521 
   2522           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   2523                                                            ArgChain,
   2524                                                            Flags, DAG, dl));
   2525         } else {
   2526           // Store relative to framepointer.
   2527           MemOpChains2.push_back(
   2528             DAG.getStore(ArgChain, dl, Arg, FIN,
   2529                          MachinePointerInfo::getFixedStack(FI),
   2530                          false, false, 0));
   2531         }
   2532       }
   2533     }
   2534 
   2535     if (!MemOpChains2.empty())
   2536       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   2537                           &MemOpChains2[0], MemOpChains2.size());
   2538 
   2539     // Store the return address to the appropriate stack slot.
   2540     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
   2541                                      getPointerTy(), RegInfo->getSlotSize(),
   2542                                      FPDiff, dl);
   2543   }
   2544 
   2545   // Build a sequence of copy-to-reg nodes chained together with token chain
   2546   // and flag operands which copy the outgoing args into registers.
   2547   SDValue InFlag;
   2548   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2549     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2550                              RegsToPass[i].second, InFlag);
   2551     InFlag = Chain.getValue(1);
   2552   }
   2553 
   2554   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
   2555     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   2556     // In the 64-bit large code model, we have to make all calls
   2557     // through a register, since the call instruction's 32-bit
   2558     // pc-relative offset may not be large enough to hold the whole
   2559     // address.
   2560   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   2561     // If the callee is a GlobalAddress node (quite common, every direct call
   2562     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   2563     // it.
   2564 
   2565     // We should use extra load for direct calls to dllimported functions in
   2566     // non-JIT mode.
   2567     const GlobalValue *GV = G->getGlobal();
   2568     if (!GV->hasDLLImportLinkage()) {
   2569       unsigned char OpFlags = 0;
   2570       bool ExtraLoad = false;
   2571       unsigned WrapperKind = ISD::DELETED_NODE;
   2572 
   2573       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
   2574       // external symbols most go through the PLT in PIC mode.  If the symbol
   2575       // has hidden or protected visibility, or if it is static or local, then
   2576       // we don't need to use the PLT - we can directly call it.
   2577       if (Subtarget->isTargetELF() &&
   2578           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   2579           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
   2580         OpFlags = X86II::MO_PLT;
   2581       } else if (Subtarget->isPICStyleStubAny() &&
   2582                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
   2583                  (!Subtarget->getTargetTriple().isMacOSX() ||
   2584                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2585         // PC-relative references to external symbols should go through $stub,
   2586         // unless we're building with the leopard linker or later, which
   2587         // automatically synthesizes these stubs.
   2588         OpFlags = X86II::MO_DARWIN_STUB;
   2589       } else if (Subtarget->isPICStyleRIPRel() &&
   2590                  isa<Function>(GV) &&
   2591                  cast<Function>(GV)->getAttributes().
   2592                    hasAttribute(AttributeSet::FunctionIndex,
   2593                                 Attribute::NonLazyBind)) {
   2594         // If the function is marked as non-lazy, generate an indirect call
   2595         // which loads from the GOT directly. This avoids runtime overhead
   2596         // at the cost of eager binding (and one extra byte of encoding).
   2597         OpFlags = X86II::MO_GOTPCREL;
   2598         WrapperKind = X86ISD::WrapperRIP;
   2599         ExtraLoad = true;
   2600       }
   2601 
   2602       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
   2603                                           G->getOffset(), OpFlags);
   2604 
   2605       // Add a wrapper if needed.
   2606       if (WrapperKind != ISD::DELETED_NODE)
   2607         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
   2608       // Add extra indirection if needed.
   2609       if (ExtraLoad)
   2610         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
   2611                              MachinePointerInfo::getGOT(),
   2612                              false, false, false, 0);
   2613     }
   2614   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   2615     unsigned char OpFlags = 0;
   2616 
   2617     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
   2618     // external symbols should go through the PLT.
   2619     if (Subtarget->isTargetELF() &&
   2620         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
   2621       OpFlags = X86II::MO_PLT;
   2622     } else if (Subtarget->isPICStyleStubAny() &&
   2623                (!Subtarget->getTargetTriple().isMacOSX() ||
   2624                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2625       // PC-relative references to external symbols should go through $stub,
   2626       // unless we're building with the leopard linker or later, which
   2627       // automatically synthesizes these stubs.
   2628       OpFlags = X86II::MO_DARWIN_STUB;
   2629     }
   2630 
   2631     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
   2632                                          OpFlags);
   2633   }
   2634 
   2635   // Returns a chain & a flag for retval copy to use.
   2636   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   2637   SmallVector<SDValue, 8> Ops;
   2638 
   2639   if (!IsSibcall && isTailCall) {
   2640     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
   2641                            DAG.getIntPtrConstant(0, true), InFlag);
   2642     InFlag = Chain.getValue(1);
   2643   }
   2644 
   2645   Ops.push_back(Chain);
   2646   Ops.push_back(Callee);
   2647 
   2648   if (isTailCall)
   2649     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
   2650 
   2651   // Add argument registers to the end of the list so that they are known live
   2652   // into the call.
   2653   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   2654     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   2655                                   RegsToPass[i].second.getValueType()));
   2656 
   2657   // Add a register mask operand representing the call-preserved registers.
   2658   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   2659   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   2660   assert(Mask && "Missing call preserved mask for calling convention");
   2661   Ops.push_back(DAG.getRegisterMask(Mask));
   2662 
   2663   if (InFlag.getNode())
   2664     Ops.push_back(InFlag);
   2665 
   2666   if (isTailCall) {
   2667     // We used to do:
   2668     //// If this is the first return lowered for this function, add the regs
   2669     //// to the liveout set for the function.
   2670     // This isn't right, although it's probably harmless on x86; liveouts
   2671     // should be computed from returns not tail calls.  Consider a void
   2672     // function making a tail call to a function returning int.
   2673     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
   2674   }
   2675 
   2676   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
   2677   InFlag = Chain.getValue(1);
   2678 
   2679   // Create the CALLSEQ_END node.
   2680   unsigned NumBytesForCalleeToPush;
   2681   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2682                        getTargetMachine().Options.GuaranteedTailCallOpt))
   2683     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
   2684   else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
   2685            SR == StackStructReturn)
   2686     // If this is a call to a struct-return function, the callee
   2687     // pops the hidden struct pointer, so we have to push it back.
   2688     // This is common for Darwin/X86, Linux & Mingw32 targets.
   2689     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   2690     NumBytesForCalleeToPush = 4;
   2691   else
   2692     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
   2693 
   2694   // Returns a flag for retval copy to use.
   2695   if (!IsSibcall) {
   2696     Chain = DAG.getCALLSEQ_END(Chain,
   2697                                DAG.getIntPtrConstant(NumBytes, true),
   2698                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
   2699                                                      true),
   2700                                InFlag);
   2701     InFlag = Chain.getValue(1);
   2702   }
   2703 
   2704   // Handle result values, copying them out of physregs into vregs that we
   2705   // return.
   2706   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   2707                          Ins, dl, DAG, InVals);
   2708 }
   2709 
   2710 //===----------------------------------------------------------------------===//
   2711 //                Fast Calling Convention (tail call) implementation
   2712 //===----------------------------------------------------------------------===//
   2713 
   2714 //  Like std call, callee cleans arguments, convention except that ECX is
   2715 //  reserved for storing the tail called function address. Only 2 registers are
   2716 //  free for argument passing (inreg). Tail call optimization is performed
   2717 //  provided:
   2718 //                * tailcallopt is enabled
   2719 //                * caller/callee are fastcc
   2720 //  On X86_64 architecture with GOT-style position independent code only local
   2721 //  (within module) calls are supported at the moment.
   2722 //  To keep the stack aligned according to platform abi the function
   2723 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
   2724 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
   2725 //  If a tail called function callee has more arguments than the caller the
   2726 //  caller needs to make sure that there is room to move the RETADDR to. This is
   2727 //  achieved by reserving an area the size of the argument delta right after the
   2728 //  original REtADDR, but before the saved framepointer or the spilled registers
   2729 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
   2730 //  stack layout:
   2731 //    arg1
   2732 //    arg2
   2733 //    RETADDR
   2734 //    [ new RETADDR
   2735 //      move area ]
   2736 //    (possible EBP)
   2737 //    ESI
   2738 //    EDI
   2739 //    local1 ..
   2740 
   2741 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
   2742 /// for a 16 byte align requirement.
   2743 unsigned
   2744 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   2745                                                SelectionDAG& DAG) const {
   2746   MachineFunction &MF = DAG.getMachineFunction();
   2747   const TargetMachine &TM = MF.getTarget();
   2748   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   2749   unsigned StackAlignment = TFI.getStackAlignment();
   2750   uint64_t AlignMask = StackAlignment - 1;
   2751   int64_t Offset = StackSize;
   2752   unsigned SlotSize = RegInfo->getSlotSize();
   2753   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
   2754     // Number smaller than 12 so just add the difference.
   2755     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   2756   } else {
   2757     // Mask out lower bits, add stackalignment once plus the 12 bytes.
   2758     Offset = ((~AlignMask) & Offset) + StackAlignment +
   2759       (StackAlignment-SlotSize);
   2760   }
   2761   return Offset;
   2762 }
   2763 
   2764 /// MatchingStackOffset - Return true if the given stack call argument is
   2765 /// already available in the same position (relatively) of the caller's
   2766 /// incoming argument stack.
   2767 static
   2768 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   2769                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   2770                          const X86InstrInfo *TII) {
   2771   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   2772   int FI = INT_MAX;
   2773   if (Arg.getOpcode() == ISD::CopyFromReg) {
   2774     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   2775     if (!TargetRegisterInfo::isVirtualRegister(VR))
   2776       return false;
   2777     MachineInstr *Def = MRI->getVRegDef(VR);
   2778     if (!Def)
   2779       return false;
   2780     if (!Flags.isByVal()) {
   2781       if (!TII->isLoadFromStackSlot(Def, FI))
   2782         return false;
   2783     } else {
   2784       unsigned Opcode = Def->getOpcode();
   2785       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
   2786           Def->getOperand(1).isFI()) {
   2787         FI = Def->getOperand(1).getIndex();
   2788         Bytes = Flags.getByValSize();
   2789       } else
   2790         return false;
   2791     }
   2792   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   2793     if (Flags.isByVal())
   2794       // ByVal argument is passed in as a pointer but it's now being
   2795       // dereferenced. e.g.
   2796       // define @foo(%struct.X* %A) {
   2797       //   tail call @bar(%struct.X* byval %A)
   2798       // }
   2799       return false;
   2800     SDValue Ptr = Ld->getBasePtr();
   2801     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   2802     if (!FINode)
   2803       return false;
   2804     FI = FINode->getIndex();
   2805   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   2806     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   2807     FI = FINode->getIndex();
   2808     Bytes = Flags.getByValSize();
   2809   } else
   2810     return false;
   2811 
   2812   assert(FI != INT_MAX);
   2813   if (!MFI->isFixedObjectIndex(FI))
   2814     return false;
   2815   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
   2816 }
   2817 
   2818 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
   2819 /// for tail call optimization. Targets which want to do tail call
   2820 /// optimization should implement this function.
   2821 bool
   2822 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   2823                                                      CallingConv::ID CalleeCC,
   2824                                                      bool isVarArg,
   2825                                                      bool isCalleeStructRet,
   2826                                                      bool isCallerStructRet,
   2827                                                      Type *RetTy,
   2828                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
   2829                                     const SmallVectorImpl<SDValue> &OutVals,
   2830                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2831                                                      SelectionDAG &DAG) const {
   2832   if (!IsTailCallConvention(CalleeCC) &&
   2833       CalleeCC != CallingConv::C)
   2834     return false;
   2835 
   2836   // If -tailcallopt is specified, make fastcc functions tail-callable.
   2837   const MachineFunction &MF = DAG.getMachineFunction();
   2838   const Function *CallerF = DAG.getMachineFunction().getFunction();
   2839 
   2840   // If the function return type is x86_fp80 and the callee return type is not,
   2841   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   2842   // perform a tailcall optimization here.
   2843   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
   2844     return false;
   2845 
   2846   CallingConv::ID CallerCC = CallerF->getCallingConv();
   2847   bool CCMatch = CallerCC == CalleeCC;
   2848 
   2849   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
   2850     if (IsTailCallConvention(CalleeCC) && CCMatch)
   2851       return true;
   2852     return false;
   2853   }
   2854 
   2855   // Look for obvious safe cases to perform tail call optimization that do not
   2856   // require ABI changes. This is what gcc calls sibcall.
   2857 
   2858   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   2859   // emit a special epilogue.
   2860   if (RegInfo->needsStackRealignment(MF))
   2861     return false;
   2862 
   2863   // Also avoid sibcall optimization if either caller or callee uses struct
   2864   // return semantics.
   2865   if (isCalleeStructRet || isCallerStructRet)
   2866     return false;
   2867 
   2868   // An stdcall caller is expected to clean up its arguments; the callee
   2869   // isn't going to do that.
   2870   if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
   2871     return false;
   2872 
   2873   // Do not sibcall optimize vararg calls unless all arguments are passed via
   2874   // registers.
   2875   if (isVarArg && !Outs.empty()) {
   2876 
   2877     // Optimizing for varargs on Win64 is unlikely to be safe without
   2878     // additional testing.
   2879     if (Subtarget->isTargetWin64())
   2880       return false;
   2881 
   2882     SmallVector<CCValAssign, 16> ArgLocs;
   2883     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   2884                    getTargetMachine(), ArgLocs, *DAG.getContext());
   2885 
   2886     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2887     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   2888       if (!ArgLocs[i].isRegLoc())
   2889         return false;
   2890   }
   2891 
   2892   // If the call result is in ST0 / ST1, it needs to be popped off the x87
   2893   // stack.  Therefore, if it's not used by the call it is not safe to optimize
   2894   // this into a sibcall.
   2895   bool Unused = false;
   2896   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   2897     if (!Ins[i].Used) {
   2898       Unused = true;
   2899       break;
   2900     }
   2901   }
   2902   if (Unused) {
   2903     SmallVector<CCValAssign, 16> RVLocs;
   2904     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
   2905                    getTargetMachine(), RVLocs, *DAG.getContext());
   2906     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2907     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2908       CCValAssign &VA = RVLocs[i];
   2909       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
   2910         return false;
   2911     }
   2912   }
   2913 
   2914   // If the calling conventions do not match, then we'd better make sure the
   2915   // results are returned in the same way as what the caller expects.
   2916   if (!CCMatch) {
   2917     SmallVector<CCValAssign, 16> RVLocs1;
   2918     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
   2919                     getTargetMachine(), RVLocs1, *DAG.getContext());
   2920     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
   2921 
   2922     SmallVector<CCValAssign, 16> RVLocs2;
   2923     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
   2924                     getTargetMachine(), RVLocs2, *DAG.getContext());
   2925     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
   2926 
   2927     if (RVLocs1.size() != RVLocs2.size())
   2928       return false;
   2929     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   2930       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   2931         return false;
   2932       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   2933         return false;
   2934       if (RVLocs1[i].isRegLoc()) {
   2935         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   2936           return false;
   2937       } else {
   2938         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   2939           return false;
   2940       }
   2941     }
   2942   }
   2943 
   2944   // If the callee takes no arguments then go on to check the results of the
   2945   // call.
   2946   if (!Outs.empty()) {
   2947     // Check if stack adjustment is needed. For now, do not do this if any
   2948     // argument is passed on the stack.
   2949     SmallVector<CCValAssign, 16> ArgLocs;
   2950     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   2951                    getTargetMachine(), ArgLocs, *DAG.getContext());
   2952 
   2953     // Allocate shadow area for Win64
   2954     if (Subtarget->isTargetWin64()) {
   2955       CCInfo.AllocateStack(32, 8);
   2956     }
   2957 
   2958     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2959     if (CCInfo.getNextStackOffset()) {
   2960       MachineFunction &MF = DAG.getMachineFunction();
   2961       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
   2962         return false;
   2963 
   2964       // Check if the arguments are already laid out in the right way as
   2965       // the caller's fixed stack objects.
   2966       MachineFrameInfo *MFI = MF.getFrameInfo();
   2967       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   2968       const X86InstrInfo *TII =
   2969         ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
   2970       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2971         CCValAssign &VA = ArgLocs[i];
   2972         SDValue Arg = OutVals[i];
   2973         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2974         if (VA.getLocInfo() == CCValAssign::Indirect)
   2975           return false;
   2976         if (!VA.isRegLoc()) {
   2977           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   2978                                    MFI, MRI, TII))
   2979             return false;
   2980         }
   2981       }
   2982     }
   2983 
   2984     // If the tailcall address may be in a register, then make sure it's
   2985     // possible to register allocate for it. In 32-bit, the call address can
   2986     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   2987     // callee-saved registers are restored. These happen to be the same
   2988     // registers used to pass 'inreg' arguments so watch out for those.
   2989     if (!Subtarget->is64Bit() &&
   2990         ((!isa<GlobalAddressSDNode>(Callee) &&
   2991           !isa<ExternalSymbolSDNode>(Callee)) ||
   2992          getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
   2993       unsigned NumInRegs = 0;
   2994       // In PIC we need an extra register to formulate the address computation
   2995       // for the callee.
   2996       unsigned MaxInRegs =
   2997           (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
   2998 
   2999       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3000         CCValAssign &VA = ArgLocs[i];
   3001         if (!VA.isRegLoc())
   3002           continue;
   3003         unsigned Reg = VA.getLocReg();
   3004         switch (Reg) {
   3005         default: break;
   3006         case X86::EAX: case X86::EDX: case X86::ECX:
   3007           if (++NumInRegs == MaxInRegs)
   3008             return false;
   3009           break;
   3010         }
   3011       }
   3012     }
   3013   }
   3014 
   3015   return true;
   3016 }
   3017 
   3018 FastISel *
   3019 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   3020                                   const TargetLibraryInfo *libInfo) const {
   3021   return X86::createFastISel(funcInfo, libInfo);
   3022 }
   3023 
   3024 //===----------------------------------------------------------------------===//
   3025 //                           Other Lowering Hooks
   3026 //===----------------------------------------------------------------------===//
   3027 
   3028 static bool MayFoldLoad(SDValue Op) {
   3029   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   3030 }
   3031 
   3032 static bool MayFoldIntoStore(SDValue Op) {
   3033   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   3034 }
   3035 
   3036 static bool isTargetShuffle(unsigned Opcode) {
   3037   switch(Opcode) {
   3038   default: return false;
   3039   case X86ISD::PSHUFD:
   3040   case X86ISD::PSHUFHW:
   3041   case X86ISD::PSHUFLW:
   3042   case X86ISD::SHUFP:
   3043   case X86ISD::PALIGNR:
   3044   case X86ISD::MOVLHPS:
   3045   case X86ISD::MOVLHPD:
   3046   case X86ISD::MOVHLPS:
   3047   case X86ISD::MOVLPS:
   3048   case X86ISD::MOVLPD:
   3049   case X86ISD::MOVSHDUP:
   3050   case X86ISD::MOVSLDUP:
   3051   case X86ISD::MOVDDUP:
   3052   case X86ISD::MOVSS:
   3053   case X86ISD::MOVSD:
   3054   case X86ISD::UNPCKL:
   3055   case X86ISD::UNPCKH:
   3056   case X86ISD::VPERMILP:
   3057   case X86ISD::VPERM2X128:
   3058   case X86ISD::VPERMI:
   3059     return true;
   3060   }
   3061 }
   3062 
   3063 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   3064                                     SDValue V1, SelectionDAG &DAG) {
   3065   switch(Opc) {
   3066   default: llvm_unreachable("Unknown x86 shuffle node");
   3067   case X86ISD::MOVSHDUP:
   3068   case X86ISD::MOVSLDUP:
   3069   case X86ISD::MOVDDUP:
   3070     return DAG.getNode(Opc, dl, VT, V1);
   3071   }
   3072 }
   3073 
   3074 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   3075                                     SDValue V1, unsigned TargetMask,
   3076                                     SelectionDAG &DAG) {
   3077   switch(Opc) {
   3078   default: llvm_unreachable("Unknown x86 shuffle node");
   3079   case X86ISD::PSHUFD:
   3080   case X86ISD::PSHUFHW:
   3081   case X86ISD::PSHUFLW:
   3082   case X86ISD::VPERMILP:
   3083   case X86ISD::VPERMI:
   3084     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   3085   }
   3086 }
   3087 
   3088 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   3089                                     SDValue V1, SDValue V2, unsigned TargetMask,
   3090                                     SelectionDAG &DAG) {
   3091   switch(Opc) {
   3092   default: llvm_unreachable("Unknown x86 shuffle node");
   3093   case X86ISD::PALIGNR:
   3094   case X86ISD::SHUFP:
   3095   case X86ISD::VPERM2X128:
   3096     return DAG.getNode(Opc, dl, VT, V1, V2,
   3097                        DAG.getConstant(TargetMask, MVT::i8));
   3098   }
   3099 }
   3100 
   3101 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   3102                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   3103   switch(Opc) {
   3104   default: llvm_unreachable("Unknown x86 shuffle node");
   3105   case X86ISD::MOVLHPS:
   3106   case X86ISD::MOVLHPD:
   3107   case X86ISD::MOVHLPS:
   3108   case X86ISD::MOVLPS:
   3109   case X86ISD::MOVLPD:
   3110   case X86ISD::MOVSS:
   3111   case X86ISD::MOVSD:
   3112   case X86ISD::UNPCKL:
   3113   case X86ISD::UNPCKH:
   3114     return DAG.getNode(Opc, dl, VT, V1, V2);
   3115   }
   3116 }
   3117 
   3118 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   3119   MachineFunction &MF = DAG.getMachineFunction();
   3120   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   3121   int ReturnAddrIndex = FuncInfo->getRAIndex();
   3122 
   3123   if (ReturnAddrIndex == 0) {
   3124     // Set up a frame object for the return address.
   3125     unsigned SlotSize = RegInfo->getSlotSize();
   3126     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
   3127                                                            false);
   3128     FuncInfo->setRAIndex(ReturnAddrIndex);
   3129   }
   3130 
   3131   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
   3132 }
   3133 
   3134 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   3135                                        bool hasSymbolicDisplacement) {
   3136   // Offset should fit into 32 bit immediate field.
   3137   if (!isInt<32>(Offset))
   3138     return false;
   3139 
   3140   // If we don't have a symbolic displacement - we don't have any extra
   3141   // restrictions.
   3142   if (!hasSymbolicDisplacement)
   3143     return true;
   3144 
   3145   // FIXME: Some tweaks might be needed for medium code model.
   3146   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3147     return false;
   3148 
   3149   // For small code model we assume that latest object is 16MB before end of 31
   3150   // bits boundary. We may also accept pretty large negative constants knowing
   3151   // that all objects are in the positive half of address space.
   3152   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3153     return true;
   3154 
   3155   // For kernel code model we know that all object resist in the negative half
   3156   // of 32bits address space. We may not accept negative offsets, since they may
   3157   // be just off and we may accept pretty large positive ones.
   3158   if (M == CodeModel::Kernel && Offset > 0)
   3159     return true;
   3160 
   3161   return false;
   3162 }
   3163 
   3164 /// isCalleePop - Determines whether the callee is required to pop its
   3165 /// own arguments. Callee pop is necessary to support tail calls.
   3166 bool X86::isCalleePop(CallingConv::ID CallingConv,
   3167                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
   3168   if (IsVarArg)
   3169     return false;
   3170 
   3171   switch (CallingConv) {
   3172   default:
   3173     return false;
   3174   case CallingConv::X86_StdCall:
   3175     return !is64Bit;
   3176   case CallingConv::X86_FastCall:
   3177     return !is64Bit;
   3178   case CallingConv::X86_ThisCall:
   3179     return !is64Bit;
   3180   case CallingConv::Fast:
   3181     return TailCallOpt;
   3182   case CallingConv::GHC:
   3183     return TailCallOpt;
   3184   case CallingConv::HiPE:
   3185     return TailCallOpt;
   3186   }
   3187 }
   3188 
   3189 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
   3190 /// specific condition code, returning the condition code and the LHS/RHS of the
   3191 /// comparison to make.
   3192 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
   3193                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
   3194   if (!isFP) {
   3195     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   3196       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
   3197         // X > -1   -> X == 0, jump !sign.
   3198         RHS = DAG.getConstant(0, RHS.getValueType());
   3199         return X86::COND_NS;
   3200       }
   3201       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
   3202         // X < 0   -> X == 0, jump on sign.
   3203         return X86::COND_S;
   3204       }
   3205       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   3206         // X < 1   -> X <= 0
   3207         RHS = DAG.getConstant(0, RHS.getValueType());
   3208         return X86::COND_LE;
   3209       }
   3210     }
   3211 
   3212     switch (SetCCOpcode) {
   3213     default: llvm_unreachable("Invalid integer condition!");
   3214     case ISD::SETEQ:  return X86::COND_E;
   3215     case ISD::SETGT:  return X86::COND_G;
   3216     case ISD::SETGE:  return X86::COND_GE;
   3217     case ISD::SETLT:  return X86::COND_L;
   3218     case ISD::SETLE:  return X86::COND_LE;
   3219     case ISD::SETNE:  return X86::COND_NE;
   3220     case ISD::SETULT: return X86::COND_B;
   3221     case ISD::SETUGT: return X86::COND_A;
   3222     case ISD::SETULE: return X86::COND_BE;
   3223     case ISD::SETUGE: return X86::COND_AE;
   3224     }
   3225   }
   3226 
   3227   // First determine if it is required or is profitable to flip the operands.
   3228 
   3229   // If LHS is a foldable load, but RHS is not, flip the condition.
   3230   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   3231       !ISD::isNON_EXTLoad(RHS.getNode())) {
   3232     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   3233     std::swap(LHS, RHS);
   3234   }
   3235 
   3236   switch (SetCCOpcode) {
   3237   default: break;
   3238   case ISD::SETOLT:
   3239   case ISD::SETOLE:
   3240   case ISD::SETUGT:
   3241   case ISD::SETUGE:
   3242     std::swap(LHS, RHS);
   3243     break;
   3244   }
   3245 
   3246   // On a floating point condition, the flags are set as follows:
   3247   // ZF  PF  CF   op
   3248   //  0 | 0 | 0 | X > Y
   3249   //  0 | 0 | 1 | X < Y
   3250   //  1 | 0 | 0 | X == Y
   3251   //  1 | 1 | 1 | unordered
   3252   switch (SetCCOpcode) {
   3253   default: llvm_unreachable("Condcode should be pre-legalized away");
   3254   case ISD::SETUEQ:
   3255   case ISD::SETEQ:   return X86::COND_E;
   3256   case ISD::SETOLT:              // flipped
   3257   case ISD::SETOGT:
   3258   case ISD::SETGT:   return X86::COND_A;
   3259   case ISD::SETOLE:              // flipped
   3260   case ISD::SETOGE:
   3261   case ISD::SETGE:   return X86::COND_AE;
   3262   case ISD::SETUGT:              // flipped
   3263   case ISD::SETULT:
   3264   case ISD::SETLT:   return X86::COND_B;
   3265   case ISD::SETUGE:              // flipped
   3266   case ISD::SETULE:
   3267   case ISD::SETLE:   return X86::COND_BE;
   3268   case ISD::SETONE:
   3269   case ISD::SETNE:   return X86::COND_NE;
   3270   case ISD::SETUO:   return X86::COND_P;
   3271   case ISD::SETO:    return X86::COND_NP;
   3272   case ISD::SETOEQ:
   3273   case ISD::SETUNE:  return X86::COND_INVALID;
   3274   }
   3275 }
   3276 
   3277 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
   3278 /// code. Current x86 isa includes the following FP cmov instructions:
   3279 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   3280 static bool hasFPCMov(unsigned X86CC) {
   3281   switch (X86CC) {
   3282   default:
   3283     return false;
   3284   case X86::COND_B:
   3285   case X86::COND_BE:
   3286   case X86::COND_E:
   3287   case X86::COND_P:
   3288   case X86::COND_A:
   3289   case X86::COND_AE:
   3290   case X86::COND_NE:
   3291   case X86::COND_NP:
   3292     return true;
   3293   }
   3294 }
   3295 
   3296 /// isFPImmLegal - Returns true if the target can instruction select the
   3297 /// specified FP immediate natively. If false, the legalizer will
   3298 /// materialize the FP immediate as a load from a constant pool.
   3299 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   3300   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   3301     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   3302       return true;
   3303   }
   3304   return false;
   3305 }
   3306 
   3307 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
   3308 /// the specified range (L, H].
   3309 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   3310   return (Val < 0) || (Val >= Low && Val < Hi);
   3311 }
   3312 
   3313 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
   3314 /// specified value.
   3315 static bool isUndefOrEqual(int Val, int CmpVal) {
   3316   return (Val < 0 || Val == CmpVal);
   3317 }
   3318 
   3319 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
   3320 /// from position Pos and ending in Pos+Size, falls within the specified
   3321 /// sequential range (L, L+Pos]. or is undef.
   3322 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   3323                                        unsigned Pos, unsigned Size, int Low) {
   3324   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   3325     if (!isUndefOrEqual(Mask[i], Low))
   3326       return false;
   3327   return true;
   3328 }
   3329 
   3330 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
   3331 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
   3332 /// the second operand.
   3333 static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
   3334   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
   3335     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   3336   if (VT == MVT::v2f64 || VT == MVT::v2i64)
   3337     return (Mask[0] < 2 && Mask[1] < 2);
   3338   return false;
   3339 }
   3340 
   3341 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
   3342 /// is suitable for input to PSHUFHW.
   3343 static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
   3344   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
   3345     return false;
   3346 
   3347   // Lower quadword copied in order or undef.
   3348   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
   3349     return false;
   3350 
   3351   // Upper quadword shuffled.
   3352   for (unsigned i = 4; i != 8; ++i)
   3353     if (!isUndefOrInRange(Mask[i], 4, 8))
   3354       return false;
   3355 
   3356   if (VT == MVT::v16i16) {
   3357     // Lower quadword copied in order or undef.
   3358     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
   3359       return false;
   3360 
   3361     // Upper quadword shuffled.
   3362     for (unsigned i = 12; i != 16; ++i)
   3363       if (!isUndefOrInRange(Mask[i], 12, 16))
   3364         return false;
   3365   }
   3366 
   3367   return true;
   3368 }
   3369 
   3370 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
   3371 /// is suitable for input to PSHUFLW.
   3372 static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
   3373   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
   3374     return false;
   3375 
   3376   // Upper quadword copied in order.
   3377   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
   3378     return false;
   3379 
   3380   // Lower quadword shuffled.
   3381   for (unsigned i = 0; i != 4; ++i)
   3382     if (!isUndefOrInRange(Mask[i], 0, 4))
   3383       return false;
   3384 
   3385   if (VT == MVT::v16i16) {
   3386     // Upper quadword copied in order.
   3387     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
   3388       return false;
   3389 
   3390     // Lower quadword shuffled.
   3391     for (unsigned i = 8; i != 12; ++i)
   3392       if (!isUndefOrInRange(Mask[i], 8, 12))
   3393         return false;
   3394   }
   3395 
   3396   return true;
   3397 }
   3398 
   3399 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
   3400 /// is suitable for input to PALIGNR.
   3401 static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
   3402                           const X86Subtarget *Subtarget) {
   3403   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
   3404       (VT.is256BitVector() && !Subtarget->hasInt256()))
   3405     return false;
   3406 
   3407   unsigned NumElts = VT.getVectorNumElements();
   3408   unsigned NumLanes = VT.getSizeInBits()/128;
   3409   unsigned NumLaneElts = NumElts/NumLanes;
   3410 
   3411   // Do not handle 64-bit element shuffles with palignr.
   3412   if (NumLaneElts == 2)
   3413     return false;
   3414 
   3415   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
   3416     unsigned i;
   3417     for (i = 0; i != NumLaneElts; ++i) {
   3418       if (Mask[i+l] >= 0)
   3419         break;
   3420     }
   3421 
   3422     // Lane is all undef, go to next lane
   3423     if (i == NumLaneElts)
   3424       continue;
   3425 
   3426     int Start = Mask[i+l];
   3427 
   3428     // Make sure its in this lane in one of the sources
   3429     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
   3430         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
   3431       return false;
   3432 
   3433     // If not lane 0, then we must match lane 0
   3434     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
   3435       return false;
   3436 
   3437     // Correct second source to be contiguous with first source
   3438     if (Start >= (int)NumElts)
   3439       Start -= NumElts - NumLaneElts;
   3440 
   3441     // Make sure we're shifting in the right direction.
   3442     if (Start <= (int)(i+l))
   3443       return false;
   3444 
   3445     Start -= i;
   3446 
   3447     // Check the rest of the elements to see if they are consecutive.
   3448     for (++i; i != NumLaneElts; ++i) {
   3449       int Idx = Mask[i+l];
   3450 
   3451       // Make sure its in this lane
   3452       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
   3453           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
   3454         return false;
   3455 
   3456       // If not lane 0, then we must match lane 0
   3457       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
   3458         return false;
   3459 
   3460       if (Idx >= (int)NumElts)
   3461         Idx -= NumElts - NumLaneElts;
   3462 
   3463       if (!isUndefOrEqual(Idx, Start+i))
   3464         return false;
   3465 
   3466     }
   3467   }
   3468 
   3469   return true;
   3470 }
   3471 
   3472 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
   3473 /// the two vector operands have swapped position.
   3474 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
   3475                                      unsigned NumElems) {
   3476   for (unsigned i = 0; i != NumElems; ++i) {
   3477     int idx = Mask[i];
   3478     if (idx < 0)
   3479       continue;
   3480     else if (idx < (int)NumElems)
   3481       Mask[i] = idx + NumElems;
   3482     else
   3483       Mask[i] = idx - NumElems;
   3484   }
   3485 }
   3486 
   3487 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
   3488 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
   3489 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
   3490 /// reverse of what x86 shuffles want.
   3491 static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
   3492                         bool Commuted = false) {
   3493   if (!HasFp256 && VT.is256BitVector())
   3494     return false;
   3495 
   3496   unsigned NumElems = VT.getVectorNumElements();
   3497   unsigned NumLanes = VT.getSizeInBits()/128;
   3498   unsigned NumLaneElems = NumElems/NumLanes;
   3499 
   3500   if (NumLaneElems != 2 && NumLaneElems != 4)
   3501     return false;
   3502 
   3503   // VSHUFPSY divides the resulting vector into 4 chunks.
   3504   // The sources are also splitted into 4 chunks, and each destination
   3505   // chunk must come from a different source chunk.
   3506   //
   3507   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
   3508   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
   3509   //
   3510   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
   3511   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
   3512   //
   3513   // VSHUFPDY divides the resulting vector into 4 chunks.
   3514   // The sources are also splitted into 4 chunks, and each destination
   3515   // chunk must come from a different source chunk.
   3516   //
   3517   //  SRC1 =>      X3       X2       X1       X0
   3518   //  SRC2 =>      Y3       Y2       Y1       Y0
   3519   //
   3520   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   3521   //
   3522   unsigned HalfLaneElems = NumLaneElems/2;
   3523   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
   3524     for (unsigned i = 0; i != NumLaneElems; ++i) {
   3525       int Idx = Mask[i+l];
   3526       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
   3527       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
   3528         return false;
   3529       // For VSHUFPSY, the mask of the second half must be the same as the
   3530       // first but with the appropriate offsets. This works in the same way as
   3531       // VPERMILPS works with masks.
   3532       if (NumElems != 8 || l == 0 || Mask[i] < 0)
   3533         continue;
   3534       if (!isUndefOrEqual(Idx, Mask[i]+l))
   3535         return false;
   3536     }
   3537   }
   3538 
   3539   return true;
   3540 }
   3541 
   3542 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3543 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
   3544 static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
   3545   if (!VT.is128BitVector())
   3546     return false;
   3547 
   3548   unsigned NumElems = VT.getVectorNumElements();
   3549 
   3550   if (NumElems != 4)
   3551     return false;
   3552 
   3553   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
   3554   return isUndefOrEqual(Mask[0], 6) &&
   3555          isUndefOrEqual(Mask[1], 7) &&
   3556          isUndefOrEqual(Mask[2], 2) &&
   3557          isUndefOrEqual(Mask[3], 3);
   3558 }
   3559 
   3560 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
   3561 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
   3562 /// <2, 3, 2, 3>
   3563 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
   3564   if (!VT.is128BitVector())
   3565     return false;
   3566 
   3567   unsigned NumElems = VT.getVectorNumElements();
   3568 
   3569   if (NumElems != 4)
   3570     return false;
   3571 
   3572   return isUndefOrEqual(Mask[0], 2) &&
   3573          isUndefOrEqual(Mask[1], 3) &&
   3574          isUndefOrEqual(Mask[2], 2) &&
   3575          isUndefOrEqual(Mask[3], 3);
   3576 }
   3577 
   3578 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
   3579 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
   3580 static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
   3581   if (!VT.is128BitVector())
   3582     return false;
   3583 
   3584   unsigned NumElems = VT.getVectorNumElements();
   3585 
   3586   if (NumElems != 2 && NumElems != 4)
   3587     return false;
   3588 
   3589   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3590     if (!isUndefOrEqual(Mask[i], i + NumElems))
   3591       return false;
   3592 
   3593   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
   3594     if (!isUndefOrEqual(Mask[i], i))
   3595       return false;
   3596 
   3597   return true;
   3598 }
   3599 
   3600 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3601 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
   3602 static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
   3603   if (!VT.is128BitVector())
   3604     return false;
   3605 
   3606   unsigned NumElems = VT.getVectorNumElements();
   3607 
   3608   if (NumElems != 2 && NumElems != 4)
   3609     return false;
   3610 
   3611   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3612     if (!isUndefOrEqual(Mask[i], i))
   3613       return false;
   3614 
   3615   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3616     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
   3617       return false;
   3618 
   3619   return true;
   3620 }
   3621 
   3622 //
   3623 // Some special combinations that can be optimized.
   3624 //
   3625 static
   3626 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
   3627                                SelectionDAG &DAG) {
   3628   MVT VT = SVOp->getValueType(0).getSimpleVT();
   3629   DebugLoc dl = SVOp->getDebugLoc();
   3630 
   3631   if (VT != MVT::v8i32 && VT != MVT::v8f32)
   3632     return SDValue();
   3633 
   3634   ArrayRef<int> Mask = SVOp->getMask();
   3635 
   3636   // These are the special masks that may be optimized.
   3637   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
   3638   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
   3639   bool MatchEvenMask = true;
   3640   bool MatchOddMask  = true;
   3641   for (int i=0; i<8; ++i) {
   3642     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
   3643       MatchEvenMask = false;
   3644     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
   3645       MatchOddMask = false;
   3646   }
   3647 
   3648   if (!MatchEvenMask && !MatchOddMask)
   3649     return SDValue();
   3650 
   3651   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
   3652 
   3653   SDValue Op0 = SVOp->getOperand(0);
   3654   SDValue Op1 = SVOp->getOperand(1);
   3655 
   3656   if (MatchEvenMask) {
   3657     // Shift the second operand right to 32 bits.
   3658     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
   3659     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
   3660   } else {
   3661     // Shift the first operand left to 32 bits.
   3662     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
   3663     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
   3664   }
   3665   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
   3666   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
   3667 }
   3668 
   3669 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
   3670 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
   3671 static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
   3672                          bool HasInt256, bool V2IsSplat = false) {
   3673   unsigned NumElts = VT.getVectorNumElements();
   3674 
   3675   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3676          "Unsupported vector type for unpckh");
   3677 
   3678   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
   3679       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   3680     return false;
   3681 
   3682   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3683   // independently on 128-bit lanes.
   3684   unsigned NumLanes = VT.getSizeInBits()/128;
   3685   unsigned NumLaneElts = NumElts/NumLanes;
   3686 
   3687   for (unsigned l = 0; l != NumLanes; ++l) {
   3688     for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
   3689          i != (l+1)*NumLaneElts;
   3690          i += 2, ++j) {
   3691       int BitI  = Mask[i];
   3692       int BitI1 = Mask[i+1];
   3693       if (!isUndefOrEqual(BitI, j))
   3694         return false;
   3695       if (V2IsSplat) {
   3696         if (!isUndefOrEqual(BitI1, NumElts))
   3697           return false;
   3698       } else {
   3699         if (!isUndefOrEqual(BitI1, j + NumElts))
   3700           return false;
   3701       }
   3702     }
   3703   }
   3704 
   3705   return true;
   3706 }
   3707 
   3708 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
   3709 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
   3710 static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
   3711                          bool HasInt256, bool V2IsSplat = false) {
   3712   unsigned NumElts = VT.getVectorNumElements();
   3713 
   3714   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3715          "Unsupported vector type for unpckh");
   3716 
   3717   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
   3718       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   3719     return false;
   3720 
   3721   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3722   // independently on 128-bit lanes.
   3723   unsigned NumLanes = VT.getSizeInBits()/128;
   3724   unsigned NumLaneElts = NumElts/NumLanes;
   3725 
   3726   for (unsigned l = 0; l != NumLanes; ++l) {
   3727     for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
   3728          i != (l+1)*NumLaneElts; i += 2, ++j) {
   3729       int BitI  = Mask[i];
   3730       int BitI1 = Mask[i+1];
   3731       if (!isUndefOrEqual(BitI, j))
   3732         return false;
   3733       if (V2IsSplat) {
   3734         if (isUndefOrEqual(BitI1, NumElts))
   3735           return false;
   3736       } else {
   3737         if (!isUndefOrEqual(BitI1, j+NumElts))
   3738           return false;
   3739       }
   3740     }
   3741   }
   3742   return true;
   3743 }
   3744 
   3745 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
   3746 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
   3747 /// <0, 0, 1, 1>
   3748 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
   3749   unsigned NumElts = VT.getVectorNumElements();
   3750   bool Is256BitVec = VT.is256BitVector();
   3751 
   3752   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3753          "Unsupported vector type for unpckh");
   3754 
   3755   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
   3756       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   3757     return false;
   3758 
   3759   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
   3760   // FIXME: Need a better way to get rid of this, there's no latency difference
   3761   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
   3762   // the former later. We should also remove the "_undef" special mask.
   3763   if (NumElts == 4 && Is256BitVec)
   3764     return false;
   3765 
   3766   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3767   // independently on 128-bit lanes.
   3768   unsigned NumLanes = VT.getSizeInBits()/128;
   3769   unsigned NumLaneElts = NumElts/NumLanes;
   3770 
   3771   for (unsigned l = 0; l != NumLanes; ++l) {
   3772     for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
   3773          i != (l+1)*NumLaneElts;
   3774          i += 2, ++j) {
   3775       int BitI  = Mask[i];
   3776       int BitI1 = Mask[i+1];
   3777 
   3778       if (!isUndefOrEqual(BitI, j))
   3779         return false;
   3780       if (!isUndefOrEqual(BitI1, j))
   3781         return false;
   3782     }
   3783   }
   3784 
   3785   return true;
   3786 }
   3787 
   3788 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
   3789 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
   3790 /// <2, 2, 3, 3>
   3791 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
   3792   unsigned NumElts = VT.getVectorNumElements();
   3793 
   3794   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   3795          "Unsupported vector type for unpckh");
   3796 
   3797   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
   3798       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   3799     return false;
   3800 
   3801   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   3802   // independently on 128-bit lanes.
   3803   unsigned NumLanes = VT.getSizeInBits()/128;
   3804   unsigned NumLaneElts = NumElts/NumLanes;
   3805 
   3806   for (unsigned l = 0; l != NumLanes; ++l) {
   3807     for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
   3808          i != (l+1)*NumLaneElts; i += 2, ++j) {
   3809       int BitI  = Mask[i];
   3810       int BitI1 = Mask[i+1];
   3811       if (!isUndefOrEqual(BitI, j))
   3812         return false;
   3813       if (!isUndefOrEqual(BitI1, j))
   3814         return false;
   3815     }
   3816   }
   3817   return true;
   3818 }
   3819 
   3820 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
   3821 /// specifies a shuffle of elements that is suitable for input to MOVSS,
   3822 /// MOVSD, and MOVD, i.e. setting the lowest element.
   3823 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
   3824   if (VT.getVectorElementType().getSizeInBits() < 32)
   3825     return false;
   3826   if (!VT.is128BitVector())
   3827     return false;
   3828 
   3829   unsigned NumElts = VT.getVectorNumElements();
   3830 
   3831   if (!isUndefOrEqual(Mask[0], NumElts))
   3832     return false;
   3833 
   3834   for (unsigned i = 1; i != NumElts; ++i)
   3835     if (!isUndefOrEqual(Mask[i], i))
   3836       return false;
   3837 
   3838   return true;
   3839 }
   3840 
   3841 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
   3842 /// as permutations between 128-bit chunks or halves. As an example: this
   3843 /// shuffle bellow:
   3844 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
   3845 /// The first half comes from the second half of V1 and the second half from the
   3846 /// the second half of V2.
   3847 static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
   3848   if (!HasFp256 || !VT.is256BitVector())
   3849     return false;
   3850 
   3851   // The shuffle result is divided into half A and half B. In total the two
   3852   // sources have 4 halves, namely: C, D, E, F. The final values of A and
   3853   // B must come from C, D, E or F.
   3854   unsigned HalfSize = VT.getVectorNumElements()/2;
   3855   bool MatchA = false, MatchB = false;
   3856 
   3857   // Check if A comes from one of C, D, E, F.
   3858   for (unsigned Half = 0; Half != 4; ++Half) {
   3859     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
   3860       MatchA = true;
   3861       break;
   3862     }
   3863   }
   3864 
   3865   // Check if B comes from one of C, D, E, F.
   3866   for (unsigned Half = 0; Half != 4; ++Half) {
   3867     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
   3868       MatchB = true;
   3869       break;
   3870     }
   3871   }
   3872 
   3873   return MatchA && MatchB;
   3874 }
   3875 
   3876 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
   3877 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
   3878 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   3879   MVT VT = SVOp->getValueType(0).getSimpleVT();
   3880 
   3881   unsigned HalfSize = VT.getVectorNumElements()/2;
   3882 
   3883   unsigned FstHalf = 0, SndHalf = 0;
   3884   for (unsigned i = 0; i < HalfSize; ++i) {
   3885     if (SVOp->getMaskElt(i) > 0) {
   3886       FstHalf = SVOp->getMaskElt(i)/HalfSize;
   3887       break;
   3888     }
   3889   }
   3890   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
   3891     if (SVOp->getMaskElt(i) > 0) {
   3892       SndHalf = SVOp->getMaskElt(i)/HalfSize;
   3893       break;
   3894     }
   3895   }
   3896 
   3897   return (FstHalf | (SndHalf << 4));
   3898 }
   3899 
   3900 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
   3901 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
   3902 /// Note that VPERMIL mask matching is different depending whether theunderlying
   3903 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
   3904 /// to the same elements of the low, but to the higher half of the source.
   3905 /// In VPERMILPD the two lanes could be shuffled independently of each other
   3906 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
   3907 static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
   3908   if (!HasFp256)
   3909     return false;
   3910 
   3911   unsigned NumElts = VT.getVectorNumElements();
   3912   // Only match 256-bit with 32/64-bit types
   3913   if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
   3914     return false;
   3915 
   3916   unsigned NumLanes = VT.getSizeInBits()/128;
   3917   unsigned LaneSize = NumElts/NumLanes;
   3918   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   3919     for (unsigned i = 0; i != LaneSize; ++i) {
   3920       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   3921         return false;
   3922       if (NumElts != 8 || l == 0)
   3923         continue;
   3924       // VPERMILPS handling
   3925       if (Mask[i] < 0)
   3926         continue;
   3927       if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
   3928         return false;
   3929     }
   3930   }
   3931 
   3932   return true;
   3933 }
   3934 
   3935 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
   3936 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
   3937 /// element of vector 2 and the other elements to come from vector 1 in order.
   3938 static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
   3939                                bool V2IsSplat = false, bool V2IsUndef = false) {
   3940   if (!VT.is128BitVector())
   3941     return false;
   3942 
   3943   unsigned NumOps = VT.getVectorNumElements();
   3944   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
   3945     return false;
   3946 
   3947   if (!isUndefOrEqual(Mask[0], 0))
   3948     return false;
   3949 
   3950   for (unsigned i = 1; i != NumOps; ++i)
   3951     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
   3952           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
   3953           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
   3954       return false;
   3955 
   3956   return true;
   3957 }
   3958 
   3959 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   3960 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
   3961 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
   3962 static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
   3963                            const X86Subtarget *Subtarget) {
   3964   if (!Subtarget->hasSSE3())
   3965     return false;
   3966 
   3967   unsigned NumElems = VT.getVectorNumElements();
   3968 
   3969   if ((VT.is128BitVector() && NumElems != 4) ||
   3970       (VT.is256BitVector() && NumElems != 8))
   3971     return false;
   3972 
   3973   // "i+1" is the value the indexed mask element must have
   3974   for (unsigned i = 0; i != NumElems; i += 2)
   3975     if (!isUndefOrEqual(Mask[i], i+1) ||
   3976         !isUndefOrEqual(Mask[i+1], i+1))
   3977       return false;
   3978 
   3979   return true;
   3980 }
   3981 
   3982 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   3983 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
   3984 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
   3985 static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
   3986                            const X86Subtarget *Subtarget) {
   3987   if (!Subtarget->hasSSE3())
   3988     return false;
   3989 
   3990   unsigned NumElems = VT.getVectorNumElements();
   3991 
   3992   if ((VT.is128BitVector() && NumElems != 4) ||
   3993       (VT.is256BitVector() && NumElems != 8))
   3994     return false;
   3995 
   3996   // "i" is the value the indexed mask element must have
   3997   for (unsigned i = 0; i != NumElems; i += 2)
   3998     if (!isUndefOrEqual(Mask[i], i) ||
   3999         !isUndefOrEqual(Mask[i+1], i))
   4000       return false;
   4001 
   4002   return true;
   4003 }
   4004 
   4005 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
   4006 /// specifies a shuffle of elements that is suitable for input to 256-bit
   4007 /// version of MOVDDUP.
   4008 static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
   4009   if (!HasFp256 || !VT.is256BitVector())
   4010     return false;
   4011 
   4012   unsigned NumElts = VT.getVectorNumElements();
   4013   if (NumElts != 4)
   4014     return false;
   4015 
   4016   for (unsigned i = 0; i != NumElts/2; ++i)
   4017     if (!isUndefOrEqual(Mask[i], 0))
   4018       return false;
   4019   for (unsigned i = NumElts/2; i != NumElts; ++i)
   4020     if (!isUndefOrEqual(Mask[i], NumElts/2))
   4021       return false;
   4022   return true;
   4023 }
   4024 
   4025 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4026 /// specifies a shuffle of elements that is suitable for input to 128-bit
   4027 /// version of MOVDDUP.
   4028 static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
   4029   if (!VT.is128BitVector())
   4030     return false;
   4031 
   4032   unsigned e = VT.getVectorNumElements() / 2;
   4033   for (unsigned i = 0; i != e; ++i)
   4034     if (!isUndefOrEqual(Mask[i], i))
   4035       return false;
   4036   for (unsigned i = 0; i != e; ++i)
   4037     if (!isUndefOrEqual(Mask[e+i], i))
   4038       return false;
   4039   return true;
   4040 }
   4041 
   4042 /// isVEXTRACTF128Index - Return true if the specified
   4043 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
   4044 /// suitable for input to VEXTRACTF128.
   4045 bool X86::isVEXTRACTF128Index(SDNode *N) {
   4046   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4047     return false;
   4048 
   4049   // The index should be aligned on a 128-bit boundary.
   4050   uint64_t Index =
   4051     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4052 
   4053   MVT VT = N->getValueType(0).getSimpleVT();
   4054   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4055   bool Result = (Index * ElSize) % 128 == 0;
   4056 
   4057   return Result;
   4058 }
   4059 
   4060 /// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
   4061 /// operand specifies a subvector insert that is suitable for input to
   4062 /// VINSERTF128.
   4063 bool X86::isVINSERTF128Index(SDNode *N) {
   4064   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4065     return false;
   4066 
   4067   // The index should be aligned on a 128-bit boundary.
   4068   uint64_t Index =
   4069     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4070 
   4071   MVT VT = N->getValueType(0).getSimpleVT();
   4072   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4073   bool Result = (Index * ElSize) % 128 == 0;
   4074 
   4075   return Result;
   4076 }
   4077 
   4078 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
   4079 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
   4080 /// Handles 128-bit and 256-bit.
   4081 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   4082   MVT VT = N->getValueType(0).getSimpleVT();
   4083 
   4084   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   4085          "Unsupported vector type for PSHUF/SHUFP");
   4086 
   4087   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
   4088   // independently on 128-bit lanes.
   4089   unsigned NumElts = VT.getVectorNumElements();
   4090   unsigned NumLanes = VT.getSizeInBits()/128;
   4091   unsigned NumLaneElts = NumElts/NumLanes;
   4092 
   4093   assert((NumLaneElts == 2 || NumLaneElts == 4) &&
   4094          "Only supports 2 or 4 elements per lane");
   4095 
   4096   unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
   4097   unsigned Mask = 0;
   4098   for (unsigned i = 0; i != NumElts; ++i) {
   4099     int Elt = N->getMaskElt(i);
   4100     if (Elt < 0) continue;
   4101     Elt &= NumLaneElts - 1;
   4102     unsigned ShAmt = (i << Shift) % 8;
   4103     Mask |= Elt << ShAmt;
   4104   }
   4105 
   4106   return Mask;
   4107 }
   4108 
   4109 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
   4110 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
   4111 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
   4112   MVT VT = N->getValueType(0).getSimpleVT();
   4113 
   4114   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
   4115          "Unsupported vector type for PSHUFHW");
   4116 
   4117   unsigned NumElts = VT.getVectorNumElements();
   4118 
   4119   unsigned Mask = 0;
   4120   for (unsigned l = 0; l != NumElts; l += 8) {
   4121     // 8 nodes per lane, but we only care about the last 4.
   4122     for (unsigned i = 0; i < 4; ++i) {
   4123       int Elt = N->getMaskElt(l+i+4);
   4124       if (Elt < 0) continue;
   4125       Elt &= 0x3; // only 2-bits.
   4126       Mask |= Elt << (i * 2);
   4127     }
   4128   }
   4129 
   4130   return Mask;
   4131 }
   4132 
   4133 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
   4134 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
   4135 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
   4136   MVT VT = N->getValueType(0).getSimpleVT();
   4137 
   4138   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
   4139          "Unsupported vector type for PSHUFHW");
   4140 
   4141   unsigned NumElts = VT.getVectorNumElements();
   4142 
   4143   unsigned Mask = 0;
   4144   for (unsigned l = 0; l != NumElts; l += 8) {
   4145     // 8 nodes per lane, but we only care about the first 4.
   4146     for (unsigned i = 0; i < 4; ++i) {
   4147       int Elt = N->getMaskElt(l+i);
   4148       if (Elt < 0) continue;
   4149       Elt &= 0x3; // only 2-bits
   4150       Mask |= Elt << (i * 2);
   4151     }
   4152   }
   4153 
   4154   return Mask;
   4155 }
   4156 
   4157 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
   4158 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
   4159 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   4160   MVT VT = SVOp->getValueType(0).getSimpleVT();
   4161   unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
   4162 
   4163   unsigned NumElts = VT.getVectorNumElements();
   4164   unsigned NumLanes = VT.getSizeInBits()/128;
   4165   unsigned NumLaneElts = NumElts/NumLanes;
   4166 
   4167   int Val = 0;
   4168   unsigned i;
   4169   for (i = 0; i != NumElts; ++i) {
   4170     Val = SVOp->getMaskElt(i);
   4171     if (Val >= 0)
   4172       break;
   4173   }
   4174   if (Val >= (int)NumElts)
   4175     Val -= NumElts - NumLaneElts;
   4176 
   4177   assert(Val - i > 0 && "PALIGNR imm should be positive");
   4178   return (Val - i) * EltSize;
   4179 }
   4180 
   4181 /// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
   4182 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
   4183 /// instructions.
   4184 unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
   4185   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4186     llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
   4187 
   4188   uint64_t Index =
   4189     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4190 
   4191   MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
   4192   MVT ElVT = VecVT.getVectorElementType();
   4193 
   4194   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
   4195   return Index / NumElemsPerChunk;
   4196 }
   4197 
   4198 /// getInsertVINSERTF128Immediate - Return the appropriate immediate
   4199 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
   4200 /// instructions.
   4201 unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
   4202   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4203     llvm_unreachable("Illegal insert subvector for VINSERTF128");
   4204 
   4205   uint64_t Index =
   4206     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4207 
   4208   MVT VecVT = N->getValueType(0).getSimpleVT();
   4209   MVT ElVT = VecVT.getVectorElementType();
   4210 
   4211   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
   4212   return Index / NumElemsPerChunk;
   4213 }
   4214 
   4215 /// getShuffleCLImmediate - Return the appropriate immediate to shuffle
   4216 /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
   4217 /// Handles 256-bit.
   4218 static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
   4219   MVT VT = N->getValueType(0).getSimpleVT();
   4220 
   4221   unsigned NumElts = VT.getVectorNumElements();
   4222 
   4223   assert((VT.is256BitVector() && NumElts == 4) &&
   4224          "Unsupported vector type for VPERMQ/VPERMPD");
   4225 
   4226   unsigned Mask = 0;
   4227   for (unsigned i = 0; i != NumElts; ++i) {
   4228     int Elt = N->getMaskElt(i);
   4229     if (Elt < 0)
   4230       continue;
   4231     Mask |= Elt << (i*2);
   4232   }
   4233 
   4234   return Mask;
   4235 }
   4236 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
   4237 /// constant +0.0.
   4238 bool X86::isZeroNode(SDValue Elt) {
   4239   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
   4240     return CN->isNullValue();
   4241   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
   4242     return CFP->getValueAPF().isPosZero();
   4243   return false;
   4244 }
   4245 
   4246 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
   4247 /// their permute mask.
   4248 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
   4249                                     SelectionDAG &DAG) {
   4250   MVT VT = SVOp->getValueType(0).getSimpleVT();
   4251   unsigned NumElems = VT.getVectorNumElements();
   4252   SmallVector<int, 8> MaskVec;
   4253 
   4254   for (unsigned i = 0; i != NumElems; ++i) {
   4255     int Idx = SVOp->getMaskElt(i);
   4256     if (Idx >= 0) {
   4257       if (Idx < (int)NumElems)
   4258         Idx += NumElems;
   4259       else
   4260         Idx -= NumElems;
   4261     }
   4262     MaskVec.push_back(Idx);
   4263   }
   4264   return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
   4265                               SVOp->getOperand(0), &MaskVec[0]);
   4266 }
   4267 
   4268 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
   4269 /// match movhlps. The lower half elements should come from upper half of
   4270 /// V1 (and in order), and the upper half elements should come from the upper
   4271 /// half of V2 (and in order).
   4272 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
   4273   if (!VT.is128BitVector())
   4274     return false;
   4275   if (VT.getVectorNumElements() != 4)
   4276     return false;
   4277   for (unsigned i = 0, e = 2; i != e; ++i)
   4278     if (!isUndefOrEqual(Mask[i], i+2))
   4279       return false;
   4280   for (unsigned i = 2; i != 4; ++i)
   4281     if (!isUndefOrEqual(Mask[i], i+4))
   4282       return false;
   4283   return true;
   4284 }
   4285 
   4286 /// isScalarLoadToVector - Returns true if the node is a scalar load that
   4287 /// is promoted to a vector. It also returns the LoadSDNode by reference if
   4288 /// required.
   4289 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
   4290   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
   4291     return false;
   4292   N = N->getOperand(0).getNode();
   4293   if (!ISD::isNON_EXTLoad(N))
   4294     return false;
   4295   if (LD)
   4296     *LD = cast<LoadSDNode>(N);
   4297   return true;
   4298 }
   4299 
   4300 // Test whether the given value is a vector value which will be legalized
   4301 // into a load.
   4302 static bool WillBeConstantPoolLoad(SDNode *N) {
   4303   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4304     return false;
   4305 
   4306   // Check for any non-constant elements.
   4307   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
   4308     switch (N->getOperand(i).getNode()->getOpcode()) {
   4309     case ISD::UNDEF:
   4310     case ISD::ConstantFP:
   4311     case ISD::Constant:
   4312       break;
   4313     default:
   4314       return false;
   4315     }
   4316 
   4317   // Vectors of all-zeros and all-ones are materialized with special
   4318   // instructions rather than being loaded.
   4319   return !ISD::isBuildVectorAllZeros(N) &&
   4320          !ISD::isBuildVectorAllOnes(N);
   4321 }
   4322 
   4323 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
   4324 /// match movlp{s|d}. The lower half elements should come from lower half of
   4325 /// V1 (and in order), and the upper half elements should come from the upper
   4326 /// half of V2 (and in order). And since V1 will become the source of the
   4327 /// MOVLP, it must be either a vector load or a scalar load to vector.
   4328 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
   4329                                ArrayRef<int> Mask, EVT VT) {
   4330   if (!VT.is128BitVector())
   4331     return false;
   4332 
   4333   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
   4334     return false;
   4335   // Is V2 is a vector load, don't do this transformation. We will try to use
   4336   // load folding shufps op.
   4337   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
   4338     return false;
   4339 
   4340   unsigned NumElems = VT.getVectorNumElements();
   4341 
   4342   if (NumElems != 2 && NumElems != 4)
   4343     return false;
   4344   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   4345     if (!isUndefOrEqual(Mask[i], i))
   4346       return false;
   4347   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
   4348     if (!isUndefOrEqual(Mask[i], i+NumElems))
   4349       return false;
   4350   return true;
   4351 }
   4352 
   4353 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
   4354 /// all the same.
   4355 static bool isSplatVector(SDNode *N) {
   4356   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4357     return false;
   4358 
   4359   SDValue SplatValue = N->getOperand(0);
   4360   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
   4361     if (N->getOperand(i) != SplatValue)
   4362       return false;
   4363   return true;
   4364 }
   4365 
   4366 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
   4367 /// to an zero vector.
   4368 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
   4369 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
   4370   SDValue V1 = N->getOperand(0);
   4371   SDValue V2 = N->getOperand(1);
   4372   unsigned NumElems = N->getValueType(0).getVectorNumElements();
   4373   for (unsigned i = 0; i != NumElems; ++i) {
   4374     int Idx = N->getMaskElt(i);
   4375     if (Idx >= (int)NumElems) {
   4376       unsigned Opc = V2.getOpcode();
   4377       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
   4378         continue;
   4379       if (Opc != ISD::BUILD_VECTOR ||
   4380           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
   4381         return false;
   4382     } else if (Idx >= 0) {
   4383       unsigned Opc = V1.getOpcode();
   4384       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
   4385         continue;
   4386       if (Opc != ISD::BUILD_VECTOR ||
   4387           !X86::isZeroNode(V1.getOperand(Idx)))
   4388         return false;
   4389     }
   4390   }
   4391   return true;
   4392 }
   4393 
   4394 /// getZeroVector - Returns a vector of specified type with all zero elements.
   4395 ///
   4396 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
   4397                              SelectionDAG &DAG, DebugLoc dl) {
   4398   assert(VT.isVector() && "Expected a vector type");
   4399 
   4400   // Always build SSE zero vectors as <4 x i32> bitcasted
   4401   // to their dest type. This ensures they get CSE'd.
   4402   SDValue Vec;
   4403   if (VT.is128BitVector()) {  // SSE
   4404     if (Subtarget->hasSSE2()) {  // SSE2
   4405       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4406       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4407     } else { // SSE1
   4408       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
   4409       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
   4410     }
   4411   } else if (VT.is256BitVector()) { // AVX
   4412     if (Subtarget->hasInt256()) { // AVX2
   4413       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
   4414       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4415       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
   4416     } else {
   4417       // 256-bit logic and arithmetic instructions in AVX are all
   4418       // floating-point, no support for integer ops. Emit fp zeroed vectors.
   4419       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
   4420       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4421       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
   4422     }
   4423   } else
   4424     llvm_unreachable("Unexpected vector type");
   4425 
   4426   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4427 }
   4428 
   4429 /// getOnesVector - Returns a vector of specified type with all bits set.
   4430 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
   4431 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
   4432 /// Then bitcast to their original type, ensuring they get CSE'd.
   4433 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
   4434                              DebugLoc dl) {
   4435   assert(VT.isVector() && "Expected a vector type");
   4436 
   4437   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
   4438   SDValue Vec;
   4439   if (VT.is256BitVector()) {
   4440     if (HasInt256) { // AVX2
   4441       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4442       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
   4443     } else { // AVX
   4444       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4445       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
   4446     }
   4447   } else if (VT.is128BitVector()) {
   4448     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4449   } else
   4450     llvm_unreachable("Unexpected vector type");
   4451 
   4452   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4453 }
   4454 
   4455 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
   4456 /// that point to V2 points to its first element.
   4457 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
   4458   for (unsigned i = 0; i != NumElems; ++i) {
   4459     if (Mask[i] > (int)NumElems) {
   4460       Mask[i] = NumElems;
   4461     }
   4462   }
   4463 }
   4464 
   4465 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
   4466 /// operation of specified width.
   4467 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4468                        SDValue V2) {
   4469   unsigned NumElems = VT.getVectorNumElements();
   4470   SmallVector<int, 8> Mask;
   4471   Mask.push_back(NumElems);
   4472   for (unsigned i = 1; i != NumElems; ++i)
   4473     Mask.push_back(i);
   4474   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4475 }
   4476 
   4477 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
   4478 static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4479                           SDValue V2) {
   4480   unsigned NumElems = VT.getVectorNumElements();
   4481   SmallVector<int, 8> Mask;
   4482   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
   4483     Mask.push_back(i);
   4484     Mask.push_back(i + NumElems);
   4485   }
   4486   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4487 }
   4488 
   4489 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
   4490 static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   4491                           SDValue V2) {
   4492   unsigned NumElems = VT.getVectorNumElements();
   4493   SmallVector<int, 8> Mask;
   4494   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
   4495     Mask.push_back(i + Half);
   4496     Mask.push_back(i + NumElems + Half);
   4497   }
   4498   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4499 }
   4500 
   4501 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
   4502 // a generic shuffle instruction because the target has no such instructions.
   4503 // Generate shuffles which repeat i16 and i8 several times until they can be
   4504 // represented by v4f32 and then be manipulated by target suported shuffles.
   4505 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
   4506   EVT VT = V.getValueType();
   4507   int NumElems = VT.getVectorNumElements();
   4508   DebugLoc dl = V.getDebugLoc();
   4509 
   4510   while (NumElems > 4) {
   4511     if (EltNo < NumElems/2) {
   4512       V = getUnpackl(DAG, dl, VT, V, V);
   4513     } else {
   4514       V = getUnpackh(DAG, dl, VT, V, V);
   4515       EltNo -= NumElems/2;
   4516     }
   4517     NumElems >>= 1;
   4518   }
   4519   return V;
   4520 }
   4521 
   4522 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
   4523 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
   4524   EVT VT = V.getValueType();
   4525   DebugLoc dl = V.getDebugLoc();
   4526 
   4527   if (VT.is128BitVector()) {
   4528     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
   4529     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
   4530     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
   4531                              &SplatMask[0]);
   4532   } else if (VT.is256BitVector()) {
   4533     // To use VPERMILPS to splat scalars, the second half of indicies must
   4534     // refer to the higher part, which is a duplication of the lower one,
   4535     // because VPERMILPS can only handle in-lane permutations.
   4536     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
   4537                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
   4538 
   4539     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
   4540     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
   4541                              &SplatMask[0]);
   4542   } else
   4543     llvm_unreachable("Vector size not supported");
   4544 
   4545   return DAG.getNode(ISD::BITCAST, dl, VT, V);
   4546 }
   4547 
   4548 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
   4549 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   4550   EVT SrcVT = SV->getValueType(0);
   4551   SDValue V1 = SV->getOperand(0);
   4552   DebugLoc dl = SV->getDebugLoc();
   4553 
   4554   int EltNo = SV->getSplatIndex();
   4555   int NumElems = SrcVT.getVectorNumElements();
   4556   bool Is256BitVec = SrcVT.is256BitVector();
   4557 
   4558   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
   4559          "Unknown how to promote splat for type");
   4560 
   4561   // Extract the 128-bit part containing the splat element and update
   4562   // the splat element index when it refers to the higher register.
   4563   if (Is256BitVec) {
   4564     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
   4565     if (EltNo >= NumElems/2)
   4566       EltNo -= NumElems/2;
   4567   }
   4568 
   4569   // All i16 and i8 vector types can't be used directly by a generic shuffle
   4570   // instruction because the target has no such instruction. Generate shuffles
   4571   // which repeat i16 and i8 several times until they fit in i32, and then can
   4572   // be manipulated by target suported shuffles.
   4573   EVT EltVT = SrcVT.getVectorElementType();
   4574   if (EltVT == MVT::i8 || EltVT == MVT::i16)
   4575     V1 = PromoteSplati8i16(V1, DAG, EltNo);
   4576 
   4577   // Recreate the 256-bit vector and place the same 128-bit vector
   4578   // into the low and high part. This is necessary because we want
   4579   // to use VPERM* to shuffle the vectors
   4580   if (Is256BitVec) {
   4581     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
   4582   }
   4583 
   4584   return getLegalSplat(DAG, V1, EltNo);
   4585 }
   4586 
   4587 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
   4588 /// vector of zero or undef vector.  This produces a shuffle where the low
   4589 /// element of V2 is swizzled into the zero/undef vector, landing at element
   4590 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
   4591 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
   4592                                            bool IsZero,
   4593                                            const X86Subtarget *Subtarget,
   4594                                            SelectionDAG &DAG) {
   4595   EVT VT = V2.getValueType();
   4596   SDValue V1 = IsZero
   4597     ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
   4598   unsigned NumElems = VT.getVectorNumElements();
   4599   SmallVector<int, 16> MaskVec;
   4600   for (unsigned i = 0; i != NumElems; ++i)
   4601     // If this is the insertion idx, put the low elt of V2 here.
   4602     MaskVec.push_back(i == Idx ? NumElems : i);
   4603   return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
   4604 }
   4605 
   4606 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
   4607 /// target specific opcode. Returns true if the Mask could be calculated.
   4608 /// Sets IsUnary to true if only uses one source.
   4609 static bool getTargetShuffleMask(SDNode *N, MVT VT,
   4610                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   4611   unsigned NumElems = VT.getVectorNumElements();
   4612   SDValue ImmN;
   4613 
   4614   IsUnary = false;
   4615   switch(N->getOpcode()) {
   4616   case X86ISD::SHUFP:
   4617     ImmN = N->getOperand(N->getNumOperands()-1);
   4618     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4619     break;
   4620   case X86ISD::UNPCKH:
   4621     DecodeUNPCKHMask(VT, Mask);
   4622     break;
   4623   case X86ISD::UNPCKL:
   4624     DecodeUNPCKLMask(VT, Mask);
   4625     break;
   4626   case X86ISD::MOVHLPS:
   4627     DecodeMOVHLPSMask(NumElems, Mask);
   4628     break;
   4629   case X86ISD::MOVLHPS:
   4630     DecodeMOVLHPSMask(NumElems, Mask);
   4631     break;
   4632   case X86ISD::PALIGNR:
   4633     ImmN = N->getOperand(N->getNumOperands()-1);
   4634     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4635     break;
   4636   case X86ISD::PSHUFD:
   4637   case X86ISD::VPERMILP:
   4638     ImmN = N->getOperand(N->getNumOperands()-1);
   4639     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4640     IsUnary = true;
   4641     break;
   4642   case X86ISD::PSHUFHW:
   4643     ImmN = N->getOperand(N->getNumOperands()-1);
   4644     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4645     IsUnary = true;
   4646     break;
   4647   case X86ISD::PSHUFLW:
   4648     ImmN = N->getOperand(N->getNumOperands()-1);
   4649     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4650     IsUnary = true;
   4651     break;
   4652   case X86ISD::VPERMI:
   4653     ImmN = N->getOperand(N->getNumOperands()-1);
   4654     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4655     IsUnary = true;
   4656     break;
   4657   case X86ISD::MOVSS:
   4658   case X86ISD::MOVSD: {
   4659     // The index 0 always comes from the first element of the second source,
   4660     // this is why MOVSS and MOVSD are used in the first place. The other
   4661     // elements come from the other positions of the first source vector
   4662     Mask.push_back(NumElems);
   4663     for (unsigned i = 1; i != NumElems; ++i) {
   4664       Mask.push_back(i);
   4665     }
   4666     break;
   4667   }
   4668   case X86ISD::VPERM2X128:
   4669     ImmN = N->getOperand(N->getNumOperands()-1);
   4670     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4671     if (Mask.empty()) return false;
   4672     break;
   4673   case X86ISD::MOVDDUP:
   4674   case X86ISD::MOVLHPD:
   4675   case X86ISD::MOVLPD:
   4676   case X86ISD::MOVLPS:
   4677   case X86ISD::MOVSHDUP:
   4678   case X86ISD::MOVSLDUP:
   4679     // Not yet implemented
   4680     return false;
   4681   default: llvm_unreachable("unknown target shuffle node");
   4682   }
   4683 
   4684   return true;
   4685 }
   4686 
   4687 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
   4688 /// element of the result of the vector shuffle.
   4689 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   4690                                    unsigned Depth) {
   4691   if (Depth == 6)
   4692     return SDValue();  // Limit search depth.
   4693 
   4694   SDValue V = SDValue(N, 0);
   4695   EVT VT = V.getValueType();
   4696   unsigned Opcode = V.getOpcode();
   4697 
   4698   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
   4699   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
   4700     int Elt = SV->getMaskElt(Index);
   4701 
   4702     if (Elt < 0)
   4703       return DAG.getUNDEF(VT.getVectorElementType());
   4704 
   4705     unsigned NumElems = VT.getVectorNumElements();
   4706     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
   4707                                          : SV->getOperand(1);
   4708     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   4709   }
   4710 
   4711   // Recurse into target specific vector shuffles to find scalars.
   4712   if (isTargetShuffle(Opcode)) {
   4713     MVT ShufVT = V.getValueType().getSimpleVT();
   4714     unsigned NumElems = ShufVT.getVectorNumElements();
   4715     SmallVector<int, 16> ShuffleMask;
   4716     bool IsUnary;
   4717 
   4718     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
   4719       return SDValue();
   4720 
   4721     int Elt = ShuffleMask[Index];
   4722     if (Elt < 0)
   4723       return DAG.getUNDEF(ShufVT.getVectorElementType());
   4724 
   4725     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
   4726                                          : N->getOperand(1);
   4727     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
   4728                                Depth+1);
   4729   }
   4730 
   4731   // Actual nodes that may contain scalar elements
   4732   if (Opcode == ISD::BITCAST) {
   4733     V = V.getOperand(0);
   4734     EVT SrcVT = V.getValueType();
   4735     unsigned NumElems = VT.getVectorNumElements();
   4736 
   4737     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
   4738       return SDValue();
   4739   }
   4740 
   4741   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   4742     return (Index == 0) ? V.getOperand(0)
   4743                         : DAG.getUNDEF(VT.getVectorElementType());
   4744 
   4745   if (V.getOpcode() == ISD::BUILD_VECTOR)
   4746     return V.getOperand(Index);
   4747 
   4748   return SDValue();
   4749 }
   4750 
   4751 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
   4752 /// shuffle operation which come from a consecutively from a zero. The
   4753 /// search can start in two different directions, from left or right.
   4754 static
   4755 unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
   4756                                   bool ZerosFromLeft, SelectionDAG &DAG) {
   4757   unsigned i;
   4758   for (i = 0; i != NumElems; ++i) {
   4759     unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
   4760     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
   4761     if (!(Elt.getNode() &&
   4762          (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
   4763       break;
   4764   }
   4765 
   4766   return i;
   4767 }
   4768 
   4769 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
   4770 /// correspond consecutively to elements from one of the vector operands,
   4771 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
   4772 static
   4773 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
   4774                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
   4775                               unsigned NumElems, unsigned &OpNum) {
   4776   bool SeenV1 = false;
   4777   bool SeenV2 = false;
   4778 
   4779   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
   4780     int Idx = SVOp->getMaskElt(i);
   4781     // Ignore undef indicies
   4782     if (Idx < 0)
   4783       continue;
   4784 
   4785     if (Idx < (int)NumElems)
   4786       SeenV1 = true;
   4787     else
   4788       SeenV2 = true;
   4789 
   4790     // Only accept consecutive elements from the same vector
   4791     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
   4792       return false;
   4793   }
   4794 
   4795   OpNum = SeenV1 ? 0 : 1;
   4796   return true;
   4797 }
   4798 
   4799 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
   4800 /// logical left shift of a vector.
   4801 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4802                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   4803   unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
   4804   unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
   4805               false /* check zeros from right */, DAG);
   4806   unsigned OpSrc;
   4807 
   4808   if (!NumZeros)
   4809     return false;
   4810 
   4811   // Considering the elements in the mask that are not consecutive zeros,
   4812   // check if they consecutively come from only one of the source vectors.
   4813   //
   4814   //               V1 = {X, A, B, C}     0
   4815   //                         \  \  \    /
   4816   //   vector_shuffle V1, V2 <1, 2, 3, X>
   4817   //
   4818   if (!isShuffleMaskConsecutive(SVOp,
   4819             0,                   // Mask Start Index
   4820             NumElems-NumZeros,   // Mask End Index(exclusive)
   4821             NumZeros,            // Where to start looking in the src vector
   4822             NumElems,            // Number of elements in vector
   4823             OpSrc))              // Which source operand ?
   4824     return false;
   4825 
   4826   isLeft = false;
   4827   ShAmt = NumZeros;
   4828   ShVal = SVOp->getOperand(OpSrc);
   4829   return true;
   4830 }
   4831 
   4832 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
   4833 /// logical left shift of a vector.
   4834 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4835                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   4836   unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
   4837   unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
   4838               true /* check zeros from left */, DAG);
   4839   unsigned OpSrc;
   4840 
   4841   if (!NumZeros)
   4842     return false;
   4843 
   4844   // Considering the elements in the mask that are not consecutive zeros,
   4845   // check if they consecutively come from only one of the source vectors.
   4846   //
   4847   //                           0    { A, B, X, X } = V2
   4848   //                          / \    /  /
   4849   //   vector_shuffle V1, V2 <X, X, 4, 5>
   4850   //
   4851   if (!isShuffleMaskConsecutive(SVOp,
   4852             NumZeros,     // Mask Start Index
   4853             NumElems,     // Mask End Index(exclusive)
   4854             0,            // Where to start looking in the src vector
   4855             NumElems,     // Number of elements in vector
   4856             OpSrc))       // Which source operand ?
   4857     return false;
   4858 
   4859   isLeft = true;
   4860   ShAmt = NumZeros;
   4861   ShVal = SVOp->getOperand(OpSrc);
   4862   return true;
   4863 }
   4864 
   4865 /// isVectorShift - Returns true if the shuffle can be implemented as a
   4866 /// logical left or right shift of a vector.
   4867 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
   4868                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   4869   // Although the logic below support any bitwidth size, there are no
   4870   // shift instructions which handle more than 128-bit vectors.
   4871   if (!SVOp->getValueType(0).is128BitVector())
   4872     return false;
   4873 
   4874   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
   4875       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
   4876     return true;
   4877 
   4878   return false;
   4879 }
   4880 
   4881 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
   4882 ///
   4883 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   4884                                        unsigned NumNonZero, unsigned NumZero,
   4885                                        SelectionDAG &DAG,
   4886                                        const X86Subtarget* Subtarget,
   4887                                        const TargetLowering &TLI) {
   4888   if (NumNonZero > 8)
   4889     return SDValue();
   4890 
   4891   DebugLoc dl = Op.getDebugLoc();
   4892   SDValue V(0, 0);
   4893   bool First = true;
   4894   for (unsigned i = 0; i < 16; ++i) {
   4895     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
   4896     if (ThisIsNonZero && First) {
   4897       if (NumZero)
   4898         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   4899       else
   4900         V = DAG.getUNDEF(MVT::v8i16);
   4901       First = false;
   4902     }
   4903 
   4904     if ((i & 1) != 0) {
   4905       SDValue ThisElt(0, 0), LastElt(0, 0);
   4906       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
   4907       if (LastIsNonZero) {
   4908         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
   4909                               MVT::i16, Op.getOperand(i-1));
   4910       }
   4911       if (ThisIsNonZero) {
   4912         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
   4913         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
   4914                               ThisElt, DAG.getConstant(8, MVT::i8));
   4915         if (LastIsNonZero)
   4916           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
   4917       } else
   4918         ThisElt = LastElt;
   4919 
   4920       if (ThisElt.getNode())
   4921         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
   4922                         DAG.getIntPtrConstant(i/2));
   4923     }
   4924   }
   4925 
   4926   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
   4927 }
   4928 
   4929 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
   4930 ///
   4931 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   4932                                      unsigned NumNonZero, unsigned NumZero,
   4933                                      SelectionDAG &DAG,
   4934                                      const X86Subtarget* Subtarget,
   4935                                      const TargetLowering &TLI) {
   4936   if (NumNonZero > 4)
   4937     return SDValue();
   4938 
   4939   DebugLoc dl = Op.getDebugLoc();
   4940   SDValue V(0, 0);
   4941   bool First = true;
   4942   for (unsigned i = 0; i < 8; ++i) {
   4943     bool isNonZero = (NonZeros & (1 << i)) != 0;
   4944     if (isNonZero) {
   4945       if (First) {
   4946         if (NumZero)
   4947           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   4948         else
   4949           V = DAG.getUNDEF(MVT::v8i16);
   4950         First = false;
   4951       }
   4952       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   4953                       MVT::v8i16, V, Op.getOperand(i),
   4954                       DAG.getIntPtrConstant(i));
   4955     }
   4956   }
   4957 
   4958   return V;
   4959 }
   4960 
   4961 /// getVShift - Return a vector logical shift node.
   4962 ///
   4963 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   4964                          unsigned NumBits, SelectionDAG &DAG,
   4965                          const TargetLowering &TLI, DebugLoc dl) {
   4966   assert(VT.is128BitVector() && "Unknown type for VShift");
   4967   EVT ShVT = MVT::v2i64;
   4968   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   4969   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
   4970   return DAG.getNode(ISD::BITCAST, dl, VT,
   4971                      DAG.getNode(Opc, dl, ShVT, SrcOp,
   4972                              DAG.getConstant(NumBits,
   4973                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
   4974 }
   4975 
   4976 SDValue
   4977 X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
   4978                                           SelectionDAG &DAG) const {
   4979 
   4980   // Check if the scalar load can be widened into a vector load. And if
   4981   // the address is "base + cst" see if the cst can be "absorbed" into
   4982   // the shuffle mask.
   4983   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
   4984     SDValue Ptr = LD->getBasePtr();
   4985     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
   4986       return SDValue();
   4987     EVT PVT = LD->getValueType(0);
   4988     if (PVT != MVT::i32 && PVT != MVT::f32)
   4989       return SDValue();
   4990 
   4991     int FI = -1;
   4992     int64_t Offset = 0;
   4993     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
   4994       FI = FINode->getIndex();
   4995       Offset = 0;
   4996     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
   4997                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
   4998       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
   4999       Offset = Ptr.getConstantOperandVal(1);
   5000       Ptr = Ptr.getOperand(0);
   5001     } else {
   5002       return SDValue();
   5003     }
   5004 
   5005     // FIXME: 256-bit vector instructions don't require a strict alignment,
   5006     // improve this code to support it better.
   5007     unsigned RequiredAlign = VT.getSizeInBits()/8;
   5008     SDValue Chain = LD->getChain();
   5009     // Make sure the stack object alignment is at least 16 or 32.
   5010     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   5011     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
   5012       if (MFI->isFixedObjectIndex(FI)) {
   5013         // Can't change the alignment. FIXME: It's possible to compute
   5014         // the exact stack offset and reference FI + adjust offset instead.
   5015         // If someone *really* cares about this. That's the way to implement it.
   5016         return SDValue();
   5017       } else {
   5018         MFI->setObjectAlignment(FI, RequiredAlign);
   5019       }
   5020     }
   5021 
   5022     // (Offset % 16 or 32) must be multiple of 4. Then address is then
   5023     // Ptr + (Offset & ~15).
   5024     if (Offset < 0)
   5025       return SDValue();
   5026     if ((Offset % RequiredAlign) & 3)
   5027       return SDValue();
   5028     int64_t StartOffset = Offset & ~(RequiredAlign-1);
   5029     if (StartOffset)
   5030       Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
   5031                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
   5032 
   5033     int EltNo = (Offset - StartOffset) >> 2;
   5034     unsigned NumElems = VT.getVectorNumElements();
   5035 
   5036     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
   5037     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
   5038                              LD->getPointerInfo().getWithOffset(StartOffset),
   5039                              false, false, false, 0);
   5040 
   5041     SmallVector<int, 8> Mask;
   5042     for (unsigned i = 0; i != NumElems; ++i)
   5043       Mask.push_back(EltNo);
   5044 
   5045     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   5046   }
   5047 
   5048   return SDValue();
   5049 }
   5050 
   5051 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
   5052 /// vector of type 'VT', see if the elements can be replaced by a single large
   5053 /// load which has the same value as a build_vector whose operands are 'elts'.
   5054 ///
   5055 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
   5056 ///
   5057 /// FIXME: we'd also like to handle the case where the last elements are zero
   5058 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
   5059 /// There's even a handy isZeroNode for that purpose.
   5060 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   5061                                         DebugLoc &DL, SelectionDAG &DAG) {
   5062   EVT EltVT = VT.getVectorElementType();
   5063   unsigned NumElems = Elts.size();
   5064 
   5065   LoadSDNode *LDBase = NULL;
   5066   unsigned LastLoadedElt = -1U;
   5067 
   5068   // For each element in the initializer, see if we've found a load or an undef.
   5069   // If we don't find an initial load element, or later load elements are
   5070   // non-consecutive, bail out.
   5071   for (unsigned i = 0; i < NumElems; ++i) {
   5072     SDValue Elt = Elts[i];
   5073 
   5074     if (!Elt.getNode() ||
   5075         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
   5076       return SDValue();
   5077     if (!LDBase) {
   5078       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
   5079         return SDValue();
   5080       LDBase = cast<LoadSDNode>(Elt.getNode());
   5081       LastLoadedElt = i;
   5082       continue;
   5083     }
   5084     if (Elt.getOpcode() == ISD::UNDEF)
   5085       continue;
   5086 
   5087     LoadSDNode *LD = cast<LoadSDNode>(Elt);
   5088     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
   5089       return SDValue();
   5090     LastLoadedElt = i;
   5091   }
   5092 
   5093   // If we have found an entire vector of loads and undefs, then return a large
   5094   // load of the entire vector width starting at the base pointer.  If we found
   5095   // consecutive loads for the low half, generate a vzext_load node.
   5096   if (LastLoadedElt == NumElems - 1) {
   5097     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
   5098       return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   5099                          LDBase->getPointerInfo(),
   5100                          LDBase->isVolatile(), LDBase->isNonTemporal(),
   5101                          LDBase->isInvariant(), 0);
   5102     return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   5103                        LDBase->getPointerInfo(),
   5104                        LDBase->isVolatile(), LDBase->isNonTemporal(),
   5105                        LDBase->isInvariant(), LDBase->getAlignment());
   5106   }
   5107   if (NumElems == 4 && LastLoadedElt == 1 &&
   5108       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
   5109     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
   5110     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
   5111     SDValue ResNode =
   5112         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
   5113                                 LDBase->getPointerInfo(),
   5114                                 LDBase->getAlignment(),
   5115                                 false/*isVolatile*/, true/*ReadMem*/,
   5116                                 false/*WriteMem*/);
   5117 
   5118     // Make sure the newly-created LOAD is in the same position as LDBase in
   5119     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
   5120     // update uses of LDBase's output chain to use the TokenFactor.
   5121     if (LDBase->hasAnyUseOfValue(1)) {
   5122       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   5123                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
   5124       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   5125       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   5126                              SDValue(ResNode.getNode(), 1));
   5127     }
   5128 
   5129     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
   5130   }
   5131   return SDValue();
   5132 }
   5133 
   5134 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
   5135 /// to generate a splat value for the following cases:
   5136 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
   5137 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
   5138 /// a scalar load, or a constant.
   5139 /// The VBROADCAST node is returned when a pattern is found,
   5140 /// or SDValue() otherwise.
   5141 SDValue
   5142 X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
   5143   if (!Subtarget->hasFp256())
   5144     return SDValue();
   5145 
   5146   MVT VT = Op.getValueType().getSimpleVT();
   5147   DebugLoc dl = Op.getDebugLoc();
   5148 
   5149   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   5150          "Unsupported vector type for broadcast.");
   5151 
   5152   SDValue Ld;
   5153   bool ConstSplatVal;
   5154 
   5155   switch (Op.getOpcode()) {
   5156     default:
   5157       // Unknown pattern found.
   5158       return SDValue();
   5159 
   5160     case ISD::BUILD_VECTOR: {
   5161       // The BUILD_VECTOR node must be a splat.
   5162       if (!isSplatVector(Op.getNode()))
   5163         return SDValue();
   5164 
   5165       Ld = Op.getOperand(0);
   5166       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   5167                      Ld.getOpcode() == ISD::ConstantFP);
   5168 
   5169       // The suspected load node has several users. Make sure that all
   5170       // of its users are from the BUILD_VECTOR node.
   5171       // Constants may have multiple users.
   5172       if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
   5173         return SDValue();
   5174       break;
   5175     }
   5176 
   5177     case ISD::VECTOR_SHUFFLE: {
   5178       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5179 
   5180       // Shuffles must have a splat mask where the first element is
   5181       // broadcasted.
   5182       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
   5183         return SDValue();
   5184 
   5185       SDValue Sc = Op.getOperand(0);
   5186       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
   5187           Sc.getOpcode() != ISD::BUILD_VECTOR) {
   5188 
   5189         if (!Subtarget->hasInt256())
   5190           return SDValue();
   5191 
   5192         // Use the register form of the broadcast instruction available on AVX2.
   5193         if (VT.is256BitVector())
   5194           Sc = Extract128BitVector(Sc, 0, DAG, dl);
   5195         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
   5196       }
   5197 
   5198       Ld = Sc.getOperand(0);
   5199       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   5200                        Ld.getOpcode() == ISD::ConstantFP);
   5201 
   5202       // The scalar_to_vector node and the suspected
   5203       // load node must have exactly one user.
   5204       // Constants may have multiple users.
   5205       if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
   5206         return SDValue();
   5207       break;
   5208     }
   5209   }
   5210 
   5211   bool Is256 = VT.is256BitVector();
   5212 
   5213   // Handle the broadcasting a single constant scalar from the constant pool
   5214   // into a vector. On Sandybridge it is still better to load a constant vector
   5215   // from the constant pool and not to broadcast it from a scalar.
   5216   if (ConstSplatVal && Subtarget->hasInt256()) {
   5217     EVT CVT = Ld.getValueType();
   5218     assert(!CVT.isVector() && "Must not broadcast a vector type");
   5219     unsigned ScalarSize = CVT.getSizeInBits();
   5220 
   5221     if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
   5222       const Constant *C = 0;
   5223       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
   5224         C = CI->getConstantIntValue();
   5225       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
   5226         C = CF->getConstantFPValue();
   5227 
   5228       assert(C && "Invalid constant type");
   5229 
   5230       SDValue CP = DAG.getConstantPool(C, getPointerTy());
   5231       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   5232       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
   5233                        MachinePointerInfo::getConstantPool(),
   5234                        false, false, false, Alignment);
   5235 
   5236       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5237     }
   5238   }
   5239 
   5240   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
   5241   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   5242 
   5243   // Handle AVX2 in-register broadcasts.
   5244   if (!IsLoad && Subtarget->hasInt256() &&
   5245       (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
   5246     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5247 
   5248   // The scalar source must be a normal load.
   5249   if (!IsLoad)
   5250     return SDValue();
   5251 
   5252   if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
   5253     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5254 
   5255   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
   5256   // double since there is no vbroadcastsd xmm
   5257   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
   5258     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
   5259       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5260   }
   5261 
   5262   // Unsupported broadcast.
   5263   return SDValue();
   5264 }
   5265 
   5266 SDValue
   5267 X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
   5268   EVT VT = Op.getValueType();
   5269 
   5270   // Skip if insert_vec_elt is not supported.
   5271   if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
   5272     return SDValue();
   5273 
   5274   DebugLoc DL = Op.getDebugLoc();
   5275   unsigned NumElems = Op.getNumOperands();
   5276 
   5277   SDValue VecIn1;
   5278   SDValue VecIn2;
   5279   SmallVector<unsigned, 4> InsertIndices;
   5280   SmallVector<int, 8> Mask(NumElems, -1);
   5281 
   5282   for (unsigned i = 0; i != NumElems; ++i) {
   5283     unsigned Opc = Op.getOperand(i).getOpcode();
   5284 
   5285     if (Opc == ISD::UNDEF)
   5286       continue;
   5287 
   5288     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
   5289       // Quit if more than 1 elements need inserting.
   5290       if (InsertIndices.size() > 1)
   5291         return SDValue();
   5292 
   5293       InsertIndices.push_back(i);
   5294       continue;
   5295     }
   5296 
   5297     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
   5298     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
   5299 
   5300     // Quit if extracted from vector of different type.
   5301     if (ExtractedFromVec.getValueType() != VT)
   5302       return SDValue();
   5303 
   5304     // Quit if non-constant index.
   5305     if (!isa<ConstantSDNode>(ExtIdx))
   5306       return SDValue();
   5307 
   5308     if (VecIn1.getNode() == 0)
   5309       VecIn1 = ExtractedFromVec;
   5310     else if (VecIn1 != ExtractedFromVec) {
   5311       if (VecIn2.getNode() == 0)
   5312         VecIn2 = ExtractedFromVec;
   5313       else if (VecIn2 != ExtractedFromVec)
   5314         // Quit if more than 2 vectors to shuffle
   5315         return SDValue();
   5316     }
   5317 
   5318     unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
   5319 
   5320     if (ExtractedFromVec == VecIn1)
   5321       Mask[i] = Idx;
   5322     else if (ExtractedFromVec == VecIn2)
   5323       Mask[i] = Idx + NumElems;
   5324   }
   5325 
   5326   if (VecIn1.getNode() == 0)
   5327     return SDValue();
   5328 
   5329   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
   5330   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
   5331   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
   5332     unsigned Idx = InsertIndices[i];
   5333     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
   5334                      DAG.getIntPtrConstant(Idx));
   5335   }
   5336 
   5337   return NV;
   5338 }
   5339 
   5340 SDValue
   5341 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   5342   DebugLoc dl = Op.getDebugLoc();
   5343 
   5344   MVT VT = Op.getValueType().getSimpleVT();
   5345   MVT ExtVT = VT.getVectorElementType();
   5346   unsigned NumElems = Op.getNumOperands();
   5347 
   5348   // Vectors containing all zeros can be matched by pxor and xorps later
   5349   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   5350     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
   5351     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
   5352     if (VT == MVT::v4i32 || VT == MVT::v8i32)
   5353       return Op;
   5354 
   5355     return getZeroVector(VT, Subtarget, DAG, dl);
   5356   }
   5357 
   5358   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   5359   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   5360   // vpcmpeqd on 256-bit vectors.
   5361   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
   5362     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
   5363       return Op;
   5364 
   5365     return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
   5366   }
   5367 
   5368   SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
   5369   if (Broadcast.getNode())
   5370     return Broadcast;
   5371 
   5372   unsigned EVTBits = ExtVT.getSizeInBits();
   5373 
   5374   unsigned NumZero  = 0;
   5375   unsigned NumNonZero = 0;
   5376   unsigned NonZeros = 0;
   5377   bool IsAllConstants = true;
   5378   SmallSet<SDValue, 8> Values;
   5379   for (unsigned i = 0; i < NumElems; ++i) {
   5380     SDValue Elt = Op.getOperand(i);
   5381     if (Elt.getOpcode() == ISD::UNDEF)
   5382       continue;
   5383     Values.insert(Elt);
   5384     if (Elt.getOpcode() != ISD::Constant &&
   5385         Elt.getOpcode() != ISD::ConstantFP)
   5386       IsAllConstants = false;
   5387     if (X86::isZeroNode(Elt))
   5388       NumZero++;
   5389     else {
   5390       NonZeros |= (1 << i);
   5391       NumNonZero++;
   5392     }
   5393   }
   5394 
   5395   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
   5396   if (NumNonZero == 0)
   5397     return DAG.getUNDEF(VT);
   5398 
   5399   // Special case for single non-zero, non-undef, element.
   5400   if (NumNonZero == 1) {
   5401     unsigned Idx = CountTrailingZeros_32(NonZeros);
   5402     SDValue Item = Op.getOperand(Idx);
   5403 
   5404     // If this is an insertion of an i64 value on x86-32, and if the top bits of
   5405     // the value are obviously zero, truncate the value to i32 and do the
   5406     // insertion that way.  Only do this if the value is non-constant or if the
   5407     // value is a constant being inserted into element 0.  It is cheaper to do
   5408     // a constant pool load than it is to do a movd + shuffle.
   5409     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
   5410         (!IsAllConstants || Idx == 0)) {
   5411       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
   5412         // Handle SSE only.
   5413         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
   5414         EVT VecVT = MVT::v4i32;
   5415         unsigned VecElts = 4;
   5416 
   5417         // Truncate the value (which may itself be a constant) to i32, and
   5418         // convert it to a vector with movd (S2V+shuffle to zero extend).
   5419         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
   5420         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
   5421         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5422 
   5423         // Now we have our 32-bit value zero extended in the low element of
   5424         // a vector.  If Idx != 0, swizzle it into place.
   5425         if (Idx != 0) {
   5426           SmallVector<int, 4> Mask;
   5427           Mask.push_back(Idx);
   5428           for (unsigned i = 1; i != VecElts; ++i)
   5429             Mask.push_back(i);
   5430           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
   5431                                       &Mask[0]);
   5432         }
   5433         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   5434       }
   5435     }
   5436 
   5437     // If we have a constant or non-constant insertion into the low element of
   5438     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
   5439     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
   5440     // depending on what the source datatype is.
   5441     if (Idx == 0) {
   5442       if (NumZero == 0)
   5443         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5444 
   5445       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
   5446           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
   5447         if (VT.is256BitVector()) {
   5448           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
   5449           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
   5450                              Item, DAG.getIntPtrConstant(0));
   5451         }
   5452         assert(VT.is128BitVector() && "Expected an SSE value type!");
   5453         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5454         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
   5455         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5456       }
   5457 
   5458       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
   5459         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
   5460         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   5461         if (VT.is256BitVector()) {
   5462           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
   5463           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
   5464         } else {
   5465           assert(VT.is128BitVector() && "Expected an SSE value type!");
   5466           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5467         }
   5468         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   5469       }
   5470     }
   5471 
   5472     // Is it a vector logical left shift?
   5473     if (NumElems == 2 && Idx == 1 &&
   5474         X86::isZeroNode(Op.getOperand(0)) &&
   5475         !X86::isZeroNode(Op.getOperand(1))) {
   5476       unsigned NumBits = VT.getSizeInBits();
   5477       return getVShift(true, VT,
   5478                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   5479                                    VT, Op.getOperand(1)),
   5480                        NumBits/2, DAG, *this, dl);
   5481     }
   5482 
   5483     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
   5484       return SDValue();
   5485 
   5486     // Otherwise, if this is a vector with i32 or f32 elements, and the element
   5487     // is a non-constant being inserted into an element other than the low one,
   5488     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
   5489     // movd/movss) to move this into the low element, then shuffle it into
   5490     // place.
   5491     if (EVTBits == 32) {
   5492       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5493 
   5494       // Turn it into a shuffle of zero and zero-extended scalar to vector.
   5495       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
   5496       SmallVector<int, 8> MaskVec;
   5497       for (unsigned i = 0; i != NumElems; ++i)
   5498         MaskVec.push_back(i == Idx ? 0 : 1);
   5499       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
   5500     }
   5501   }
   5502 
   5503   // Splat is obviously ok. Let legalizer expand it to a shuffle.
   5504   if (Values.size() == 1) {
   5505     if (EVTBits == 32) {
   5506       // Instead of a shuffle like this:
   5507       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
   5508       // Check if it's possible to issue this instead.
   5509       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
   5510       unsigned Idx = CountTrailingZeros_32(NonZeros);
   5511       SDValue Item = Op.getOperand(Idx);
   5512       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
   5513         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
   5514     }
   5515     return SDValue();
   5516   }
   5517 
   5518   // A vector full of immediates; various special cases are already
   5519   // handled, so this is best done with a single constant-pool load.
   5520   if (IsAllConstants)
   5521     return SDValue();
   5522 
   5523   // For AVX-length vectors, build the individual 128-bit pieces and use
   5524   // shuffles to put them in place.
   5525   if (VT.is256BitVector()) {
   5526     SmallVector<SDValue, 32> V;
   5527     for (unsigned i = 0; i != NumElems; ++i)
   5528       V.push_back(Op.getOperand(i));
   5529 
   5530     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
   5531 
   5532     // Build both the lower and upper subvector.
   5533     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
   5534     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
   5535                                 NumElems/2);
   5536 
   5537     // Recreate the wider vector with the lower and upper part.
   5538     return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   5539   }
   5540 
   5541   // Let legalizer expand 2-wide build_vectors.
   5542   if (EVTBits == 64) {
   5543     if (NumNonZero == 1) {
   5544       // One half is zero or undef.
   5545       unsigned Idx = CountTrailingZeros_32(NonZeros);
   5546       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
   5547                                  Op.getOperand(Idx));
   5548       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
   5549     }
   5550     return SDValue();
   5551   }
   5552 
   5553   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   5554   if (EVTBits == 8 && NumElems == 16) {
   5555     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
   5556                                         Subtarget, *this);
   5557     if (V.getNode()) return V;
   5558   }
   5559 
   5560   if (EVTBits == 16 && NumElems == 8) {
   5561     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
   5562                                       Subtarget, *this);
   5563     if (V.getNode()) return V;
   5564   }
   5565 
   5566   // If element VT is == 32 bits, turn it into a number of shuffles.
   5567   SmallVector<SDValue, 8> V(NumElems);
   5568   if (NumElems == 4 && NumZero > 0) {
   5569     for (unsigned i = 0; i < 4; ++i) {
   5570       bool isZero = !(NonZeros & (1 << i));
   5571       if (isZero)
   5572         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
   5573       else
   5574         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   5575     }
   5576 
   5577     for (unsigned i = 0; i < 2; ++i) {
   5578       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
   5579         default: break;
   5580         case 0:
   5581           V[i] = V[i*2];  // Must be a zero vector.
   5582           break;
   5583         case 1:
   5584           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
   5585           break;
   5586         case 2:
   5587           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
   5588           break;
   5589         case 3:
   5590           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
   5591           break;
   5592       }
   5593     }
   5594 
   5595     bool Reverse1 = (NonZeros & 0x3) == 2;
   5596     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
   5597     int MaskVec[] = {
   5598       Reverse1 ? 1 : 0,
   5599       Reverse1 ? 0 : 1,
   5600       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
   5601       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
   5602     };
   5603     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
   5604   }
   5605 
   5606   if (Values.size() > 1 && VT.is128BitVector()) {
   5607     // Check for a build vector of consecutive loads.
   5608     for (unsigned i = 0; i < NumElems; ++i)
   5609       V[i] = Op.getOperand(i);
   5610 
   5611     // Check for elements which are consecutive loads.
   5612     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
   5613     if (LD.getNode())
   5614       return LD;
   5615 
   5616     // Check for a build vector from mostly shuffle plus few inserting.
   5617     SDValue Sh = buildFromShuffleMostly(Op, DAG);
   5618     if (Sh.getNode())
   5619       return Sh;
   5620 
   5621     // For SSE 4.1, use insertps to put the high elements into the low element.
   5622     if (getSubtarget()->hasSSE41()) {
   5623       SDValue Result;
   5624       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
   5625         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
   5626       else
   5627         Result = DAG.getUNDEF(VT);
   5628 
   5629       for (unsigned i = 1; i < NumElems; ++i) {
   5630         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
   5631         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
   5632                              Op.getOperand(i), DAG.getIntPtrConstant(i));
   5633       }
   5634       return Result;
   5635     }
   5636 
   5637     // Otherwise, expand into a number of unpckl*, start by extending each of
   5638     // our (non-undef) elements to the full vector width with the element in the
   5639     // bottom slot of the vector (which generates no code for SSE).
   5640     for (unsigned i = 0; i < NumElems; ++i) {
   5641       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
   5642         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   5643       else
   5644         V[i] = DAG.getUNDEF(VT);
   5645     }
   5646 
   5647     // Next, we iteratively mix elements, e.g. for v4f32:
   5648     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
   5649     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
   5650     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
   5651     unsigned EltStride = NumElems >> 1;
   5652     while (EltStride != 0) {
   5653       for (unsigned i = 0; i < EltStride; ++i) {
   5654         // If V[i+EltStride] is undef and this is the first round of mixing,
   5655         // then it is safe to just drop this shuffle: V[i] is already in the
   5656         // right place, the one element (since it's the first round) being
   5657         // inserted as undef can be dropped.  This isn't safe for successive
   5658         // rounds because they will permute elements within both vectors.
   5659         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
   5660             EltStride == NumElems/2)
   5661           continue;
   5662 
   5663         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
   5664       }
   5665       EltStride >>= 1;
   5666     }
   5667     return V[0];
   5668   }
   5669   return SDValue();
   5670 }
   5671 
   5672 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
   5673 // to create 256-bit vectors from two other 128-bit ones.
   5674 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   5675   DebugLoc dl = Op.getDebugLoc();
   5676   MVT ResVT = Op.getValueType().getSimpleVT();
   5677 
   5678   assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
   5679 
   5680   SDValue V1 = Op.getOperand(0);
   5681   SDValue V2 = Op.getOperand(1);
   5682   unsigned NumElems = ResVT.getVectorNumElements();
   5683 
   5684   return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   5685 }
   5686 
   5687 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   5688   assert(Op.getNumOperands() == 2);
   5689 
   5690   // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
   5691   // from two other 128-bit ones.
   5692   return LowerAVXCONCAT_VECTORS(Op, DAG);
   5693 }
   5694 
   5695 // Try to lower a shuffle node into a simple blend instruction.
   5696 static SDValue
   5697 LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
   5698                            const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   5699   SDValue V1 = SVOp->getOperand(0);
   5700   SDValue V2 = SVOp->getOperand(1);
   5701   DebugLoc dl = SVOp->getDebugLoc();
   5702   MVT VT = SVOp->getValueType(0).getSimpleVT();
   5703   MVT EltVT = VT.getVectorElementType();
   5704   unsigned NumElems = VT.getVectorNumElements();
   5705 
   5706   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
   5707     return SDValue();
   5708   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
   5709     return SDValue();
   5710 
   5711   // Check the mask for BLEND and build the value.
   5712   unsigned MaskValue = 0;
   5713   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
   5714   unsigned NumLanes = (NumElems-1)/8 + 1;
   5715   unsigned NumElemsInLane = NumElems / NumLanes;
   5716 
   5717   // Blend for v16i16 should be symetric for the both lanes.
   5718   for (unsigned i = 0; i < NumElemsInLane; ++i) {
   5719 
   5720     int SndLaneEltIdx = (NumLanes == 2) ?
   5721       SVOp->getMaskElt(i + NumElemsInLane) : -1;
   5722     int EltIdx = SVOp->getMaskElt(i);
   5723 
   5724     if ((EltIdx < 0 || EltIdx == (int)i) &&
   5725         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
   5726       continue;
   5727 
   5728     if (((unsigned)EltIdx == (i + NumElems)) &&
   5729         (SndLaneEltIdx < 0 ||
   5730          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
   5731       MaskValue |= (1<<i);
   5732     else
   5733       return SDValue();
   5734   }
   5735 
   5736   // Convert i32 vectors to floating point if it is not AVX2.
   5737   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
   5738   MVT BlendVT = VT;
   5739   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
   5740     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
   5741                                NumElems);
   5742     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
   5743     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
   5744   }
   5745 
   5746   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
   5747                             DAG.getConstant(MaskValue, MVT::i32));
   5748   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
   5749 }
   5750 
   5751 // v8i16 shuffles - Prefer shuffles in the following order:
   5752 // 1. [all]   pshuflw, pshufhw, optional move
   5753 // 2. [ssse3] 1 x pshufb
   5754 // 3. [ssse3] 2 x pshufb + 1 x por
   5755 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
   5756 static SDValue
   5757 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
   5758                          SelectionDAG &DAG) {
   5759   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5760   SDValue V1 = SVOp->getOperand(0);
   5761   SDValue V2 = SVOp->getOperand(1);
   5762   DebugLoc dl = SVOp->getDebugLoc();
   5763   SmallVector<int, 8> MaskVals;
   5764 
   5765   // Determine if more than 1 of the words in each of the low and high quadwords
   5766   // of the result come from the same quadword of one of the two inputs.  Undef
   5767   // mask values count as coming from any quadword, for better codegen.
   5768   unsigned LoQuad[] = { 0, 0, 0, 0 };
   5769   unsigned HiQuad[] = { 0, 0, 0, 0 };
   5770   std::bitset<4> InputQuads;
   5771   for (unsigned i = 0; i < 8; ++i) {
   5772     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
   5773     int EltIdx = SVOp->getMaskElt(i);
   5774     MaskVals.push_back(EltIdx);
   5775     if (EltIdx < 0) {
   5776       ++Quad[0];
   5777       ++Quad[1];
   5778       ++Quad[2];
   5779       ++Quad[3];
   5780       continue;
   5781     }
   5782     ++Quad[EltIdx / 4];
   5783     InputQuads.set(EltIdx / 4);
   5784   }
   5785 
   5786   int BestLoQuad = -1;
   5787   unsigned MaxQuad = 1;
   5788   for (unsigned i = 0; i < 4; ++i) {
   5789     if (LoQuad[i] > MaxQuad) {
   5790       BestLoQuad = i;
   5791       MaxQuad = LoQuad[i];
   5792     }
   5793   }
   5794 
   5795   int BestHiQuad = -1;
   5796   MaxQuad = 1;
   5797   for (unsigned i = 0; i < 4; ++i) {
   5798     if (HiQuad[i] > MaxQuad) {
   5799       BestHiQuad = i;
   5800       MaxQuad = HiQuad[i];
   5801     }
   5802   }
   5803 
   5804   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
   5805   // of the two input vectors, shuffle them into one input vector so only a
   5806   // single pshufb instruction is necessary. If There are more than 2 input
   5807   // quads, disable the next transformation since it does not help SSSE3.
   5808   bool V1Used = InputQuads[0] || InputQuads[1];
   5809   bool V2Used = InputQuads[2] || InputQuads[3];
   5810   if (Subtarget->hasSSSE3()) {
   5811     if (InputQuads.count() == 2 && V1Used && V2Used) {
   5812       BestLoQuad = InputQuads[0] ? 0 : 1;
   5813       BestHiQuad = InputQuads[2] ? 2 : 3;
   5814     }
   5815     if (InputQuads.count() > 2) {
   5816       BestLoQuad = -1;
   5817       BestHiQuad = -1;
   5818     }
   5819   }
   5820 
   5821   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
   5822   // the shuffle mask.  If a quad is scored as -1, that means that it contains
   5823   // words from all 4 input quadwords.
   5824   SDValue NewV;
   5825   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
   5826     int MaskV[] = {
   5827       BestLoQuad < 0 ? 0 : BestLoQuad,
   5828       BestHiQuad < 0 ? 1 : BestHiQuad
   5829     };
   5830     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
   5831                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
   5832                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
   5833     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
   5834 
   5835     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
   5836     // source words for the shuffle, to aid later transformations.
   5837     bool AllWordsInNewV = true;
   5838     bool InOrder[2] = { true, true };
   5839     for (unsigned i = 0; i != 8; ++i) {
   5840       int idx = MaskVals[i];
   5841       if (idx != (int)i)
   5842         InOrder[i/4] = false;
   5843       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
   5844         continue;
   5845       AllWordsInNewV = false;
   5846       break;
   5847     }
   5848 
   5849     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
   5850     if (AllWordsInNewV) {
   5851       for (int i = 0; i != 8; ++i) {
   5852         int idx = MaskVals[i];
   5853         if (idx < 0)
   5854           continue;
   5855         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
   5856         if ((idx != i) && idx < 4)
   5857           pshufhw = false;
   5858         if ((idx != i) && idx > 3)
   5859           pshuflw = false;
   5860       }
   5861       V1 = NewV;
   5862       V2Used = false;
   5863       BestLoQuad = 0;
   5864       BestHiQuad = 1;
   5865     }
   5866 
   5867     // If we've eliminated the use of V2, and the new mask is a pshuflw or
   5868     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
   5869     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
   5870       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
   5871       unsigned TargetMask = 0;
   5872       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
   5873                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
   5874       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5875       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
   5876                              getShufflePSHUFLWImmediate(SVOp);
   5877       V1 = NewV.getOperand(0);
   5878       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
   5879     }
   5880   }
   5881 
   5882   // Promote splats to a larger type which usually leads to more efficient code.
   5883   // FIXME: Is this true if pshufb is available?
   5884   if (SVOp->isSplat())
   5885     return PromoteSplat(SVOp, DAG);
   5886 
   5887   // If we have SSSE3, and all words of the result are from 1 input vector,
   5888   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
   5889   // is present, fall back to case 4.
   5890   if (Subtarget->hasSSSE3()) {
   5891     SmallVector<SDValue,16> pshufbMask;
   5892 
   5893     // If we have elements from both input vectors, set the high bit of the
   5894     // shuffle mask element to zero out elements that come from V2 in the V1
   5895     // mask, and elements that come from V1 in the V2 mask, so that the two
   5896     // results can be OR'd together.
   5897     bool TwoInputs = V1Used && V2Used;
   5898     for (unsigned i = 0; i != 8; ++i) {
   5899       int EltIdx = MaskVals[i] * 2;
   5900       int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
   5901       int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
   5902       pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
   5903       pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
   5904     }
   5905     V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
   5906     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
   5907                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5908                                  MVT::v16i8, &pshufbMask[0], 16));
   5909     if (!TwoInputs)
   5910       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   5911 
   5912     // Calculate the shuffle mask for the second input, shuffle it, and
   5913     // OR it with the first shuffled input.
   5914     pshufbMask.clear();
   5915     for (unsigned i = 0; i != 8; ++i) {
   5916       int EltIdx = MaskVals[i] * 2;
   5917       int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
   5918       int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
   5919       pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
   5920       pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
   5921     }
   5922     V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
   5923     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
   5924                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   5925                                  MVT::v16i8, &pshufbMask[0], 16));
   5926     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   5927     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   5928   }
   5929 
   5930   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
   5931   // and update MaskVals with new element order.
   5932   std::bitset<8> InOrder;
   5933   if (BestLoQuad >= 0) {
   5934     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
   5935     for (int i = 0; i != 4; ++i) {
   5936       int idx = MaskVals[i];
   5937       if (idx < 0) {
   5938         InOrder.set(i);
   5939       } else if ((idx / 4) == BestLoQuad) {
   5940         MaskV[i] = idx & 3;
   5941         InOrder.set(i);
   5942       }
   5943     }
   5944     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
   5945                                 &MaskV[0]);
   5946 
   5947     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
   5948       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5949       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
   5950                                   NewV.getOperand(0),
   5951                                   getShufflePSHUFLWImmediate(SVOp), DAG);
   5952     }
   5953   }
   5954 
   5955   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
   5956   // and update MaskVals with the new element order.
   5957   if (BestHiQuad >= 0) {
   5958     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
   5959     for (unsigned i = 4; i != 8; ++i) {
   5960       int idx = MaskVals[i];
   5961       if (idx < 0) {
   5962         InOrder.set(i);
   5963       } else if ((idx / 4) == BestHiQuad) {
   5964         MaskV[i] = (idx & 3) + 4;
   5965         InOrder.set(i);
   5966       }
   5967     }
   5968     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
   5969                                 &MaskV[0]);
   5970 
   5971     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
   5972       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
   5973       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
   5974                                   NewV.getOperand(0),
   5975                                   getShufflePSHUFHWImmediate(SVOp), DAG);
   5976     }
   5977   }
   5978 
   5979   // In case BestHi & BestLo were both -1, which means each quadword has a word
   5980   // from each of the four input quadwords, calculate the InOrder bitvector now
   5981   // before falling through to the insert/extract cleanup.
   5982   if (BestLoQuad == -1 && BestHiQuad == -1) {
   5983     NewV = V1;
   5984     for (int i = 0; i != 8; ++i)
   5985       if (MaskVals[i] < 0 || MaskVals[i] == i)
   5986         InOrder.set(i);
   5987   }
   5988 
   5989   // The other elements are put in the right place using pextrw and pinsrw.
   5990   for (unsigned i = 0; i != 8; ++i) {
   5991     if (InOrder[i])
   5992       continue;
   5993     int EltIdx = MaskVals[i];
   5994     if (EltIdx < 0)
   5995       continue;
   5996     SDValue ExtOp = (EltIdx < 8) ?
   5997       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
   5998                   DAG.getIntPtrConstant(EltIdx)) :
   5999       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
   6000                   DAG.getIntPtrConstant(EltIdx - 8));
   6001     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
   6002                        DAG.getIntPtrConstant(i));
   6003   }
   6004   return NewV;
   6005 }
   6006 
   6007 // v16i8 shuffles - Prefer shuffles in the following order:
   6008 // 1. [ssse3] 1 x pshufb
   6009 // 2. [ssse3] 2 x pshufb + 1 x por
   6010 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
   6011 static
   6012 SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   6013                                  SelectionDAG &DAG,
   6014                                  const X86TargetLowering &TLI) {
   6015   SDValue V1 = SVOp->getOperand(0);
   6016   SDValue V2 = SVOp->getOperand(1);
   6017   DebugLoc dl = SVOp->getDebugLoc();
   6018   ArrayRef<int> MaskVals = SVOp->getMask();
   6019 
   6020   // Promote splats to a larger type which usually leads to more efficient code.
   6021   // FIXME: Is this true if pshufb is available?
   6022   if (SVOp->isSplat())
   6023     return PromoteSplat(SVOp, DAG);
   6024 
   6025   // If we have SSSE3, case 1 is generated when all result bytes come from
   6026   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
   6027   // present, fall back to case 3.
   6028 
   6029   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
   6030   if (TLI.getSubtarget()->hasSSSE3()) {
   6031     SmallVector<SDValue,16> pshufbMask;
   6032 
   6033     // If all result elements are from one input vector, then only translate
   6034     // undef mask values to 0x80 (zero out result) in the pshufb mask.
   6035     //
   6036     // Otherwise, we have elements from both input vectors, and must zero out
   6037     // elements that come from V2 in the first mask, and V1 in the second mask
   6038     // so that we can OR them together.
   6039     for (unsigned i = 0; i != 16; ++i) {
   6040       int EltIdx = MaskVals[i];
   6041       if (EltIdx < 0 || EltIdx >= 16)
   6042         EltIdx = 0x80;
   6043       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
   6044     }
   6045     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
   6046                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   6047                                  MVT::v16i8, &pshufbMask[0], 16));
   6048 
   6049     // As PSHUFB will zero elements with negative indices, it's safe to ignore
   6050     // the 2nd operand if it's undefined or zero.
   6051     if (V2.getOpcode() == ISD::UNDEF ||
   6052         ISD::isBuildVectorAllZeros(V2.getNode()))
   6053       return V1;
   6054 
   6055     // Calculate the shuffle mask for the second input, shuffle it, and
   6056     // OR it with the first shuffled input.
   6057     pshufbMask.clear();
   6058     for (unsigned i = 0; i != 16; ++i) {
   6059       int EltIdx = MaskVals[i];
   6060       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
   6061       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
   6062     }
   6063     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
   6064                      DAG.getNode(ISD::BUILD_VECTOR, dl,
   6065                                  MVT::v16i8, &pshufbMask[0], 16));
   6066     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   6067   }
   6068 
   6069   // No SSSE3 - Calculate in place words and then fix all out of place words
   6070   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
   6071   // the 16 different words that comprise the two doublequadword input vectors.
   6072   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
   6073   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
   6074   SDValue NewV = V1;
   6075   for (int i = 0; i != 8; ++i) {
   6076     int Elt0 = MaskVals[i*2];
   6077     int Elt1 = MaskVals[i*2+1];
   6078 
   6079     // This word of the result is all undef, skip it.
   6080     if (Elt0 < 0 && Elt1 < 0)
   6081       continue;
   6082 
   6083     // This word of the result is already in the correct place, skip it.
   6084     if ((Elt0 == i*2) && (Elt1 == i*2+1))
   6085       continue;
   6086 
   6087     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
   6088     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
   6089     SDValue InsElt;
   6090 
   6091     // If Elt0 and Elt1 are defined, are consecutive, and can be load
   6092     // using a single extract together, load it and store it.
   6093     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
   6094       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
   6095                            DAG.getIntPtrConstant(Elt1 / 2));
   6096       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
   6097                         DAG.getIntPtrConstant(i));
   6098       continue;
   6099     }
   6100 
   6101     // If Elt1 is defined, extract it from the appropriate source.  If the
   6102     // source byte is not also odd, shift the extracted word left 8 bits
   6103     // otherwise clear the bottom 8 bits if we need to do an or.
   6104     if (Elt1 >= 0) {
   6105       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
   6106                            DAG.getIntPtrConstant(Elt1 / 2));
   6107       if ((Elt1 & 1) == 0)
   6108         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
   6109                              DAG.getConstant(8,
   6110                                   TLI.getShiftAmountTy(InsElt.getValueType())));
   6111       else if (Elt0 >= 0)
   6112         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
   6113                              DAG.getConstant(0xFF00, MVT::i16));
   6114     }
   6115     // If Elt0 is defined, extract it from the appropriate source.  If the
   6116     // source byte is not also even, shift the extracted word right 8 bits. If
   6117     // Elt1 was also defined, OR the extracted values together before
   6118     // inserting them in the result.
   6119     if (Elt0 >= 0) {
   6120       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
   6121                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
   6122       if ((Elt0 & 1) != 0)
   6123         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
   6124                               DAG.getConstant(8,
   6125                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
   6126       else if (Elt1 >= 0)
   6127         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
   6128                              DAG.getConstant(0x00FF, MVT::i16));
   6129       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
   6130                          : InsElt0;
   6131     }
   6132     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
   6133                        DAG.getIntPtrConstant(i));
   6134   }
   6135   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
   6136 }
   6137 
   6138 // v32i8 shuffles - Translate to VPSHUFB if possible.
   6139 static
   6140 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
   6141                                  const X86Subtarget *Subtarget,
   6142                                  SelectionDAG &DAG) {
   6143   MVT VT = SVOp->getValueType(0).getSimpleVT();
   6144   SDValue V1 = SVOp->getOperand(0);
   6145   SDValue V2 = SVOp->getOperand(1);
   6146   DebugLoc dl = SVOp->getDebugLoc();
   6147   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
   6148 
   6149   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   6150   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
   6151   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
   6152 
   6153   // VPSHUFB may be generated if
   6154   // (1) one of input vector is undefined or zeroinitializer.
   6155   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
   6156   // And (2) the mask indexes don't cross the 128-bit lane.
   6157   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
   6158       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
   6159     return SDValue();
   6160 
   6161   if (V1IsAllZero && !V2IsAllZero) {
   6162     CommuteVectorShuffleMask(MaskVals, 32);
   6163     V1 = V2;
   6164   }
   6165   SmallVector<SDValue, 32> pshufbMask;
   6166   for (unsigned i = 0; i != 32; i++) {
   6167     int EltIdx = MaskVals[i];
   6168     if (EltIdx < 0 || EltIdx >= 32)
   6169       EltIdx = 0x80;
   6170     else {
   6171       if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
   6172         // Cross lane is not allowed.
   6173         return SDValue();
   6174       EltIdx &= 0xf;
   6175     }
   6176     pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
   6177   }
   6178   return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
   6179                       DAG.getNode(ISD::BUILD_VECTOR, dl,
   6180                                   MVT::v32i8, &pshufbMask[0], 32));
   6181 }
   6182 
   6183 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
   6184 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
   6185 /// done when every pair / quad of shuffle mask elements point to elements in
   6186 /// the right sequence. e.g.
   6187 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
   6188 static
   6189 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
   6190                                  SelectionDAG &DAG) {
   6191   MVT VT = SVOp->getValueType(0).getSimpleVT();
   6192   DebugLoc dl = SVOp->getDebugLoc();
   6193   unsigned NumElems = VT.getVectorNumElements();
   6194   MVT NewVT;
   6195   unsigned Scale;
   6196   switch (VT.SimpleTy) {
   6197   default: llvm_unreachable("Unexpected!");
   6198   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
   6199   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
   6200   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
   6201   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
   6202   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
   6203   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
   6204   }
   6205 
   6206   SmallVector<int, 8> MaskVec;
   6207   for (unsigned i = 0; i != NumElems; i += Scale) {
   6208     int StartIdx = -1;
   6209     for (unsigned j = 0; j != Scale; ++j) {
   6210       int EltIdx = SVOp->getMaskElt(i+j);
   6211       if (EltIdx < 0)
   6212         continue;
   6213       if (StartIdx < 0)
   6214         StartIdx = (EltIdx / Scale);
   6215       if (EltIdx != (int)(StartIdx*Scale + j))
   6216         return SDValue();
   6217     }
   6218     MaskVec.push_back(StartIdx);
   6219   }
   6220 
   6221   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
   6222   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
   6223   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
   6224 }
   6225 
   6226 /// getVZextMovL - Return a zero-extending vector move low node.
   6227 ///
   6228 static SDValue getVZextMovL(MVT VT, EVT OpVT,
   6229                             SDValue SrcOp, SelectionDAG &DAG,
   6230                             const X86Subtarget *Subtarget, DebugLoc dl) {
   6231   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
   6232     LoadSDNode *LD = NULL;
   6233     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
   6234       LD = dyn_cast<LoadSDNode>(SrcOp);
   6235     if (!LD) {
   6236       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
   6237       // instead.
   6238       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
   6239       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
   6240           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   6241           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
   6242           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
   6243         // PR2108
   6244         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
   6245         return DAG.getNode(ISD::BITCAST, dl, VT,
   6246                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
   6247                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   6248                                                    OpVT,
   6249                                                    SrcOp.getOperand(0)
   6250                                                           .getOperand(0))));
   6251       }
   6252     }
   6253   }
   6254 
   6255   return DAG.getNode(ISD::BITCAST, dl, VT,
   6256                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
   6257                                  DAG.getNode(ISD::BITCAST, dl,
   6258                                              OpVT, SrcOp)));
   6259 }
   6260 
   6261 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
   6262 /// which could not be matched by any known target speficic shuffle
   6263 static SDValue
   6264 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   6265 
   6266   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
   6267   if (NewOp.getNode())
   6268     return NewOp;
   6269 
   6270   MVT VT = SVOp->getValueType(0).getSimpleVT();
   6271 
   6272   unsigned NumElems = VT.getVectorNumElements();
   6273   unsigned NumLaneElems = NumElems / 2;
   6274 
   6275   DebugLoc dl = SVOp->getDebugLoc();
   6276   MVT EltVT = VT.getVectorElementType();
   6277   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
   6278   SDValue Output[2];
   6279 
   6280   SmallVector<int, 16> Mask;
   6281   for (unsigned l = 0; l < 2; ++l) {
   6282     // Build a shuffle mask for the output, discovering on the fly which
   6283     // input vectors to use as shuffle operands (recorded in InputUsed).
   6284     // If building a suitable shuffle vector proves too hard, then bail
   6285     // out with UseBuildVector set.
   6286     bool UseBuildVector = false;
   6287     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
   6288     unsigned LaneStart = l * NumLaneElems;
   6289     for (unsigned i = 0; i != NumLaneElems; ++i) {
   6290       // The mask element.  This indexes into the input.
   6291       int Idx = SVOp->getMaskElt(i+LaneStart);
   6292       if (Idx < 0) {
   6293         // the mask element does not index into any input vector.
   6294         Mask.push_back(-1);
   6295         continue;
   6296       }
   6297 
   6298       // The input vector this mask element indexes into.
   6299       int Input = Idx / NumLaneElems;
   6300 
   6301       // Turn the index into an offset from the start of the input vector.
   6302       Idx -= Input * NumLaneElems;
   6303 
   6304       // Find or create a shuffle vector operand to hold this input.
   6305       unsigned OpNo;
   6306       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
   6307         if (InputUsed[OpNo] == Input)
   6308           // This input vector is already an operand.
   6309           break;
   6310         if (InputUsed[OpNo] < 0) {
   6311           // Create a new operand for this input vector.
   6312           InputUsed[OpNo] = Input;
   6313           break;
   6314         }
   6315       }
   6316 
   6317       if (OpNo >= array_lengthof(InputUsed)) {
   6318         // More than two input vectors used!  Give up on trying to create a
   6319         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
   6320         UseBuildVector = true;
   6321         break;
   6322       }
   6323 
   6324       // Add the mask index for the new shuffle vector.
   6325       Mask.push_back(Idx + OpNo * NumLaneElems);
   6326     }
   6327 
   6328     if (UseBuildVector) {
   6329       SmallVector<SDValue, 16> SVOps;
   6330       for (unsigned i = 0; i != NumLaneElems; ++i) {
   6331         // The mask element.  This indexes into the input.
   6332         int Idx = SVOp->getMaskElt(i+LaneStart);
   6333         if (Idx < 0) {
   6334           SVOps.push_back(DAG.getUNDEF(EltVT));
   6335           continue;
   6336         }
   6337 
   6338         // The input vector this mask element indexes into.
   6339         int Input = Idx / NumElems;
   6340 
   6341         // Turn the index into an offset from the start of the input vector.
   6342         Idx -= Input * NumElems;
   6343 
   6344         // Extract the vector element by hand.
   6345         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
   6346                                     SVOp->getOperand(Input),
   6347                                     DAG.getIntPtrConstant(Idx)));
   6348       }
   6349 
   6350       // Construct the output using a BUILD_VECTOR.
   6351       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
   6352                               SVOps.size());
   6353     } else if (InputUsed[0] < 0) {
   6354       // No input vectors were used! The result is undefined.
   6355       Output[l] = DAG.getUNDEF(NVT);
   6356     } else {
   6357       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
   6358                                         (InputUsed[0] % 2) * NumLaneElems,
   6359                                         DAG, dl);
   6360       // If only one input was used, use an undefined vector for the other.
   6361       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
   6362         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
   6363                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
   6364       // At least one input vector was used. Create a new shuffle vector.
   6365       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
   6366     }
   6367 
   6368     Mask.clear();
   6369   }
   6370 
   6371   // Concatenate the result back
   6372   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
   6373 }
   6374 
   6375 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
   6376 /// 4 elements, and match them with several different shuffle types.
   6377 static SDValue
   6378 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   6379   SDValue V1 = SVOp->getOperand(0);
   6380   SDValue V2 = SVOp->getOperand(1);
   6381   DebugLoc dl = SVOp->getDebugLoc();
   6382   MVT VT = SVOp->getValueType(0).getSimpleVT();
   6383 
   6384   assert(VT.is128BitVector() && "Unsupported vector size");
   6385 
   6386   std::pair<int, int> Locs[4];
   6387   int Mask1[] = { -1, -1, -1, -1 };
   6388   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
   6389 
   6390   unsigned NumHi = 0;
   6391   unsigned NumLo = 0;
   6392   for (unsigned i = 0; i != 4; ++i) {
   6393     int Idx = PermMask[i];
   6394     if (Idx < 0) {
   6395       Locs[i] = std::make_pair(-1, -1);
   6396     } else {
   6397       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
   6398       if (Idx < 4) {
   6399         Locs[i] = std::make_pair(0, NumLo);
   6400         Mask1[NumLo] = Idx;
   6401         NumLo++;
   6402       } else {
   6403         Locs[i] = std::make_pair(1, NumHi);
   6404         if (2+NumHi < 4)
   6405           Mask1[2+NumHi] = Idx;
   6406         NumHi++;
   6407       }
   6408     }
   6409   }
   6410 
   6411   if (NumLo <= 2 && NumHi <= 2) {
   6412     // If no more than two elements come from either vector. This can be
   6413     // implemented with two shuffles. First shuffle gather the elements.
   6414     // The second shuffle, which takes the first shuffle as both of its
   6415     // vector operands, put the elements into the right order.
   6416     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6417 
   6418     int Mask2[] = { -1, -1, -1, -1 };
   6419 
   6420     for (unsigned i = 0; i != 4; ++i)
   6421       if (Locs[i].first != -1) {
   6422         unsigned Idx = (i < 2) ? 0 : 4;
   6423         Idx += Locs[i].first * 2 + Locs[i].second;
   6424         Mask2[i] = Idx;
   6425       }
   6426 
   6427     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
   6428   }
   6429 
   6430   if (NumLo == 3 || NumHi == 3) {
   6431     // Otherwise, we must have three elements from one vector, call it X, and
   6432     // one element from the other, call it Y.  First, use a shufps to build an
   6433     // intermediate vector with the one element from Y and the element from X
   6434     // that will be in the same half in the final destination (the indexes don't
   6435     // matter). Then, use a shufps to build the final vector, taking the half
   6436     // containing the element from Y from the intermediate, and the other half
   6437     // from X.
   6438     if (NumHi == 3) {
   6439       // Normalize it so the 3 elements come from V1.
   6440       CommuteVectorShuffleMask(PermMask, 4);
   6441       std::swap(V1, V2);
   6442     }
   6443 
   6444     // Find the element from V2.
   6445     unsigned HiIndex;
   6446     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
   6447       int Val = PermMask[HiIndex];
   6448       if (Val < 0)
   6449         continue;
   6450       if (Val >= 4)
   6451         break;
   6452     }
   6453 
   6454     Mask1[0] = PermMask[HiIndex];
   6455     Mask1[1] = -1;
   6456     Mask1[2] = PermMask[HiIndex^1];
   6457     Mask1[3] = -1;
   6458     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6459 
   6460     if (HiIndex >= 2) {
   6461       Mask1[0] = PermMask[0];
   6462       Mask1[1] = PermMask[1];
   6463       Mask1[2] = HiIndex & 1 ? 6 : 4;
   6464       Mask1[3] = HiIndex & 1 ? 4 : 6;
   6465       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
   6466     }
   6467 
   6468     Mask1[0] = HiIndex & 1 ? 2 : 0;
   6469     Mask1[1] = HiIndex & 1 ? 0 : 2;
   6470     Mask1[2] = PermMask[2];
   6471     Mask1[3] = PermMask[3];
   6472     if (Mask1[2] >= 0)
   6473       Mask1[2] += 4;
   6474     if (Mask1[3] >= 0)
   6475       Mask1[3] += 4;
   6476     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
   6477   }
   6478 
   6479   // Break it into (shuffle shuffle_hi, shuffle_lo).
   6480   int LoMask[] = { -1, -1, -1, -1 };
   6481   int HiMask[] = { -1, -1, -1, -1 };
   6482 
   6483   int *MaskPtr = LoMask;
   6484   unsigned MaskIdx = 0;
   6485   unsigned LoIdx = 0;
   6486   unsigned HiIdx = 2;
   6487   for (unsigned i = 0; i != 4; ++i) {
   6488     if (i == 2) {
   6489       MaskPtr = HiMask;
   6490       MaskIdx = 1;
   6491       LoIdx = 0;
   6492       HiIdx = 2;
   6493     }
   6494     int Idx = PermMask[i];
   6495     if (Idx < 0) {
   6496       Locs[i] = std::make_pair(-1, -1);
   6497     } else if (Idx < 4) {
   6498       Locs[i] = std::make_pair(MaskIdx, LoIdx);
   6499       MaskPtr[LoIdx] = Idx;
   6500       LoIdx++;
   6501     } else {
   6502       Locs[i] = std::make_pair(MaskIdx, HiIdx);
   6503       MaskPtr[HiIdx] = Idx;
   6504       HiIdx++;
   6505     }
   6506   }
   6507 
   6508   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
   6509   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
   6510   int MaskOps[] = { -1, -1, -1, -1 };
   6511   for (unsigned i = 0; i != 4; ++i)
   6512     if (Locs[i].first != -1)
   6513       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
   6514   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
   6515 }
   6516 
   6517 static bool MayFoldVectorLoad(SDValue V) {
   6518   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
   6519     V = V.getOperand(0);
   6520 
   6521   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   6522     V = V.getOperand(0);
   6523   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
   6524       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
   6525     // BUILD_VECTOR (load), undef
   6526     V = V.getOperand(0);
   6527 
   6528   return MayFoldLoad(V);
   6529 }
   6530 
   6531 static
   6532 SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
   6533   EVT VT = Op.getValueType();
   6534 
   6535   // Canonizalize to v2f64.
   6536   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
   6537   return DAG.getNode(ISD::BITCAST, dl, VT,
   6538                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
   6539                                           V1, DAG));
   6540 }
   6541 
   6542 static
   6543 SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
   6544                         bool HasSSE2) {
   6545   SDValue V1 = Op.getOperand(0);
   6546   SDValue V2 = Op.getOperand(1);
   6547   EVT VT = Op.getValueType();
   6548 
   6549   assert(VT != MVT::v2i64 && "unsupported shuffle type");
   6550 
   6551   if (HasSSE2 && VT == MVT::v2f64)
   6552     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
   6553 
   6554   // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
   6555   return DAG.getNode(ISD::BITCAST, dl, VT,
   6556                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
   6557                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
   6558                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
   6559 }
   6560 
   6561 static
   6562 SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
   6563   SDValue V1 = Op.getOperand(0);
   6564   SDValue V2 = Op.getOperand(1);
   6565   EVT VT = Op.getValueType();
   6566 
   6567   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
   6568          "unsupported shuffle type");
   6569 
   6570   if (V2.getOpcode() == ISD::UNDEF)
   6571     V2 = V1;
   6572 
   6573   // v4i32 or v4f32
   6574   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
   6575 }
   6576 
   6577 static
   6578 SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   6579   SDValue V1 = Op.getOperand(0);
   6580   SDValue V2 = Op.getOperand(1);
   6581   EVT VT = Op.getValueType();
   6582   unsigned NumElems = VT.getVectorNumElements();
   6583 
   6584   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
   6585   // operand of these instructions is only memory, so check if there's a
   6586   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
   6587   // same masks.
   6588   bool CanFoldLoad = false;
   6589 
   6590   // Trivial case, when V2 comes from a load.
   6591   if (MayFoldVectorLoad(V2))
   6592     CanFoldLoad = true;
   6593 
   6594   // When V1 is a load, it can be folded later into a store in isel, example:
   6595   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
   6596   //    turns into:
   6597   //  (MOVLPSmr addr:$src1, VR128:$src2)
   6598   // So, recognize this potential and also use MOVLPS or MOVLPD
   6599   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
   6600     CanFoldLoad = true;
   6601 
   6602   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6603   if (CanFoldLoad) {
   6604     if (HasSSE2 && NumElems == 2)
   6605       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
   6606 
   6607     if (NumElems == 4)
   6608       // If we don't care about the second element, proceed to use movss.
   6609       if (SVOp->getMaskElt(1) != -1)
   6610         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
   6611   }
   6612 
   6613   // movl and movlp will both match v2i64, but v2i64 is never matched by
   6614   // movl earlier because we make it strict to avoid messing with the movlp load
   6615   // folding logic (see the code above getMOVLP call). Match it here then,
   6616   // this is horrible, but will stay like this until we move all shuffle
   6617   // matching to x86 specific nodes. Note that for the 1st condition all
   6618   // types are matched with movsd.
   6619   if (HasSSE2) {
   6620     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
   6621     // as to remove this logic from here, as much as possible
   6622     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
   6623       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
   6624     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
   6625   }
   6626 
   6627   assert(VT != MVT::v4i32 && "unsupported shuffle type");
   6628 
   6629   // Invert the operand order and use SHUFPS to match it.
   6630   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
   6631                               getShuffleSHUFImmediate(SVOp), DAG);
   6632 }
   6633 
   6634 // Reduce a vector shuffle to zext.
   6635 SDValue
   6636 X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
   6637   // PMOVZX is only available from SSE41.
   6638   if (!Subtarget->hasSSE41())
   6639     return SDValue();
   6640 
   6641   EVT VT = Op.getValueType();
   6642 
   6643   // Only AVX2 support 256-bit vector integer extending.
   6644   if (!Subtarget->hasInt256() && VT.is256BitVector())
   6645     return SDValue();
   6646 
   6647   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6648   DebugLoc DL = Op.getDebugLoc();
   6649   SDValue V1 = Op.getOperand(0);
   6650   SDValue V2 = Op.getOperand(1);
   6651   unsigned NumElems = VT.getVectorNumElements();
   6652 
   6653   // Extending is an unary operation and the element type of the source vector
   6654   // won't be equal to or larger than i64.
   6655   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
   6656       VT.getVectorElementType() == MVT::i64)
   6657     return SDValue();
   6658 
   6659   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
   6660   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
   6661   while ((1U << Shift) < NumElems) {
   6662     if (SVOp->getMaskElt(1U << Shift) == 1)
   6663       break;
   6664     Shift += 1;
   6665     // The maximal ratio is 8, i.e. from i8 to i64.
   6666     if (Shift > 3)
   6667       return SDValue();
   6668   }
   6669 
   6670   // Check the shuffle mask.
   6671   unsigned Mask = (1U << Shift) - 1;
   6672   for (unsigned i = 0; i != NumElems; ++i) {
   6673     int EltIdx = SVOp->getMaskElt(i);
   6674     if ((i & Mask) != 0 && EltIdx != -1)
   6675       return SDValue();
   6676     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
   6677       return SDValue();
   6678   }
   6679 
   6680   LLVMContext *Context = DAG.getContext();
   6681   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
   6682   EVT NeVT = EVT::getIntegerVT(*Context, NBits);
   6683   EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
   6684 
   6685   if (!isTypeLegal(NVT))
   6686     return SDValue();
   6687 
   6688   // Simplify the operand as it's prepared to be fed into shuffle.
   6689   unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
   6690   if (V1.getOpcode() == ISD::BITCAST &&
   6691       V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
   6692       V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   6693       V1.getOperand(0)
   6694         .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
   6695     // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
   6696     SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
   6697     ConstantSDNode *CIdx =
   6698       dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
   6699     // If it's foldable, i.e. normal load with single use, we will let code
   6700     // selection to fold it. Otherwise, we will short the conversion sequence.
   6701     if (CIdx && CIdx->getZExtValue() == 0 &&
   6702         (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
   6703       if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
   6704         // The "ext_vec_elt" node is wider than the result node.
   6705         // In this case we should extract subvector from V.
   6706         // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
   6707         unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
   6708         EVT FullVT = V.getValueType();
   6709         EVT SubVecVT = EVT::getVectorVT(*Context,
   6710                                         FullVT.getVectorElementType(),
   6711                                         FullVT.getVectorNumElements()/Ratio);
   6712         V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
   6713                         DAG.getIntPtrConstant(0));
   6714       }
   6715       V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
   6716     }
   6717   }
   6718 
   6719   return DAG.getNode(ISD::BITCAST, DL, VT,
   6720                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
   6721 }
   6722 
   6723 SDValue
   6724 X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
   6725   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6726   MVT VT = Op.getValueType().getSimpleVT();
   6727   DebugLoc dl = Op.getDebugLoc();
   6728   SDValue V1 = Op.getOperand(0);
   6729   SDValue V2 = Op.getOperand(1);
   6730 
   6731   if (isZeroShuffle(SVOp))
   6732     return getZeroVector(VT, Subtarget, DAG, dl);
   6733 
   6734   // Handle splat operations
   6735   if (SVOp->isSplat()) {
   6736     // Use vbroadcast whenever the splat comes from a foldable load
   6737     SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
   6738     if (Broadcast.getNode())
   6739       return Broadcast;
   6740   }
   6741 
   6742   // Check integer expanding shuffles.
   6743   SDValue NewOp = LowerVectorIntExtend(Op, DAG);
   6744   if (NewOp.getNode())
   6745     return NewOp;
   6746 
   6747   // If the shuffle can be profitably rewritten as a narrower shuffle, then
   6748   // do it!
   6749   if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
   6750       VT == MVT::v16i16 || VT == MVT::v32i8) {
   6751     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
   6752     if (NewOp.getNode())
   6753       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
   6754   } else if ((VT == MVT::v4i32 ||
   6755              (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
   6756     // FIXME: Figure out a cleaner way to do this.
   6757     // Try to make use of movq to zero out the top part.
   6758     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
   6759       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
   6760       if (NewOp.getNode()) {
   6761         MVT NewVT = NewOp.getValueType().getSimpleVT();
   6762         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
   6763                                NewVT, true, false))
   6764           return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
   6765                               DAG, Subtarget, dl);
   6766       }
   6767     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
   6768       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
   6769       if (NewOp.getNode()) {
   6770         MVT NewVT = NewOp.getValueType().getSimpleVT();
   6771         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
   6772           return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
   6773                               DAG, Subtarget, dl);
   6774       }
   6775     }
   6776   }
   6777   return SDValue();
   6778 }
   6779 
   6780 SDValue
   6781 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   6782   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   6783   SDValue V1 = Op.getOperand(0);
   6784   SDValue V2 = Op.getOperand(1);
   6785   MVT VT = Op.getValueType().getSimpleVT();
   6786   DebugLoc dl = Op.getDebugLoc();
   6787   unsigned NumElems = VT.getVectorNumElements();
   6788   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   6789   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   6790   bool V1IsSplat = false;
   6791   bool V2IsSplat = false;
   6792   bool HasSSE2 = Subtarget->hasSSE2();
   6793   bool HasFp256    = Subtarget->hasFp256();
   6794   bool HasInt256   = Subtarget->hasInt256();
   6795   MachineFunction &MF = DAG.getMachineFunction();
   6796   bool OptForSize = MF.getFunction()->getAttributes().
   6797     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
   6798 
   6799   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
   6800 
   6801   if (V1IsUndef && V2IsUndef)
   6802     return DAG.getUNDEF(VT);
   6803 
   6804   assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
   6805 
   6806   // Vector shuffle lowering takes 3 steps:
   6807   //
   6808   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
   6809   //    narrowing and commutation of operands should be handled.
   6810   // 2) Matching of shuffles with known shuffle masks to x86 target specific
   6811   //    shuffle nodes.
   6812   // 3) Rewriting of unmatched masks into new generic shuffle operations,
   6813   //    so the shuffle can be broken into other shuffles and the legalizer can
   6814   //    try the lowering again.
   6815   //
   6816   // The general idea is that no vector_shuffle operation should be left to
   6817   // be matched during isel, all of them must be converted to a target specific
   6818   // node here.
   6819 
   6820   // Normalize the input vectors. Here splats, zeroed vectors, profitable
   6821   // narrowing and commutation of operands should be handled. The actual code
   6822   // doesn't include all of those, work in progress...
   6823   SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
   6824   if (NewOp.getNode())
   6825     return NewOp;
   6826 
   6827   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
   6828 
   6829   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
   6830   // unpckh_undef). Only use pshufd if speed is more important than size.
   6831   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
   6832     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   6833   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
   6834     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   6835 
   6836   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
   6837       V2IsUndef && MayFoldVectorLoad(V1))
   6838     return getMOVDDup(Op, dl, V1, DAG);
   6839 
   6840   if (isMOVHLPS_v_undef_Mask(M, VT))
   6841     return getMOVHighToLow(Op, dl, DAG);
   6842 
   6843   // Use to match splats
   6844   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
   6845       (VT == MVT::v2f64 || VT == MVT::v2i64))
   6846     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   6847 
   6848   if (isPSHUFDMask(M, VT)) {
   6849     // The actual implementation will match the mask in the if above and then
   6850     // during isel it can match several different instructions, not only pshufd
   6851     // as its name says, sad but true, emulate the behavior for now...
   6852     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
   6853       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
   6854 
   6855     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
   6856 
   6857     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
   6858       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
   6859 
   6860     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
   6861       return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
   6862                                   DAG);
   6863 
   6864     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
   6865                                 TargetMask, DAG);
   6866   }
   6867 
   6868   // Check if this can be converted into a logical shift.
   6869   bool isLeft = false;
   6870   unsigned ShAmt = 0;
   6871   SDValue ShVal;
   6872   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
   6873   if (isShift && ShVal.hasOneUse()) {
   6874     // If the shifted value has multiple uses, it may be cheaper to use
   6875     // v_set0 + movlhps or movhlps, etc.
   6876     MVT EltVT = VT.getVectorElementType();
   6877     ShAmt *= EltVT.getSizeInBits();
   6878     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
   6879   }
   6880 
   6881   if (isMOVLMask(M, VT)) {
   6882     if (ISD::isBuildVectorAllZeros(V1.getNode()))
   6883       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
   6884     if (!isMOVLPMask(M, VT)) {
   6885       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
   6886         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
   6887 
   6888       if (VT == MVT::v4i32 || VT == MVT::v4f32)
   6889         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
   6890     }
   6891   }
   6892 
   6893   // FIXME: fold these into legal mask.
   6894   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
   6895     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
   6896 
   6897   if (isMOVHLPSMask(M, VT))
   6898     return getMOVHighToLow(Op, dl, DAG);
   6899 
   6900   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
   6901     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
   6902 
   6903   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
   6904     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
   6905 
   6906   if (isMOVLPMask(M, VT))
   6907     return getMOVLP(Op, dl, DAG, HasSSE2);
   6908 
   6909   if (ShouldXformToMOVHLPS(M, VT) ||
   6910       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
   6911     return CommuteVectorShuffle(SVOp, DAG);
   6912 
   6913   if (isShift) {
   6914     // No better options. Use a vshldq / vsrldq.
   6915     MVT EltVT = VT.getVectorElementType();
   6916     ShAmt *= EltVT.getSizeInBits();
   6917     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
   6918   }
   6919 
   6920   bool Commuted = false;
   6921   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
   6922   // 1,1,1,1 -> v8i16 though.
   6923   V1IsSplat = isSplatVector(V1.getNode());
   6924   V2IsSplat = isSplatVector(V2.getNode());
   6925 
   6926   // Canonicalize the splat or undef, if present, to be on the RHS.
   6927   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
   6928     CommuteVectorShuffleMask(M, NumElems);
   6929     std::swap(V1, V2);
   6930     std::swap(V1IsSplat, V2IsSplat);
   6931     Commuted = true;
   6932   }
   6933 
   6934   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
   6935     // Shuffling low element of v1 into undef, just return v1.
   6936     if (V2IsUndef)
   6937       return V1;
   6938     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
   6939     // the instruction selector will not match, so get a canonical MOVL with
   6940     // swapped operands to undo the commute.
   6941     return getMOVL(DAG, dl, VT, V2, V1);
   6942   }
   6943 
   6944   if (isUNPCKLMask(M, VT, HasInt256))
   6945     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6946 
   6947   if (isUNPCKHMask(M, VT, HasInt256))
   6948     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6949 
   6950   if (V2IsSplat) {
   6951     // Normalize mask so all entries that point to V2 points to its first
   6952     // element then try to match unpck{h|l} again. If match, return a
   6953     // new vector_shuffle with the corrected mask.p
   6954     SmallVector<int, 8> NewMask(M.begin(), M.end());
   6955     NormalizeMask(NewMask, NumElems);
   6956     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
   6957       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6958     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
   6959       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6960   }
   6961 
   6962   if (Commuted) {
   6963     // Commute is back and try unpck* again.
   6964     // FIXME: this seems wrong.
   6965     CommuteVectorShuffleMask(M, NumElems);
   6966     std::swap(V1, V2);
   6967     std::swap(V1IsSplat, V2IsSplat);
   6968     Commuted = false;
   6969 
   6970     if (isUNPCKLMask(M, VT, HasInt256))
   6971       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
   6972 
   6973     if (isUNPCKHMask(M, VT, HasInt256))
   6974       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
   6975   }
   6976 
   6977   // Normalize the node to match x86 shuffle ops if needed
   6978   if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
   6979     return CommuteVectorShuffle(SVOp, DAG);
   6980 
   6981   // The checks below are all present in isShuffleMaskLegal, but they are
   6982   // inlined here right now to enable us to directly emit target specific
   6983   // nodes, and remove one by one until they don't return Op anymore.
   6984 
   6985   if (isPALIGNRMask(M, VT, Subtarget))
   6986     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
   6987                                 getShufflePALIGNRImmediate(SVOp),
   6988                                 DAG);
   6989 
   6990   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
   6991       SVOp->getSplatIndex() == 0 && V2IsUndef) {
   6992     if (VT == MVT::v2f64 || VT == MVT::v2i64)
   6993       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   6994   }
   6995 
   6996   if (isPSHUFHWMask(M, VT, HasInt256))
   6997     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
   6998                                 getShufflePSHUFHWImmediate(SVOp),
   6999                                 DAG);
   7000 
   7001   if (isPSHUFLWMask(M, VT, HasInt256))
   7002     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
   7003                                 getShufflePSHUFLWImmediate(SVOp),
   7004                                 DAG);
   7005 
   7006   if (isSHUFPMask(M, VT, HasFp256))
   7007     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
   7008                                 getShuffleSHUFImmediate(SVOp), DAG);
   7009 
   7010   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
   7011     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
   7012   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
   7013     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
   7014 
   7015   //===--------------------------------------------------------------------===//
   7016   // Generate target specific nodes for 128 or 256-bit shuffles only
   7017   // supported in the AVX instruction set.
   7018   //
   7019 
   7020   // Handle VMOVDDUPY permutations
   7021   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
   7022     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
   7023 
   7024   // Handle VPERMILPS/D* permutations
   7025   if (isVPERMILPMask(M, VT, HasFp256)) {
   7026     if (HasInt256 && VT == MVT::v8i32)
   7027       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
   7028                                   getShuffleSHUFImmediate(SVOp), DAG);
   7029     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
   7030                                 getShuffleSHUFImmediate(SVOp), DAG);
   7031   }
   7032 
   7033   // Handle VPERM2F128/VPERM2I128 permutations
   7034   if (isVPERM2X128Mask(M, VT, HasFp256))
   7035     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
   7036                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
   7037 
   7038   SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
   7039   if (BlendOp.getNode())
   7040     return BlendOp;
   7041 
   7042   if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
   7043     SmallVector<SDValue, 8> permclMask;
   7044     for (unsigned i = 0; i != 8; ++i) {
   7045       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
   7046     }
   7047     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
   7048                                &permclMask[0], 8);
   7049     // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
   7050     return DAG.getNode(X86ISD::VPERMV, dl, VT,
   7051                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
   7052   }
   7053 
   7054   if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
   7055     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
   7056                                 getShuffleCLImmediate(SVOp), DAG);
   7057 
   7058   //===--------------------------------------------------------------------===//
   7059   // Since no target specific shuffle was selected for this generic one,
   7060   // lower it into other known shuffles. FIXME: this isn't true yet, but
   7061   // this is the plan.
   7062   //
   7063 
   7064   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
   7065   if (VT == MVT::v8i16) {
   7066     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
   7067     if (NewOp.getNode())
   7068       return NewOp;
   7069   }
   7070 
   7071   if (VT == MVT::v16i8) {
   7072     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
   7073     if (NewOp.getNode())
   7074       return NewOp;
   7075   }
   7076 
   7077   if (VT == MVT::v32i8) {
   7078     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
   7079     if (NewOp.getNode())
   7080       return NewOp;
   7081   }
   7082 
   7083   // Handle all 128-bit wide vectors with 4 elements, and match them with
   7084   // several different shuffle types.
   7085   if (NumElems == 4 && VT.is128BitVector())
   7086     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
   7087 
   7088   // Handle general 256-bit shuffles
   7089   if (VT.is256BitVector())
   7090     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
   7091 
   7092   return SDValue();
   7093 }
   7094 
   7095 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   7096   MVT VT = Op.getValueType().getSimpleVT();
   7097   DebugLoc dl = Op.getDebugLoc();
   7098 
   7099   if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
   7100     return SDValue();
   7101 
   7102   if (VT.getSizeInBits() == 8) {
   7103     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
   7104                                   Op.getOperand(0), Op.getOperand(1));
   7105     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   7106                                   DAG.getValueType(VT));
   7107     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   7108   }
   7109 
   7110   if (VT.getSizeInBits() == 16) {
   7111     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   7112     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
   7113     if (Idx == 0)
   7114       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   7115                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   7116                                      DAG.getNode(ISD::BITCAST, dl,
   7117                                                  MVT::v4i32,
   7118                                                  Op.getOperand(0)),
   7119                                      Op.getOperand(1)));
   7120     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
   7121                                   Op.getOperand(0), Op.getOperand(1));
   7122     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   7123                                   DAG.getValueType(VT));
   7124     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   7125   }
   7126 
   7127   if (VT == MVT::f32) {
   7128     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
   7129     // the result back to FR32 register. It's only worth matching if the
   7130     // result has a single use which is a store or a bitcast to i32.  And in
   7131     // the case of a store, it's not worth it if the index is a constant 0,
   7132     // because a MOVSSmr can be used instead, which is smaller and faster.
   7133     if (!Op.hasOneUse())
   7134       return SDValue();
   7135     SDNode *User = *Op.getNode()->use_begin();
   7136     if ((User->getOpcode() != ISD::STORE ||
   7137          (isa<ConstantSDNode>(Op.getOperand(1)) &&
   7138           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
   7139         (User->getOpcode() != ISD::BITCAST ||
   7140          User->getValueType(0) != MVT::i32))
   7141       return SDValue();
   7142     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   7143                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
   7144                                               Op.getOperand(0)),
   7145                                               Op.getOperand(1));
   7146     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
   7147   }
   7148 
   7149   if (VT == MVT::i32 || VT == MVT::i64) {
   7150     // ExtractPS/pextrq works with constant index.
   7151     if (isa<ConstantSDNode>(Op.getOperand(1)))
   7152       return Op;
   7153   }
   7154   return SDValue();
   7155 }
   7156 
   7157 SDValue
   7158 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   7159                                            SelectionDAG &DAG) const {
   7160   if (!isa<ConstantSDNode>(Op.getOperand(1)))
   7161     return SDValue();
   7162 
   7163   SDValue Vec = Op.getOperand(0);
   7164   MVT VecVT = Vec.getValueType().getSimpleVT();
   7165 
   7166   // If this is a 256-bit vector result, first extract the 128-bit vector and
   7167   // then extract the element from the 128-bit vector.
   7168   if (VecVT.is256BitVector()) {
   7169     DebugLoc dl = Op.getNode()->getDebugLoc();
   7170     unsigned NumElems = VecVT.getVectorNumElements();
   7171     SDValue Idx = Op.getOperand(1);
   7172     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   7173 
   7174     // Get the 128-bit vector.
   7175     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
   7176 
   7177     if (IdxVal >= NumElems/2)
   7178       IdxVal -= NumElems/2;
   7179     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
   7180                        DAG.getConstant(IdxVal, MVT::i32));
   7181   }
   7182 
   7183   assert(VecVT.is128BitVector() && "Unexpected vector length");
   7184 
   7185   if (Subtarget->hasSSE41()) {
   7186     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
   7187     if (Res.getNode())
   7188       return Res;
   7189   }
   7190 
   7191   MVT VT = Op.getValueType().getSimpleVT();
   7192   DebugLoc dl = Op.getDebugLoc();
   7193   // TODO: handle v16i8.
   7194   if (VT.getSizeInBits() == 16) {
   7195     SDValue Vec = Op.getOperand(0);
   7196     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   7197     if (Idx == 0)
   7198       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   7199                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   7200                                      DAG.getNode(ISD::BITCAST, dl,
   7201                                                  MVT::v4i32, Vec),
   7202                                      Op.getOperand(1)));
   7203     // Transform it so it match pextrw which produces a 32-bit result.
   7204     MVT EltVT = MVT::i32;
   7205     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
   7206                                   Op.getOperand(0), Op.getOperand(1));
   7207     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
   7208                                   DAG.getValueType(VT));
   7209     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   7210   }
   7211 
   7212   if (VT.getSizeInBits() == 32) {
   7213     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   7214     if (Idx == 0)
   7215       return Op;
   7216 
   7217     // SHUFPS the element to the lowest double word, then movss.
   7218     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
   7219     MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
   7220     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   7221                                        DAG.getUNDEF(VVT), Mask);
   7222     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   7223                        DAG.getIntPtrConstant(0));
   7224   }
   7225 
   7226   if (VT.getSizeInBits() == 64) {
   7227     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   7228     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   7229     //        to match extract_elt for f64.
   7230     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   7231     if (Idx == 0)
   7232       return Op;
   7233 
   7234     // UNPCKHPD the element to the lowest double word, then movsd.
   7235     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
   7236     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
   7237     int Mask[2] = { 1, -1 };
   7238     MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
   7239     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   7240                                        DAG.getUNDEF(VVT), Mask);
   7241     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   7242                        DAG.getIntPtrConstant(0));
   7243   }
   7244 
   7245   return SDValue();
   7246 }
   7247 
   7248 static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   7249   MVT VT = Op.getValueType().getSimpleVT();
   7250   MVT EltVT = VT.getVectorElementType();
   7251   DebugLoc dl = Op.getDebugLoc();
   7252 
   7253   SDValue N0 = Op.getOperand(0);
   7254   SDValue N1 = Op.getOperand(1);
   7255   SDValue N2 = Op.getOperand(2);
   7256 
   7257   if (!VT.is128BitVector())
   7258     return SDValue();
   7259 
   7260   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
   7261       isa<ConstantSDNode>(N2)) {
   7262     unsigned Opc;
   7263     if (VT == MVT::v8i16)
   7264       Opc = X86ISD::PINSRW;
   7265     else if (VT == MVT::v16i8)
   7266       Opc = X86ISD::PINSRB;
   7267     else
   7268       Opc = X86ISD::PINSRB;
   7269 
   7270     // Transform it so it match pinsr{b,w} which expects a GR32 as its second
   7271     // argument.
   7272     if (N1.getValueType() != MVT::i32)
   7273       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   7274     if (N2.getValueType() != MVT::i32)
   7275       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
   7276     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
   7277   }
   7278 
   7279   if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
   7280     // Bits [7:6] of the constant are the source select.  This will always be
   7281     //  zero here.  The DAG Combiner may combine an extract_elt index into these
   7282     //  bits.  For example (insert (extract, 3), 2) could be matched by putting
   7283     //  the '3' into bits [7:6] of X86ISD::INSERTPS.
   7284     // Bits [5:4] of the constant are the destination select.  This is the
   7285     //  value of the incoming immediate.
   7286     // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
   7287     //   combine either bitwise AND or insert of float 0.0 to set these bits.
   7288     N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
   7289     // Create this as a scalar to vector..
   7290     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   7291     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
   7292   }
   7293 
   7294   if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
   7295     // PINSR* works with constant index.
   7296     return Op;
   7297   }
   7298   return SDValue();
   7299 }
   7300 
   7301 SDValue
   7302 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
   7303   MVT VT = Op.getValueType().getSimpleVT();
   7304   MVT EltVT = VT.getVectorElementType();
   7305 
   7306   DebugLoc dl = Op.getDebugLoc();
   7307   SDValue N0 = Op.getOperand(0);
   7308   SDValue N1 = Op.getOperand(1);
   7309   SDValue N2 = Op.getOperand(2);
   7310 
   7311   // If this is a 256-bit vector result, first extract the 128-bit vector,
   7312   // insert the element into the extracted half and then place it back.
   7313   if (VT.is256BitVector()) {
   7314     if (!isa<ConstantSDNode>(N2))
   7315       return SDValue();
   7316 
   7317     // Get the desired 128-bit vector half.
   7318     unsigned NumElems = VT.getVectorNumElements();
   7319     unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
   7320     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
   7321 
   7322     // Insert the element into the desired half.
   7323     bool Upper = IdxVal >= NumElems/2;
   7324     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
   7325                  DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
   7326 
   7327     // Insert the changed part back to the 256-bit vector
   7328     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
   7329   }
   7330 
   7331   if (Subtarget->hasSSE41())
   7332     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
   7333 
   7334   if (EltVT == MVT::i8)
   7335     return SDValue();
   7336 
   7337   if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
   7338     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
   7339     // as its second argument.
   7340     if (N1.getValueType() != MVT::i32)
   7341       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   7342     if (N2.getValueType() != MVT::i32)
   7343       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
   7344     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   7345   }
   7346   return SDValue();
   7347 }
   7348 
   7349 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   7350   LLVMContext *Context = DAG.getContext();
   7351   DebugLoc dl = Op.getDebugLoc();
   7352   MVT OpVT = Op.getValueType().getSimpleVT();
   7353 
   7354   // If this is a 256-bit vector result, first insert into a 128-bit
   7355   // vector and then insert into the 256-bit vector.
   7356   if (!OpVT.is128BitVector()) {
   7357     // Insert into a 128-bit vector.
   7358     EVT VT128 = EVT::getVectorVT(*Context,
   7359                                  OpVT.getVectorElementType(),
   7360                                  OpVT.getVectorNumElements() / 2);
   7361 
   7362     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
   7363 
   7364     // Insert the 128-bit vector.
   7365     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
   7366   }
   7367 
   7368   if (OpVT == MVT::v1i64 &&
   7369       Op.getOperand(0).getValueType() == MVT::i64)
   7370     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
   7371 
   7372   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   7373   assert(OpVT.is128BitVector() && "Expected an SSE type!");
   7374   return DAG.getNode(ISD::BITCAST, dl, OpVT,
   7375                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
   7376 }
   7377 
   7378 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
   7379 // a simple subregister reference or explicit instructions to grab
   7380 // upper bits of a vector.
   7381 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   7382                                       SelectionDAG &DAG) {
   7383   if (Subtarget->hasFp256()) {
   7384     DebugLoc dl = Op.getNode()->getDebugLoc();
   7385     SDValue Vec = Op.getNode()->getOperand(0);
   7386     SDValue Idx = Op.getNode()->getOperand(1);
   7387 
   7388     if (Op.getNode()->getValueType(0).is128BitVector() &&
   7389         Vec.getNode()->getValueType(0).is256BitVector() &&
   7390         isa<ConstantSDNode>(Idx)) {
   7391       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   7392       return Extract128BitVector(Vec, IdxVal, DAG, dl);
   7393     }
   7394   }
   7395   return SDValue();
   7396 }
   7397 
   7398 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
   7399 // simple superregister reference or explicit instructions to insert
   7400 // the upper bits of a vector.
   7401 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   7402                                      SelectionDAG &DAG) {
   7403   if (Subtarget->hasFp256()) {
   7404     DebugLoc dl = Op.getNode()->getDebugLoc();
   7405     SDValue Vec = Op.getNode()->getOperand(0);
   7406     SDValue SubVec = Op.getNode()->getOperand(1);
   7407     SDValue Idx = Op.getNode()->getOperand(2);
   7408 
   7409     if (Op.getNode()->getValueType(0).is256BitVector() &&
   7410         SubVec.getNode()->getValueType(0).is128BitVector() &&
   7411         isa<ConstantSDNode>(Idx)) {
   7412       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   7413       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
   7414     }
   7415   }
   7416   return SDValue();
   7417 }
   7418 
   7419 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
   7420 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
   7421 // one of the above mentioned nodes. It has to be wrapped because otherwise
   7422 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
   7423 // be used to form addressing mode. These wrapped nodes will be selected
   7424 // into MOV32ri.
   7425 SDValue
   7426 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   7427   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   7428 
   7429   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7430   // global base reg.
   7431   unsigned char OpFlag = 0;
   7432   unsigned WrapperKind = X86ISD::Wrapper;
   7433   CodeModel::Model M = getTargetMachine().getCodeModel();
   7434 
   7435   if (Subtarget->isPICStyleRIPRel() &&
   7436       (M == CodeModel::Small || M == CodeModel::Kernel))
   7437     WrapperKind = X86ISD::WrapperRIP;
   7438   else if (Subtarget->isPICStyleGOT())
   7439     OpFlag = X86II::MO_GOTOFF;
   7440   else if (Subtarget->isPICStyleStubPIC())
   7441     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   7442 
   7443   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
   7444                                              CP->getAlignment(),
   7445                                              CP->getOffset(), OpFlag);
   7446   DebugLoc DL = CP->getDebugLoc();
   7447   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7448   // With PIC, the address is actually $g + Offset.
   7449   if (OpFlag) {
   7450     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7451                          DAG.getNode(X86ISD::GlobalBaseReg,
   7452                                      DebugLoc(), getPointerTy()),
   7453                          Result);
   7454   }
   7455 
   7456   return Result;
   7457 }
   7458 
   7459 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   7460   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   7461 
   7462   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7463   // global base reg.
   7464   unsigned char OpFlag = 0;
   7465   unsigned WrapperKind = X86ISD::Wrapper;
   7466   CodeModel::Model M = getTargetMachine().getCodeModel();
   7467 
   7468   if (Subtarget->isPICStyleRIPRel() &&
   7469       (M == CodeModel::Small || M == CodeModel::Kernel))
   7470     WrapperKind = X86ISD::WrapperRIP;
   7471   else if (Subtarget->isPICStyleGOT())
   7472     OpFlag = X86II::MO_GOTOFF;
   7473   else if (Subtarget->isPICStyleStubPIC())
   7474     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   7475 
   7476   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
   7477                                           OpFlag);
   7478   DebugLoc DL = JT->getDebugLoc();
   7479   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7480 
   7481   // With PIC, the address is actually $g + Offset.
   7482   if (OpFlag)
   7483     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7484                          DAG.getNode(X86ISD::GlobalBaseReg,
   7485                                      DebugLoc(), getPointerTy()),
   7486                          Result);
   7487 
   7488   return Result;
   7489 }
   7490 
   7491 SDValue
   7492 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   7493   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   7494 
   7495   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   7496   // global base reg.
   7497   unsigned char OpFlag = 0;
   7498   unsigned WrapperKind = X86ISD::Wrapper;
   7499   CodeModel::Model M = getTargetMachine().getCodeModel();
   7500 
   7501   if (Subtarget->isPICStyleRIPRel() &&
   7502       (M == CodeModel::Small || M == CodeModel::Kernel)) {
   7503     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
   7504       OpFlag = X86II::MO_GOTPCREL;
   7505     WrapperKind = X86ISD::WrapperRIP;
   7506   } else if (Subtarget->isPICStyleGOT()) {
   7507     OpFlag = X86II::MO_GOT;
   7508   } else if (Subtarget->isPICStyleStubPIC()) {
   7509     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
   7510   } else if (Subtarget->isPICStyleStubNoDynamic()) {
   7511     OpFlag = X86II::MO_DARWIN_NONLAZY;
   7512   }
   7513 
   7514   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
   7515 
   7516   DebugLoc DL = Op.getDebugLoc();
   7517   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   7518 
   7519   // With PIC, the address is actually $g + Offset.
   7520   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   7521       !Subtarget->is64Bit()) {
   7522     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   7523                          DAG.getNode(X86ISD::GlobalBaseReg,
   7524                                      DebugLoc(), getPointerTy()),
   7525                          Result);
   7526   }
   7527 
   7528   // For symbols that require a load from a stub to get the address, emit the
   7529   // load.
   7530   if (isGlobalStubReference(OpFlag))
   7531     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
   7532                          MachinePointerInfo::getGOT(), false, false, false, 0);
   7533 
   7534   return Result;
   7535 }
   7536 
   7537 SDValue
   7538 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   7539   // Create the TargetBlockAddressAddress node.
   7540   unsigned char OpFlags =
   7541     Subtarget->ClassifyBlockAddressReference();
   7542   CodeModel::Model M = getTargetMachine().getCodeModel();
   7543   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   7544   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   7545   DebugLoc dl = Op.getDebugLoc();
   7546   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
   7547                                              OpFlags);
   7548 
   7549   if (Subtarget->isPICStyleRIPRel() &&
   7550       (M == CodeModel::Small || M == CodeModel::Kernel))
   7551     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   7552   else
   7553     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   7554 
   7555   // With PIC, the address is actually $g + Offset.
   7556   if (isGlobalRelativeToPICBase(OpFlags)) {
   7557     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   7558                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   7559                          Result);
   7560   }
   7561 
   7562   return Result;
   7563 }
   7564 
   7565 SDValue
   7566 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
   7567                                       int64_t Offset, SelectionDAG &DAG) const {
   7568   // Create the TargetGlobalAddress node, folding in the constant
   7569   // offset if it is legal.
   7570   unsigned char OpFlags =
   7571     Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
   7572   CodeModel::Model M = getTargetMachine().getCodeModel();