      1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "X86ISelLowering.h"
     16 #include "Utils/X86ShuffleDecode.h"
     17 #include "X86CallingConv.h"
     18 #include "X86InstrBuilder.h"
     19 #include "X86MachineFunctionInfo.h"
     20 #include "X86TargetMachine.h"
     21 #include "X86TargetObjectFile.h"
     22 #include "llvm/ADT/SmallSet.h"
     23 #include "llvm/ADT/Statistic.h"
     24 #include "llvm/ADT/StringExtras.h"
     25 #include "llvm/ADT/StringSwitch.h"
     26 #include "llvm/ADT/VariadicFunction.h"
     27 #include "llvm/CodeGen/IntrinsicLowering.h"
     28 #include "llvm/CodeGen/MachineFrameInfo.h"
     29 #include "llvm/CodeGen/MachineFunction.h"
     30 #include "llvm/CodeGen/MachineInstrBuilder.h"
     31 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     32 #include "llvm/CodeGen/MachineModuleInfo.h"
     33 #include "llvm/CodeGen/MachineRegisterInfo.h"
     34 #include "llvm/IR/CallSite.h"
     35 #include "llvm/IR/CallingConv.h"
     36 #include "llvm/IR/Constants.h"
     37 #include "llvm/IR/DerivedTypes.h"
     38 #include "llvm/IR/Function.h"
     39 #include "llvm/IR/GlobalAlias.h"
     40 #include "llvm/IR/GlobalVariable.h"
     41 #include "llvm/IR/Instructions.h"
     42 #include "llvm/IR/Intrinsics.h"
     43 #include "llvm/MC/MCAsmInfo.h"
     44 #include "llvm/MC/MCContext.h"
     45 #include "llvm/MC/MCExpr.h"
     46 #include "llvm/MC/MCSymbol.h"
     47 #include "llvm/Support/CommandLine.h"
     48 #include "llvm/Support/Debug.h"
     49 #include "llvm/Support/ErrorHandling.h"
     50 #include "llvm/Support/MathExtras.h"
     51 #include "llvm/Target/TargetOptions.h"
     52 #include <bitset>
     53 #include <numeric>
     54 #include <cctype>
     55 using namespace llvm;
     56 
     57 #define DEBUG_TYPE "x86-isel"
     58 
     59 STATISTIC(NumTailCalls, "Number of tail calls");
     60 
     61 static cl::opt<bool> ExperimentalVectorWideningLegalization(
     62     "x86-experimental-vector-widening-legalization", cl::init(false),
     63     cl::desc("Enable an experimental vector type legalization through widening "
     64              "rather than promotion."),
     65     cl::Hidden);
     66 
     67 static cl::opt<bool> ExperimentalVectorShuffleLowering(
     68     "x86-experimental-vector-shuffle-lowering", cl::init(false),
     69     cl::desc("Enable an experimental vector shuffle lowering code path."),
     70     cl::Hidden);
     71 
     72 // Forward declarations.
     73 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
     74                        SDValue V2);
     75 
     76 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
     77                                 SelectionDAG &DAG, SDLoc dl,
     78                                 unsigned vectorWidth) {
     79   assert((vectorWidth == 128 || vectorWidth == 256) &&
     80          "Unsupported vector width");
     81   EVT VT = Vec.getValueType();
     82   EVT ElVT = VT.getVectorElementType();
     83   unsigned Factor = VT.getSizeInBits()/vectorWidth;
     84   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
     85                                   VT.getVectorNumElements()/Factor);
     86 
     87   // Extract from UNDEF is UNDEF.
     88   if (Vec.getOpcode() == ISD::UNDEF)
     89     return DAG.getUNDEF(ResultVT);
     90 
     91   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
     92   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
     93 
     94   // This is the index of the first element of the vectorWidth-bit chunk
     95   // we want.
     96   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
     97                                * ElemsPerChunk);
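  // A worked example (editor's illustration, values chosen arbitrarily):
  // extracting element 5 of a 256-bit v8i32 with vectorWidth == 128 gives
  // ElemsPerChunk = 128/32 = 4 and NormalizedIdxVal = ((5*32)/128)*4 = 4,
  // i.e. the returned v4i32 chunk starts at element 4 (the upper half).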
     98 
     99   // If the input is a buildvector, just emit a smaller one.
    100   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    101     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
    102                        makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
    103                                     ElemsPerChunk));
    104 
    105   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
    106   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
    107                                VecIdx);
    108 
    109   return Result;
    110 
    111 }
    112 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
    113 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
    114 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
    115 /// instructions or a simple subregister reference. Idx is an index in the
    116 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
    117 /// lowering EXTRACT_VECTOR_ELT operations easier.
    118 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
    119                                    SelectionDAG &DAG, SDLoc dl) {
    120   assert((Vec.getValueType().is256BitVector() ||
    121           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
    122   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
    123 }
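
// Editor's usage sketch (not from the original file; the variable name V8f32
// is hypothetical): pulling the upper 128-bit lane out of a 256-bit vector,
//
//   SDValue Upper = Extract128BitVector(V8f32, 4, DAG, dl);
//
// produces an EXTRACT_SUBVECTOR that instruction selection can match to
// "vextractf128 $1, %ymm, %xmm", or to a plain subregister copy when the
// index falls in the low lane.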
    124 
    125 /// Generate a DAG to grab 256-bits from a 512-bit vector.
    126 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
    127                                    SelectionDAG &DAG, SDLoc dl) {
    128   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
    129   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
    130 }
    131 
    132 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
    133                                unsigned IdxVal, SelectionDAG &DAG,
    134                                SDLoc dl, unsigned vectorWidth) {
    135   assert((vectorWidth == 128 || vectorWidth == 256) &&
    136          "Unsupported vector width");
    137   // Inserting UNDEF leaves Result unchanged.
    138   if (Vec.getOpcode() == ISD::UNDEF)
    139     return Result;
    140   EVT VT = Vec.getValueType();
    141   EVT ElVT = VT.getVectorElementType();
    142   EVT ResultVT = Result.getValueType();
    143 
    144   // Insert the relevant vectorWidth bits.
    145   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
    146 
    147   // This is the index of the first element of the vectorWidth-bit chunk
    148   // we want.
    149   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
    150                                * ElemsPerChunk);
    151 
    152   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
    153   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
    154                      VecIdx);
    155 }
    156 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
    157 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
    158 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
    159 /// simple superregister reference.  Idx is an index in the 128 bits
    160 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
    161 /// lowering INSERT_VECTOR_ELT operations easier.
    162 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
    163                                   unsigned IdxVal, SelectionDAG &DAG,
    164                                   SDLoc dl) {
    165   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
    166   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
    167 }
    168 
    169 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
    170                                   unsigned IdxVal, SelectionDAG &DAG,
    171                                   SDLoc dl) {
    172   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
    173   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
    174 }
    175 
    176 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
    177 /// instructions. This is used because creating CONCAT_VECTORS nodes of
    178 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
    179 /// large BUILD_VECTORS.
    180 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
    181                                    unsigned NumElems, SelectionDAG &DAG,
    182                                    SDLoc dl) {
    183   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
    184   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
    185 }
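
// Editor's illustrative helper, not part of the original lowering code: it
// shows how the extract/insert helpers above compose.  The name is
// hypothetical and the function is deliberately unused (LLVM_ATTRIBUTE_UNUSED
// comes from Support/Compiler.h, which the headers above pull in
// transitively).  Vec is assumed to be a 256-bit vector.
LLVM_ATTRIBUTE_UNUSED
static SDValue SplitAndConcat256BitVector(SDValue Vec, SelectionDAG &DAG,
                                          SDLoc dl) {
  EVT VT = Vec.getValueType();                    // e.g. MVT::v8i32
  unsigned NumElems = VT.getVectorNumElements();
  SDValue Lo = Extract128BitVector(Vec, 0, DAG, dl);            // low lane
  SDValue Hi = Extract128BitVector(Vec, NumElems / 2, DAG, dl); // high lane
  return Concat128BitVectors(Lo, Hi, VT, NumElems, DAG, dl);
}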
    186 
    187 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
    188                                    unsigned NumElems, SelectionDAG &DAG,
    189                                    SDLoc dl) {
    190   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
    191   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
    192 }
    193 
    194 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
    195   if (TT.isOSBinFormatMachO()) {
    196     if (TT.getArch() == Triple::x86_64)
    197       return new X86_64MachoTargetObjectFile();
    198     return new TargetLoweringObjectFileMachO();
    199   }
    200 
    201   if (TT.isOSLinux())
    202     return new X86LinuxTargetObjectFile();
    203   if (TT.isOSBinFormatELF())
    204     return new TargetLoweringObjectFileELF();
    205   if (TT.isKnownWindowsMSVCEnvironment())
    206     return new X86WindowsTargetObjectFile();
    207   if (TT.isOSBinFormatCOFF())
    208     return new TargetLoweringObjectFileCOFF();
    209   llvm_unreachable("unknown subtarget type");
    210 }
    211 
    212 // FIXME: This should stop caching the target machine as soon as
    213 // we can remove resetOperationActions et al.
    214 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    215   : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
    216   Subtarget = &TM.getSubtarget<X86Subtarget>();
    217   X86ScalarSSEf64 = Subtarget->hasSSE2();
    218   X86ScalarSSEf32 = Subtarget->hasSSE1();
    219   TD = getDataLayout();
    220 
    221   resetOperationActions();
    222 }
    223 
    224 void X86TargetLowering::resetOperationActions() {
    225   const TargetMachine &TM = getTargetMachine();
    226   static bool FirstTimeThrough = true;
    227 
    228   // If none of the target options have changed, then we don't need to reset the
    229   // operation actions.
    230   if (!FirstTimeThrough && TO == TM.Options) return;
    231 
    232   if (!FirstTimeThrough) {
    233     // Reinitialize the actions.
    234     initActions();
    235     FirstTimeThrough = false;
    236   }
    237 
    238   TO = TM.Options;
    239 
    240   // Set up the TargetLowering object.
    241   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
    242 
    243   // X86 is weird: it always uses i8 for shift amounts and setcc results.
    244   setBooleanContents(ZeroOrOneBooleanContent);
    245   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    246   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
    247 
    248   // For 64-bit, since we have so many registers, use the ILP scheduler; for
    249   // 32-bit code, use register-pressure-specific scheduling.
    250   // For Atom, always use ILP scheduling.
    251   if (Subtarget->isAtom())
    252     setSchedulingPreference(Sched::ILP);
    253   else if (Subtarget->is64Bit())
    254     setSchedulingPreference(Sched::ILP);
    255   else
    256     setSchedulingPreference(Sched::RegPressure);
    257   const X86RegisterInfo *RegInfo =
    258     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
    259   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
    260 
    261   // Bypass expensive divides on Atom when compiling with O2
    262   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
    263     addBypassSlowDiv(32, 8);
    264     if (Subtarget->is64Bit())
    265       addBypassSlowDiv(64, 16);
    266   }
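
  // A rough sketch of what the bypass means (editor's illustration; the IR
  // below is schematic, not what the pass literally emits):
  //
  //   %q = udiv i32 %a, %b
  //     =>
  //   if ((%a | %b) >> 8 == 0)                        ; operands fit in 8 bits
  //     %q = zext i8 (udiv i8 (trunc %a), (trunc %b)) ; fast narrow divide
  //   else
  //     %q = udiv i32 %a, %b                          ; original slow divide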
    267 
    268   if (Subtarget->isTargetKnownWindowsMSVC()) {
    269     // Set up Windows compiler runtime calls.
    270     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    271     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    272     setLibcallName(RTLIB::SREM_I64, "_allrem");
    273     setLibcallName(RTLIB::UREM_I64, "_aullrem");
    274     setLibcallName(RTLIB::MUL_I64, "_allmul");
    275     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    276     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    277     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    278     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    279     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    280 
    281     // The _ftol2 runtime function has an unusual calling conv, which
    282     // is modeled by a special pseudo-instruction.
    283     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
    284     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
    285     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
    286     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
    287   }
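
  // For example (editor's note): with the mappings above, a 64-bit signed
  // division in 32-bit MSVC-targeted code is emitted as a call to the CRT
  // helper _alldiv using the stdcall convention, rather than being expanded
  // inline.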
    288 
    289   if (Subtarget->isTargetDarwin()) {
    290     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    291     setUseUnderscoreSetJmp(false);
    292     setUseUnderscoreLongJmp(false);
    293   } else if (Subtarget->isTargetWindowsGNU()) {
    294     // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
    295     setUseUnderscoreSetJmp(true);
    296     setUseUnderscoreLongJmp(false);
    297   } else {
    298     setUseUnderscoreSetJmp(true);
    299     setUseUnderscoreLongJmp(true);
    300   }
    301 
    302   // Set up the register classes.
    303   addRegisterClass(MVT::i8, &X86::GR8RegClass);
    304   addRegisterClass(MVT::i16, &X86::GR16RegClass);
    305   addRegisterClass(MVT::i32, &X86::GR32RegClass);
    306   if (Subtarget->is64Bit())
    307     addRegisterClass(MVT::i64, &X86::GR64RegClass);
    308 
    309   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
    310 
    311   // We don't accept any truncstore of integer registers.
    312   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    313   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    314   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    315   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    316   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
    317   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
    318 
    319   // SETOEQ and SETUNE require checking two conditions.
    320   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
    321   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
    322   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
    323   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
    324   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
    325   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
    326 
    327   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    328   // operation.
    329   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
    330   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
    331   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
    332 
    333   if (Subtarget->is64Bit()) {
    334     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    335     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    336   } else if (!TM.Options.UseSoftFloat) {
    337     // We have an algorithm for SSE2->double, and we turn this into a
    338     // 64-bit FILD followed by conditional FADD for other targets.
    339     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    340     // We have an algorithm for SSE2, and we turn this into a 64-bit
    341     // FILD for other targets.
    342     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    343   }
    344 
    345   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    346   // this operation.
    347   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    348   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
    349 
    350   if (!TM.Options.UseSoftFloat) {
    351     // SSE has no i16 to fp conversion, only i32
    352     if (X86ScalarSSEf32) {
    353       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    354       // f32 and f64 cases are Legal, f80 case is not
    355       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    356     } else {
    357       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    358       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    359     }
    360   } else {
    361     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    362     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
    363   }
    364 
    365   // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
    366   // are Legal, f80 is custom lowered.
    367   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    368   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    369 
    370   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
    371   // this operation.
    372   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
    373   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
    374 
    375   if (X86ScalarSSEf32) {
    376     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    377     // f32 and f64 cases are Legal, f80 case is not
    378     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    379   } else {
    380     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    381     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    382   }
    383 
    384   // Handle FP_TO_UINT by promoting the destination to a larger signed
    385   // conversion.
    386   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
    387   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
    388   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
    389 
    390   if (Subtarget->is64Bit()) {
    391     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    392     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
    393   } else if (!TM.Options.UseSoftFloat) {
    394     // Since AVX is a superset of SSE3, only check for SSE here.
    395     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
    396       // Expand FP_TO_UINT into a select.
    397       // FIXME: We would like to use a Custom expander here eventually to do
    398       // the optimal thing for SSE vs. the default expansion in the legalizer.
    399       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    400     else
    401       // With SSE3 we can use fisttpll to convert to a signed i64; without
    402       // SSE, we're stuck with a fistpll.
    403       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    404   }
    405 
    406   if (isTargetFTOL()) {
    407     // Use the _ftol2 runtime function, which has a pseudo-instruction
    408     // to handle its weird calling convention.
    409     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
    410   }
    411 
    412   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
    413   if (!X86ScalarSSEf64) {
    414     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    415     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    416     if (Subtarget->is64Bit()) {
    417       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
    418       // Without SSE, i64->f64 goes through memory.
    419       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    420     }
    421   }
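
  // Concretely (editor's sketch of what Expand does here): without SSE there
  // is no register-to-register path between an FP value and a same-sized
  // integer (e.g. i32 <-> f32), so the bitcast is expanded into a store to a
  // stack temporary followed by a reload in the other register class.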
    422 
    423   // Scalar integer divide and remainder are lowered to use operations that
    424   // produce two results, to match the available instructions. This exposes
    425   // the two-result form to trivial CSE, which is able to combine x/y and x%y
    426   // into a single instruction.
    427   //
    428   // Scalar integer multiply-high is also lowered to use two-result
    429   // operations, to match the available instructions. However, plain multiply
    430   // (low) operations are left as Legal, as there are single-result
    431   // instructions for this in x86. Using the two-result multiply instructions
    432   // when both high and low results are needed must be arranged by dagcombine.
    433   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    434     MVT VT = IntVTs[i];
    435     setOperationAction(ISD::MULHS, VT, Expand);
    436     setOperationAction(ISD::MULHU, VT, Expand);
    437     setOperationAction(ISD::SDIV, VT, Expand);
    438     setOperationAction(ISD::UDIV, VT, Expand);
    439     setOperationAction(ISD::SREM, VT, Expand);
    440     setOperationAction(ISD::UREM, VT, Expand);
    441 
    442     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    443     setOperationAction(ISD::ADDC, VT, Custom);
    444     setOperationAction(ISD::ADDE, VT, Custom);
    445     setOperationAction(ISD::SUBC, VT, Custom);
    446     setOperationAction(ISD::SUBE, VT, Custom);
    447   }
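
  // Editor's illustration of the two-result point above: for source such as
  //
  //   int q = x / y;
  //   int r = x % y;
  //
  // expanding SDIV and SREM lets CSE merge both into one ISD::SDIVREM node,
  // which matches a single idiv that leaves the quotient in EAX and the
  // remainder in EDX.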
    448 
    449   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
    450   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
    451   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
    452   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
    453   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
    454   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
    455   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
    456   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
    457   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
    458   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
    459   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
    460   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
    461   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
    462   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
    463   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
    464   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
    465   if (Subtarget->is64Bit())
    466     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    467   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
    468   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
    469   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
    470   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
    471   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
    472   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
    473   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    474   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
    475 
    476   // Promote the i8 variants and force them up to i32, which has a shorter
    477   // encoding.
    478   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
    479   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
    480   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
    481   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
    482   if (Subtarget->hasBMI()) {
    483     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    484     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    485     if (Subtarget->is64Bit())
    486       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
    487   } else {
    488     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    489     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    490     if (Subtarget->is64Bit())
    491       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    492   }
    493 
    494   if (Subtarget->hasLZCNT()) {
    495     // When promoting the i8 variants, force them to i32 for a shorter
    496     // encoding.
    497     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    498     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    499     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    500     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    501     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    502     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    503     if (Subtarget->is64Bit())
    504       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
    505   } else {
    506     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    507     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    508     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    509     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    510     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    511     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    512     if (Subtarget->is64Bit()) {
    513       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
    514       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    515     }
    516   }
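
  // Editor's note on what promoting the i8 variants to i32 amounts to
  // semantically (a sketch, not the legalizer's literal output): for ctlz,
  // the count is taken on the zero-extended value and the 24 extra leading
  // zeros are subtracted off, i.e. ctlz.i8(x) == ctlz.i32(zext x) - 24,
  // which holds even for x == 0 (32 - 24 == 8).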
    517 
    518   // Special handling for half-precision floating point conversions.
    519   // If we don't have F16C support, then lower half float conversions
    520   // into library calls.
    521   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
    522     setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
    523     setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
    524   }
    525 
    526   if (Subtarget->hasPOPCNT()) {
    527     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    528   } else {
    529     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    530     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    531     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    532     if (Subtarget->is64Bit())
    533       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    534   }
    535 
    536   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    537 
    538   if (!Subtarget->hasMOVBE())
    539     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
    540 
    541   // These should be promoted to a larger select which is supported.
    542   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
    543   // X86 wants to expand cmov itself.
    544   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
    545   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
    546   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
    547   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
    548   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
    549   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
    550   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
    551   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
    552   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
    553   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
    554   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
    555   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
    556   if (Subtarget->is64Bit()) {
    557     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    558     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
    559   }
    560   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
    561   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here are NOT intended to support
    562   // SjLj exception handling, but rather to provide a light-weight
    563   // setjmp/longjmp replacement for continuations, user-level threading, etc.
    564   // As a result, no other SjLj exception interfaces are implemented, so
    565   // please don't build your own exception handling on top of them.
    566   // LLVM/Clang supports zero-cost DWARF exception handling.
    567   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
    568   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
    569 
    570   // Darwin ABI issue.
    571   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
    572   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
    573   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
    574   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
    575   if (Subtarget->is64Bit())
    576     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
    577   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
    578   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
    579   if (Subtarget->is64Bit()) {
    580     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    581     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    582     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    583     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    584     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
    585   }
    586   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
    587   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
    588   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
    589   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
    590   if (Subtarget->is64Bit()) {
    591     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    592     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    593     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
    594   }
    595 
    596   if (Subtarget->hasSSE1())
    597     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
    598 
    599   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
    600 
    601   // Expand certain atomics
    602   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    603     MVT VT = IntVTs[i];
    604     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    605     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    606     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    607   }
    608 
    609   if (Subtarget->hasCmpxchg16b()) {
    610     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
    611   }
    612 
    613   // FIXME - use subtarget debug flags
    614   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
    615       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    616     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
    617   }
    618 
    619   if (Subtarget->is64Bit()) {
    620     setExceptionPointerRegister(X86::RAX);
    621     setExceptionSelectorRegister(X86::RDX);
    622   } else {
    623     setExceptionPointerRegister(X86::EAX);
    624     setExceptionSelectorRegister(X86::EDX);
    625   }
    626   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
    627   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
    628 
    629   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
    630   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
    631 
    632   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    633   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
    634 
    635   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    636   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    637   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    638   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    639     // TargetInfo::X86_64ABIBuiltinVaList
    640     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    641     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
    642   } else {
    643     // TargetInfo::CharPtrBuiltinVaList
    644     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    645     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
    646   }
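
  // For reference (editor's sketch of the two va_list flavours named above,
  // not code from this file): the SysV x86-64 va_list that makes VAARG and
  // VACOPY worth custom lowering is the four-field record
  //
  //   struct __va_list_tag {
  //     unsigned gp_offset;       // next unused GP register slot
  //     unsigned fp_offset;       // next unused XMM register slot
  //     void *overflow_arg_area;  // stack-passed arguments
  //     void *reg_save_area;      // spilled register arguments
  //   };
  //
  // whereas Win64 and 32-bit x86 use a plain char* cursor, so the generic
  // Expand handling suffices there.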
    647 
    648   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    649   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
    650 
    651   setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    652                      MVT::i64 : MVT::i32, Custom);
    653 
    654   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    655     // f32 and f64 use SSE.
    656     // Set up the FP register classes.
    657     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    658     addRegisterClass(MVT::f64, &X86::FR64RegClass);
    659 
    660     // Use ANDPD to simulate FABS.
    661     setOperationAction(ISD::FABS , MVT::f64, Custom);
    662     setOperationAction(ISD::FABS , MVT::f32, Custom);
    663 
    664     // Use XORP to simulate FNEG.
    665     setOperationAction(ISD::FNEG , MVT::f64, Custom);
    666     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    667 
    668     // Use ANDPD and ORPD to simulate FCOPYSIGN.
    669     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    670     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    671 
    672     // Lower this to FGETSIGNx86 plus an AND.
    673     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    674     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    675 
    676     // We don't support sin/cos/fmod
    677     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    678     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    679     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    680     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    681     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    682     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    683 
    684     // Expand FP immediates into loads from the stack, except for the special
    685     // cases we handle.
    686     addLegalFPImmediate(APFloat(+0.0)); // xorpd
    687     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    688   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    689     // Use SSE for f32, x87 for f64.
    690     // Set up the FP register classes.
    691     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    692     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    693 
    694     // Use ANDPS to simulate FABS.
    695     setOperationAction(ISD::FABS , MVT::f32, Custom);
    696 
    697     // Use XORP to simulate FNEG.
    698     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    699 
    700     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    701 
    702     // Use ANDPS and ORPS to simulate FCOPYSIGN.
    703     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    704     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    705 
    706     // We don't support sin/cos/fmod
    707     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    708     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    709     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    710 
    711     // Special cases we handle for FP constants.
    712     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    713     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    714     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    715     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    716     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    717 
    718     if (!TM.Options.UnsafeFPMath) {
    719       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    720       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    721       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    722     }
    723   } else if (!TM.Options.UseSoftFloat) {
    724     // f32 and f64 in x87.
    725     // Set up the FP register classes.
    726     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    727     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
    728 
    729     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    730     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    731     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    732     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    733 
    734     if (!TM.Options.UnsafeFPMath) {
    735       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    736       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    737       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    738       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    739       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    740       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    741     }
    742     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    743     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    744     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    745     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    746     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    747     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    748     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    749     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    750   }
    751 
    752   // We don't support FMA.
    753   setOperationAction(ISD::FMA, MVT::f64, Expand);
    754   setOperationAction(ISD::FMA, MVT::f32, Expand);
    755 
    756   // Long double always uses X87.
    757   if (!TM.Options.UseSoftFloat) {
    758     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    759     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    760     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    761     {
    762       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    763       addLegalFPImmediate(TmpFlt);  // FLD0
    764       TmpFlt.changeSign();
    765       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
    766 
    767       bool ignored;
    768       APFloat TmpFlt2(+1.0);
    769       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
    770                       &ignored);
    771       addLegalFPImmediate(TmpFlt2);  // FLD1
    772       TmpFlt2.changeSign();
    773       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    774     }
    775 
    776     if (!TM.Options.UnsafeFPMath) {
    777       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
    778       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
    779       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    780     }
    781 
    782     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    783     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    784     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    785     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    786     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    787     setOperationAction(ISD::FMA, MVT::f80, Expand);
    788   }
    789 
    790   // Always use a library call for pow.
    791   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
    792   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    793   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
    794 
    795   setOperationAction(ISD::FLOG, MVT::f80, Expand);
    796   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
    797   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    798   setOperationAction(ISD::FEXP, MVT::f80, Expand);
    799   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
    800 
    801   // First set operation action for all vector types to either promote
    802   // (for widening) or expand (for scalarization). Then we will selectively
    803   // turn on ones that can be effectively codegen'd.
    804   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
    805            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    806     MVT VT = (MVT::SimpleValueType)i;
    807     setOperationAction(ISD::ADD , VT, Expand);
    808     setOperationAction(ISD::SUB , VT, Expand);
    809     setOperationAction(ISD::FADD, VT, Expand);
    810     setOperationAction(ISD::FNEG, VT, Expand);
    811     setOperationAction(ISD::FSUB, VT, Expand);
    812     setOperationAction(ISD::MUL , VT, Expand);
    813     setOperationAction(ISD::FMUL, VT, Expand);
    814     setOperationAction(ISD::SDIV, VT, Expand);
    815     setOperationAction(ISD::UDIV, VT, Expand);
    816     setOperationAction(ISD::FDIV, VT, Expand);
    817     setOperationAction(ISD::SREM, VT, Expand);
    818     setOperationAction(ISD::UREM, VT, Expand);
    819     setOperationAction(ISD::LOAD, VT, Expand);
    820     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    821     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    822     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    823     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    824     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    825     setOperationAction(ISD::FABS, VT, Expand);
    826     setOperationAction(ISD::FSIN, VT, Expand);
    827     setOperationAction(ISD::FSINCOS, VT, Expand);
    828     setOperationAction(ISD::FCOS, VT, Expand);
    829     setOperationAction(ISD::FSINCOS, VT, Expand);
    830     setOperationAction(ISD::FREM, VT, Expand);
    831     setOperationAction(ISD::FMA,  VT, Expand);
    832     setOperationAction(ISD::FPOWI, VT, Expand);
    833     setOperationAction(ISD::FSQRT, VT, Expand);
    834     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    835     setOperationAction(ISD::FFLOOR, VT, Expand);
    836     setOperationAction(ISD::FCEIL, VT, Expand);
    837     setOperationAction(ISD::FTRUNC, VT, Expand);
    838     setOperationAction(ISD::FRINT, VT, Expand);
    839     setOperationAction(ISD::FNEARBYINT, VT, Expand);
    840     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    841     setOperationAction(ISD::MULHS, VT, Expand);
    842     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    843     setOperationAction(ISD::MULHU, VT, Expand);
    844     setOperationAction(ISD::SDIVREM, VT, Expand);
    845     setOperationAction(ISD::UDIVREM, VT, Expand);
    846     setOperationAction(ISD::FPOW, VT, Expand);
    847     setOperationAction(ISD::CTPOP, VT, Expand);
    848     setOperationAction(ISD::CTTZ, VT, Expand);
    849     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    850     setOperationAction(ISD::CTLZ, VT, Expand);
    851     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    852     setOperationAction(ISD::SHL, VT, Expand);
    853     setOperationAction(ISD::SRA, VT, Expand);
    854     setOperationAction(ISD::SRL, VT, Expand);
    855     setOperationAction(ISD::ROTL, VT, Expand);
    856     setOperationAction(ISD::ROTR, VT, Expand);
    857     setOperationAction(ISD::BSWAP, VT, Expand);
    858     setOperationAction(ISD::SETCC, VT, Expand);
    859     setOperationAction(ISD::FLOG, VT, Expand);
    860     setOperationAction(ISD::FLOG2, VT, Expand);
    861     setOperationAction(ISD::FLOG10, VT, Expand);
    862     setOperationAction(ISD::FEXP, VT, Expand);
    863     setOperationAction(ISD::FEXP2, VT, Expand);
    864     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    865     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    866     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    867     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    868     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    869     setOperationAction(ISD::TRUNCATE, VT, Expand);
    870     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    871     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    872     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    873     setOperationAction(ISD::VSELECT, VT, Expand);
    874     setOperationAction(ISD::SELECT_CC, VT, Expand);
    875     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
    876              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
    877       setTruncStoreAction(VT,
    878                           (MVT::SimpleValueType)InnerVT, Expand);
    879     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    880     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
    881     setLoadExtAction(ISD::EXTLOAD, VT, Expand);
    882   }
    883 
    884   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
    885   // with -msoft-float, disable use of MMX as well.
    886   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    887     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    888     // No operations on x86mmx are supported; everything uses intrinsics.
    889   }
    890 
    891   // MMX-sized vectors (other than x86mmx) are expected to be expanded
    892   // into smaller operations.
    893   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
    894   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
    895   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
    896   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
    897   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
    898   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
    899   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
    900   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
    901   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
    902   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
    903   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
    904   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
    905   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
    906   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
    907   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
    908   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
    909   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
    910   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
    911   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
    912   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
    913   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
    914   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
    915   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
    916   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
    917   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
    918   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
    919   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
    920   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
    921   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
    922 
    923   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    924     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
    925 
    926     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    927     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    928     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    929     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    930     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    931     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    932     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    933     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    934     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    935     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    936     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    937     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    938   }
    939 
    940   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    941     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
    942 
    943     // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    944     // registers cannot be used even for integer operations.
    945     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    946     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    947     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    948     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
    949 
    950     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    951     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    952     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    953     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    954     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    955     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    956     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
    957     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
    958     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    959     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    960     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    961     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    962     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    963     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    964     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    965     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    966     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    967     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    968     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    969     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    970     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    971     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    972 
    973     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    974     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    975     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    976     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
    977 
    978     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    979     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    980     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    981     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    982     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    983 
    984     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    985     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
    986       MVT VT = (MVT::SimpleValueType)i;
    987       // Do not attempt to custom lower non-power-of-2 vectors
    988       if (!isPowerOf2_32(VT.getVectorNumElements()))
    989         continue;
    990       // Do not attempt to custom lower non-128-bit vectors
    991       if (!VT.is128BitVector())
    992         continue;
    993       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
    994       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
    995       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    996     }
    997 
    998     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    999     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
   1000     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
   1001     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
   1002     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
   1003     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
   1004 
   1005     if (Subtarget->is64Bit()) {
   1006       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
   1007       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
   1008     }
   1009 
   1010     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
   1011     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
   1012       MVT VT = (MVT::SimpleValueType)i;
   1013 
   1014       // Do not attempt to promote non-128-bit vectors
   1015       if (!VT.is128BitVector())
   1016         continue;
   1017 
   1018       setOperationAction(ISD::AND,    VT, Promote);
   1019       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
   1020       setOperationAction(ISD::OR,     VT, Promote);
   1021       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
   1022       setOperationAction(ISD::XOR,    VT, Promote);
   1023       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
   1024       setOperationAction(ISD::LOAD,   VT, Promote);
   1025       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
   1026       setOperationAction(ISD::SELECT, VT, Promote);
   1027       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
   1028     }
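
    // Editor's sketch of what the Promote entries above mean (illustrative
    // DAG, not lifted from the legalizer): a v4i32 AND is rewritten to use
    // the v2i64 form of the same 128-bit register,
    //
    //   (and v4i32 %a, %b)
    //     ~~> (bitcast v4i32 (and v2i64 (bitcast v2i64 %a), (bitcast v2i64 %b)))
    //
    // which costs nothing because PAND operates on all 128 bits regardless of
    // element type.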
   1029 
   1030     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   1031 
   1032     // Custom lower v2i64 and v2f64 selects.
   1033     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
   1034     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
   1035     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
   1036     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
   1037 
   1038     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
   1039     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
   1040 
   1041     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
   1042     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
   1043     // As there is no 64-bit GPR available, we need to build a special custom
   1044     // sequence to convert from v2i32 to v2f32.
   1045     if (!Subtarget->is64Bit())
   1046       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
   1047 
   1048     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
   1049     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
   1050 
   1051     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
   1052 
   1053     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
   1054     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
   1055     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
   1056   }
   1057 
   1058   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
   1059     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
   1060     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
   1061     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
   1062     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
   1063     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
   1064     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
   1065     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
   1066     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
   1067     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
   1068     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
   1069 
   1070     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
   1071     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
   1072     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
   1073     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
   1074     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
   1075     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
   1076     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
   1077     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
   1078     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
   1079     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
   1080 
   1081     // FIXME: Do we need to handle scalar-to-vector here?
   1082     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
   1083 
   1084     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
   1085     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
   1086     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
   1087     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
   1088     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
   1089     // There is no BLENDI for byte vectors, but PBLENDVB covers them, so v16i8
   1090     // vselects stay Legal and need no custom lowering for now.
   1091     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
   1092 
   1093     // i8 and i16 vectors are custom because the source register and
   1094     // memory operand types are not the same width.  f32 vectors are
   1095     // custom since the immediate controlling the insert encodes additional
   1096     // information.
   1097     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
   1098     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
   1099     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
   1100     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
   1101 
   1102     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
   1103     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
   1104     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
   1105     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
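            // For illustration: (insert_vector_elt v4f32:$dst, f32:$elt, 2) is matched
            // to INSERTPS, whose immediate selects the source lane, the destination
            // lane and an optional zero mask, so a simple one-to-one isel pattern is
            // not sufficient and the node is marked Custom.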
   1106 
   1107     // FIXME: These should be Legal, but that's only for the case where the
   1108     // index is constant.  For now, custom lower to handle variable indices.
   1109     if (Subtarget->is64Bit()) {
   1110       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
   1111       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
   1112     }
   1113   }
   1114 
   1115   if (Subtarget->hasSSE2()) {
   1116     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
   1117     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
   1118 
   1119     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
   1120     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
   1121 
   1122     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
   1123     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
   1124 
   1125     // In the customized shift lowering, the legal cases in AVX2 will be
   1126     // recognized.
   1127     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
   1128     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
   1129 
   1130     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
   1131     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
   1132 
   1133     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
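            // For example, a v4i32 shift by a uniform (splat) amount can use a single
            // PSLLD/PSRLD/PSRAD with the count held in an XMM register, while SSE2 has
            // no instruction for per-element variable shift amounts, so the custom
            // lowering decomposes those cases.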
   1134   }
   1135 
   1136   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
   1137     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
   1138     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
   1139     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
   1140     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
   1141     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
   1142     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
   1143 
   1144     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
   1145     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
   1146     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
   1147 
   1148     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
   1149     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
   1150     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
   1151     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
   1152     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
   1153     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
   1154     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
   1155     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
   1156     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
   1157     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
   1158     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
   1159     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
   1160 
   1161     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
   1162     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
   1163     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
   1164     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
   1165     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
   1166     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
   1167     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
   1168     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
   1169     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
   1170     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
   1171     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
   1172     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
   1173 
   1174     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
   1175     // even though v8i16 is a legal type.
   1176     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
   1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
   1178     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
   1179 
   1180     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
   1181     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1182     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
   1183 
   1184     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
   1185     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
   1186 
   1187     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
   1188 
   1189     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
   1190     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
   1191 
   1192     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
   1193     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
   1194 
   1195     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
   1196     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
   1197 
   1198     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
   1199     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
   1200     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
   1201     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
   1202 
   1203     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1204     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1205     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1206 
   1207     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
   1208     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
   1209     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
   1210     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
   1211 
   1212     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
   1213     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
   1214     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
   1215     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
   1216     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
   1217     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
   1218     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
   1219     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
   1220     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
   1221     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
   1222     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
   1223     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
   1224 
   1225     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
   1226       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
   1227       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
   1228       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
   1229       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
   1230       setOperationAction(ISD::FMA,             MVT::f32, Legal);
   1231       setOperationAction(ISD::FMA,             MVT::f64, Legal);
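              // This lets (fadd (fmul a, b), c) be contracted into a single VFMADD*
              // instruction when FMA formation is permitted.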
   1232     }
   1233 
   1234     if (Subtarget->hasInt256()) {
   1235       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
   1236       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
   1237       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
   1238       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
   1239 
   1240       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
   1241       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
   1242       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
   1243       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
   1244 
   1245       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1246       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
   1247       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
   1248       // Don't lower v32i8 because there is no 128-bit byte mul
   1249 
   1250       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
   1251       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
   1252       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
   1253       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
   1254 
   1255       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
   1256       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1257     } else {
   1258       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
   1259       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
   1260       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
   1261       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
   1262 
   1263       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
   1264       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
   1265       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
   1266       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
   1267 
   1268       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1269       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
   1270       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
   1271       // Don't lower v32i8 because there is no 128-bit byte mul
   1272     }
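            // Note: even with AVX2 there is no 64x64->64 vector multiply instruction;
            // the custom v4i64 MUL lowering builds the product from 32x32->64
            // multiplies (PMULUDQ/VPMULUDQ) plus shifts and adds.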
   1273 
   1274     // In the customized shift lowering, the legal cases in AVX2 will be
   1275     // recognized.
   1276     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
   1277     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
   1278 
   1279     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
   1280     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
   1281 
   1282     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
   1283 
   1284     // Custom lower several nodes for 256-bit types.
   1285     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
   1286              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1287       MVT VT = (MVT::SimpleValueType)i;
   1288 
   1289       // Extract subvector is special because the value type
   1290       // (result) is 128-bit but the source is 256-bit wide.
   1291       if (VT.is128BitVector())
   1292         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1293 
   1294       // Do not attempt to custom lower other non-256-bit vectors
   1295       if (!VT.is256BitVector())
   1296         continue;
   1297 
   1298       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1299       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1300       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1301       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1302       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
   1303       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
   1304       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
   1305     }
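            // For example, EXTRACT_SUBVECTOR of the upper 128-bit half of a 256-bit
            // vector is matched to VEXTRACTF128 (VEXTRACTI128 with AVX2), and
            // INSERT_SUBVECTOR to VINSERTF128/VINSERTI128.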
   1306 
   1307     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
   1308     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
   1309       MVT VT = (MVT::SimpleValueType)i;
   1310 
   1311       // Do not attempt to promote non-256-bit vectors
   1312       if (!VT.is256BitVector())
   1313         continue;
   1314 
   1315       setOperationAction(ISD::AND,    VT, Promote);
   1316       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
   1317       setOperationAction(ISD::OR,     VT, Promote);
   1318       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
   1319       setOperationAction(ISD::XOR,    VT, Promote);
   1320       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
   1321       setOperationAction(ISD::LOAD,   VT, Promote);
   1322       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
   1323       setOperationAction(ISD::SELECT, VT, Promote);
   1324       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
   1325     }
   1326   }
   1327 
   1328   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
   1329     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
   1330     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
   1331     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
   1332     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
   1333 
   1334     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
   1335     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
   1336     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
   1337 
   1338     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
   1339     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
   1340     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
   1341     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
   1342     setOperationAction(ISD::AND,                MVT::i1,    Legal);
   1343     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
   1344     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
   1345     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
   1346     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
   1347     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
   1348     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
   1349 
   1350     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
   1351     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
   1352     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
   1353     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
   1354     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
   1355     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
   1356 
   1357     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
   1358     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
   1359     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
   1360     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
   1361     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
   1362     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
   1363     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
   1364     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
   1365 
   1366     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
   1367     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
   1368     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
   1369     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
   1370     if (Subtarget->is64Bit()) {
   1371       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
   1372       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
   1373       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
   1374       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
   1375     }
   1376     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
   1377     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
   1378     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
   1379     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
   1380     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
   1381     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
   1382     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
   1383     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
   1384     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
   1385     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
   1386 
   1387     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
   1388     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
   1389     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
   1390     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
   1391     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
   1392     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
   1393     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
   1394     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
   1395     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
   1396     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
   1397     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
   1398     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
   1399     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
   1400 
   1401     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
   1402     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
   1403     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
   1404     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
   1405     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
   1406     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
   1407 
   1408     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
   1409     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
   1410 
   1411     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
   1412 
   1413     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
   1414     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
   1415     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
   1416     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
   1417     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
   1418     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
   1419     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
   1420     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
   1421     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
   1422 
   1423     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
   1424     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
   1425 
   1426     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
   1427     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
   1428 
   1429     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
   1430 
   1431     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
   1432     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
   1433 
   1434     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
   1435     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
   1436 
   1437     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
   1438     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
   1439 
   1440     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
   1441     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
   1442     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
   1443     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
   1444     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
   1445     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
   1446 
   1447     if (Subtarget->hasCDI()) {
   1448       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
   1449       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
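              // VPLZCNTD/VPLZCNTQ compute the per-element leading-zero count directly,
              // so no expansion is needed.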
   1450     }
   1451 
   1452     // Custom lower several nodes.
   1453     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
   1454              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1455       MVT VT = (MVT::SimpleValueType)i;
   1456 
   1457       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   1458       // Extract subvector is special because the value type
   1459       // (result) is 256/128-bit but the source is 512-bit wide.
   1460       if (VT.is128BitVector() || VT.is256BitVector())
   1461         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1462 
   1463       if (VT.getVectorElementType() == MVT::i1)
   1464         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
   1465 
   1466       // Do not attempt to custom lower other non-512-bit vectors
   1467       if (!VT.is512BitVector())
   1468         continue;
   1469 
   1470       if (EltSize >= 32) {
   1471         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
   1472         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
   1473         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1474         setOperationAction(ISD::VSELECT,             VT, Legal);
   1475         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
   1476         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
   1477         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
   1478       }
   1479     }
   1480     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
   1481       MVT VT = (MVT::SimpleValueType)i;
   1482 
   1483       // Do not attempt to promote non-512-bit vectors
   1484       if (!VT.is512BitVector())
   1485         continue;
   1486 
   1487       setOperationAction(ISD::SELECT, VT, Promote);
   1488       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
   1489     }
   1490   } // has AVX-512
   1491 
   1492   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
   1493   // of this type with custom code.
   1494   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
   1495            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
   1496     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
   1497                        Custom);
   1498   }
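          // A sketch of the usual technique (the exact sequence emitted may differ):
          // sign-extending the low 16 bits of each i32 lane can be done with a left
          // shift by 16 followed by an arithmetic right shift by 16.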
   1499 
   1500   // We want to custom lower some of our intrinsics.
   1501   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1502   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   1503   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   1504   if (!Subtarget->is64Bit())
   1505     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
   1506 
   1507   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1508   // handle type legalization for these operations here.
   1509   //
   1510   // FIXME: We really should do custom legalization for addition and
   1511   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1512   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1513   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
   1514     // Add/Sub/Mul with overflow operations are custom lowered.
   1515     MVT VT = IntVTs[i];
   1516     setOperationAction(ISD::SADDO, VT, Custom);
   1517     setOperationAction(ISD::UADDO, VT, Custom);
   1518     setOperationAction(ISD::SSUBO, VT, Custom);
   1519     setOperationAction(ISD::USUBO, VT, Custom);
   1520     setOperationAction(ISD::SMULO, VT, Custom);
   1521     setOperationAction(ISD::UMULO, VT, Custom);
   1522   }
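          // For example, (saddo i32 %a, %b) is lowered so that the ADD also produces
          // EFLAGS and the overflow result is materialized from OF via SETO; UADDO
          // uses the carry flag via SETB instead.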
   1523 
   1524   // There are no 8-bit 3-address imul/mul instructions
   1525   setOperationAction(ISD::SMULO, MVT::i8, Expand);
   1526   setOperationAction(ISD::UMULO, MVT::i8, Expand);
   1527 
   1528   if (!Subtarget->is64Bit()) {
   1529     // These libcalls are not available in 32-bit.
   1530     setLibcallName(RTLIB::SHL_I128, nullptr);
   1531     setLibcallName(RTLIB::SRL_I128, nullptr);
   1532     setLibcallName(RTLIB::SRA_I128, nullptr);
   1533   }
   1534 
   1535   // Combine sin / cos into one node or libcall if possible.
   1536   if (Subtarget->hasSinCos()) {
   1537     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
   1538     setLibcallName(RTLIB::SINCOS_F64, "sincos");
   1539     if (Subtarget->isTargetDarwin()) {
   1540       // For MacOSX, we don't want the normal expansion of a libcall to
   1541       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
   1542       // traffic.
   1543       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
   1544       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
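              // __sincos_stret returns both results in registers (for double on
              // x86-64, sin in XMM0 and cos in XMM1) instead of through out-pointers,
              // which is what avoids the memory traffic.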
   1545     }
   1546   }
   1547 
   1548   if (Subtarget->isTargetWin64()) {
   1549     setOperationAction(ISD::SDIV, MVT::i128, Custom);
   1550     setOperationAction(ISD::UDIV, MVT::i128, Custom);
   1551     setOperationAction(ISD::SREM, MVT::i128, Custom);
   1552     setOperationAction(ISD::UREM, MVT::i128, Custom);
   1553     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
   1554     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
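            // There is no native 128-bit divide; the custom lowering turns these into
            // calls to the compiler runtime (__divti3 and friends), with the i128
            // arguments passed by reference as the Win64 ABI requires.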
   1555   }
   1556 
   1557   // We have target-specific dag combine patterns for the following nodes:
   1558   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1559   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1560   setTargetDAGCombine(ISD::VSELECT);
   1561   setTargetDAGCombine(ISD::SELECT);
   1562   setTargetDAGCombine(ISD::SHL);
   1563   setTargetDAGCombine(ISD::SRA);
   1564   setTargetDAGCombine(ISD::SRL);
   1565   setTargetDAGCombine(ISD::OR);
   1566   setTargetDAGCombine(ISD::AND);
   1567   setTargetDAGCombine(ISD::ADD);
   1568   setTargetDAGCombine(ISD::FADD);
   1569   setTargetDAGCombine(ISD::FSUB);
   1570   setTargetDAGCombine(ISD::FMA);
   1571   setTargetDAGCombine(ISD::SUB);
   1572   setTargetDAGCombine(ISD::LOAD);
   1573   setTargetDAGCombine(ISD::STORE);
   1574   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1575   setTargetDAGCombine(ISD::ANY_EXTEND);
   1576   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1577   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   1578   setTargetDAGCombine(ISD::TRUNCATE);
   1579   setTargetDAGCombine(ISD::SINT_TO_FP);
   1580   setTargetDAGCombine(ISD::SETCC);
   1581   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   1582   setTargetDAGCombine(ISD::BUILD_VECTOR);
   1583   if (Subtarget->is64Bit())
   1584     setTargetDAGCombine(ISD::MUL);
   1585   setTargetDAGCombine(ISD::XOR);
   1586 
   1587   computeRegisterProperties();
   1588 
   1589   // On Darwin, -Os means optimize for size without hurting performance, so
   1590   // do not reduce the limit.
   1591   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1592   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
   1593   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1594   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1595   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1596   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
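          // For example, with these limits and the vector types chosen by
          // getOptimalMemOpType below, a 64-byte @llvm.memset can be expanded into
          // four 16-byte stores (or two 32-byte stores with AVX) instead of a call.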
   1597   setPrefLoopAlignment(4); // 2^4 bytes.
   1598 
   1599   // Predictable cmovs don't hurt on Atom because it's in-order.
   1600   PredictableSelectIsExpensive = !Subtarget->isAtom();
   1601 
   1602   setPrefFunctionAlignment(4); // 2^4 bytes.
   1603 }
   1604 
   1605 TargetLoweringBase::LegalizeTypeAction
   1606 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
   1607   if (ExperimentalVectorWideningLegalization &&
   1608       VT.getVectorNumElements() != 1 &&
   1609       VT.getVectorElementType().getSimpleVT() != MVT::i1)
   1610     return TypeWidenVector;
   1611 
   1612   return TargetLoweringBase::getPreferredVectorAction(VT);
   1613 }
   1614 
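        // For example, with AVX-512 a compare of two v16f32 values produces a v16i1
        // mask (held in a mask register); without AVX-512 a v4f32 compare instead
        // yields v4i32.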
   1615 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   1616   if (!VT.isVector())
   1617     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
   1618 
   1619   if (Subtarget->hasAVX512())
   1620     switch (VT.getVectorNumElements()) {
   1621     case  8: return MVT::v8i1;
   1622     case 16: return MVT::v16i1;
   1623     }
   1624 
   1625   return VT.changeVectorElementTypeToInteger();
   1626 }
   1627 
   1628 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
   1629 /// the desired ByVal argument alignment.
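        /// For example, a struct containing a <4 x float> member has a 128-bit vector
        /// field, so MaxAlign becomes 16 and the byval argument is placed on a 16-byte
        /// boundary; an aggregate of plain scalars keeps the smaller default.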
   1630 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1631   if (MaxAlign == 16)
   1632     return;
   1633   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1634     if (VTy->getBitWidth() == 128)
   1635       MaxAlign = 16;
   1636   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1637     unsigned EltAlign = 0;
   1638     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1639     if (EltAlign > MaxAlign)
   1640       MaxAlign = EltAlign;
   1641   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1642     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
   1643       unsigned EltAlign = 0;
   1644       getMaxByValAlign(STy->getElementType(i), EltAlign);
   1645       if (EltAlign > MaxAlign)
   1646         MaxAlign = EltAlign;
   1647       if (MaxAlign == 16)
   1648         break;
   1649     }
   1650   }
   1651 }
   1652 
   1653 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
   1654 /// function arguments in the caller parameter area. For X86, aggregates
   1655 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1656 /// are at 4-byte boundaries.
   1657 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   1658   if (Subtarget->is64Bit()) {
   1659     // Max of 8 and alignment of type.
   1660     unsigned TyAlign = TD->getABITypeAlignment(Ty);
   1661     if (TyAlign > 8)
   1662       return TyAlign;
   1663     return 8;
   1664   }
   1665 
   1666   unsigned Align = 4;
   1667   if (Subtarget->hasSSE1())
   1668     getMaxByValAlign(Ty, Align);
   1669   return Align;
   1670 }
   1671 
   1672 /// getOptimalMemOpType - Returns the target specific optimal type for load
   1673 /// and store operations as a result of memset, memcpy, and memmove
   1674 /// lowering. If DstAlign is zero, the destination alignment can be chosen
   1675 /// freely, so any constraint can be satisfied. Similarly, if SrcAlign is zero
   1676 /// there is no need to check it against an alignment requirement,
   1677 /// probably because the source does not need to be loaded. If 'IsMemset' is
   1678 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
   1679 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
   1680 /// source is constant so it does not need to be loaded.
   1681 /// It returns EVT::Other if the type should be determined using generic
   1682 /// target-independent logic.
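        /// For example, a 32-byte memcpy with unknown alignments on a target with fast
        /// unaligned accesses and AVX2 gets MVT::v8i32, so the copy becomes a single
        /// 32-byte load/store pair; without SSE (or under NoImplicitFloat) it falls
        /// back to i64/i32 chunks.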
   1683 EVT
   1684 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1685                                        unsigned DstAlign, unsigned SrcAlign,
   1686                                        bool IsMemset, bool ZeroMemset,
   1687                                        bool MemcpyStrSrc,
   1688                                        MachineFunction &MF) const {
   1689   const Function *F = MF.getFunction();
   1690   if ((!IsMemset || ZeroMemset) &&
   1691       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
   1692                                        Attribute::NoImplicitFloat)) {
   1693     if (Size >= 16 &&
   1694         (Subtarget->isUnalignedMemAccessFast() ||
   1695          ((DstAlign == 0 || DstAlign >= 16) &&
   1696           (SrcAlign == 0 || SrcAlign >= 16)))) {
   1697       if (Size >= 32) {
   1698         if (Subtarget->hasInt256())
   1699           return MVT::v8i32;
   1700         if (Subtarget->hasFp256())
   1701           return MVT::v8f32;
   1702       }
   1703       if (Subtarget->hasSSE2())
   1704         return MVT::v4i32;
   1705       if (Subtarget->hasSSE1())
   1706         return MVT::v4f32;
   1707     } else if (!MemcpyStrSrc && Size >= 8 &&
   1708                !Subtarget->is64Bit() &&
   1709                Subtarget->hasSSE2()) {
   1710       // Do not use f64 to lower memcpy if source is string constant. It's
   1711       // better to use i32 to avoid the loads.
   1712       return MVT::f64;
   1713     }
   1714   }
   1715   if (Subtarget->is64Bit() && Size >= 8)
   1716     return MVT::i64;
   1717   return MVT::i32;
   1718 }
   1719 
   1720 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   1721   if (VT == MVT::f32)
   1722     return X86ScalarSSEf32;
   1723   else if (VT == MVT::f64)
   1724     return X86ScalarSSEf64;
   1725   return true;
   1726 }
   1727 
   1728 bool
   1729 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
   1730                                                  unsigned,
   1731                                                  bool *Fast) const {
   1732   if (Fast)
   1733     *Fast = Subtarget->isUnalignedMemAccessFast();
   1734   return true;
   1735 }
   1736 
   1737 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
   1738 /// current function.  The returned value is a member of the
   1739 /// MachineJumpTableInfo::JTEntryKind enum.
   1740 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1741   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1742   // symbol.
   1743   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1744       Subtarget->isPICStyleGOT())
   1745     return MachineJumpTableInfo::EK_Custom32;
   1746 
   1747   // Otherwise, use the normal jump table encoding heuristics.
   1748   return TargetLowering::getJumpTableEncoding();
   1749 }
   1750 
   1751 const MCExpr *
   1752 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   1753                                              const MachineBasicBlock *MBB,
   1754                                              unsigned uid,MCContext &Ctx) const{
   1755   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
   1756          Subtarget->isPICStyleGOT());
   1757   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   1758   // entries.
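          // The emitted entry then looks roughly like ".long .LBB0_7@GOTOFF" (label
          // name illustrative), i.e. the block address relative to the GOT base.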
   1759   return MCSymbolRefExpr::Create(MBB->getSymbol(),
   1760                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   1761 }
   1762 
   1763 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
   1764 /// jumptable.
   1765 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1766                                                     SelectionDAG &DAG) const {
   1767   if (!Subtarget->is64Bit())
   1768     // This doesn't have SDLoc associated with it, but is not really the
   1769     // same as a Register.
   1770     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
   1771   return Table;
   1772 }
   1773 
   1774 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
   1775 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
   1776 /// MCExpr.
   1777 const MCExpr *X86TargetLowering::
   1778 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   1779                              MCContext &Ctx) const {
   1780   // X86-64 uses RIP relative addressing based on the jump table label.
   1781   if (Subtarget->isPICStyleRIPRel())
   1782     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   1783 
   1784   // Otherwise, the reference is relative to the PIC base.
   1785   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
   1786 }
   1787 
   1788 // FIXME: Why is this routine here? Move to RegInfo!
   1789 std::pair<const TargetRegisterClass*, uint8_t>
   1790 X86TargetLowering::findRepresentativeClass(MVT VT) const{
   1791   const TargetRegisterClass *RRC = nullptr;
   1792   uint8_t Cost = 1;
   1793   switch (VT.SimpleTy) {
   1794   default:
   1795     return TargetLowering::findRepresentativeClass(VT);
   1796   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   1797     RRC = Subtarget->is64Bit() ?
   1798       (const TargetRegisterClass*)&X86::GR64RegClass :
   1799       (const TargetRegisterClass*)&X86::GR32RegClass;
   1800     break;
   1801   case MVT::x86mmx:
   1802     RRC = &X86::VR64RegClass;
   1803     break;
   1804   case MVT::f32: case MVT::f64:
   1805   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   1806   case MVT::v4f32: case MVT::v2f64:
   1807   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   1808   case MVT::v4f64:
   1809     RRC = &X86::VR128RegClass;
   1810     break;
   1811   }
   1812   return std::make_pair(RRC, Cost);
   1813 }
   1814 
   1815 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   1816                                                unsigned &Offset) const {
   1817   if (!Subtarget->isTargetLinux())
   1818     return false;
   1819 
   1820   if (Subtarget->is64Bit()) {
   1821     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
   1822     Offset = 0x28;
   1823     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   1824       AddressSpace = 256;
   1825     else
   1826       AddressSpace = 257;
   1827   } else {
   1828     // %gs:0x14 on i386
   1829     Offset = 0x14;
   1830     AddressSpace = 256;
   1831   }
   1832   return true;
   1833 }
   1834 
   1835 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
   1836                                             unsigned DestAS) const {
   1837   assert(SrcAS != DestAS && "Expected different address spaces!");
   1838 
   1839   return SrcAS < 256 && DestAS < 256;
   1840 }
   1841 
   1842 //===----------------------------------------------------------------------===//
   1843 //               Return Value Calling Convention Implementation
   1844 //===----------------------------------------------------------------------===//
   1845 
   1846 #include "X86GenCallingConv.inc"
   1847 
   1848 bool
   1849 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   1850                                   MachineFunction &MF, bool isVarArg,
   1851                         const SmallVectorImpl<ISD::OutputArg> &Outs,
   1852                         LLVMContext &Context) const {
   1853   SmallVector<CCValAssign, 16> RVLocs;
   1854   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
   1855                  RVLocs, Context);
   1856   return CCInfo.CheckReturn(Outs, RetCC_X86);
   1857 }
   1858 
   1859 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
   1860   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   1861   return ScratchRegs;
   1862 }
   1863 
   1864 SDValue
   1865 X86TargetLowering::LowerReturn(SDValue Chain,
   1866                                CallingConv::ID CallConv, bool isVarArg,
   1867                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1868                                const SmallVectorImpl<SDValue> &OutVals,
   1869                                SDLoc dl, SelectionDAG &DAG) const {
   1870   MachineFunction &MF = DAG.getMachineFunction();
   1871   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1872 
   1873   SmallVector<CCValAssign, 16> RVLocs;
   1874   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
   1875                  RVLocs, *DAG.getContext());
   1876   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   1877 
   1878   SDValue Flag;
   1879   SmallVector<SDValue, 6> RetOps;
   1880   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   1881   // Operand #1 = Bytes To Pop
   1882   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
   1883                    MVT::i16));
   1884 
   1885   // Copy the result values into the output registers.
   1886   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1887     CCValAssign &VA = RVLocs[i];
   1888     assert(VA.isRegLoc() && "Can only return in registers!");
   1889     SDValue ValToCopy = OutVals[i];
   1890     EVT ValVT = ValToCopy.getValueType();
   1891 
   1892     // Promote values to the appropriate types
   1893     if (VA.getLocInfo() == CCValAssign::SExt)
   1894       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1895     else if (VA.getLocInfo() == CCValAssign::ZExt)
   1896       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1897     else if (VA.getLocInfo() == CCValAssign::AExt)
   1898       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1899     else if (VA.getLocInfo() == CCValAssign::BCvt)
   1900       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
   1901 
   1902     assert(VA.getLocInfo() != CCValAssign::FPExt &&
   1903            "Unexpected FP-extend for return value.");
   1904 
   1905     // If this is x86-64, and we disabled SSE, we can't return FP values,
   1906     // or SSE or MMX vectors.
   1907     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   1908          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   1909           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
   1910       report_fatal_error("SSE register return with SSE disabled");
   1911     }
   1912     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   1913     // llvm-gcc has never done it right and no one has noticed, so this
   1914     // should be OK for now.
   1915     if (ValVT == MVT::f64 &&
   1916         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
   1917       report_fatal_error("SSE2 register return with SSE2 disabled");
   1918 
   1919     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   1920     // the RET instruction and handled by the FP Stackifier.
   1921     if (VA.getLocReg() == X86::ST0 ||
   1922         VA.getLocReg() == X86::ST1) {
   1923       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   1924       // change the value to the FP stack register class.
   1925       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   1926         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   1927       RetOps.push_back(ValToCopy);
   1928       // Don't emit a copytoreg.
   1929       continue;
   1930     }
   1931 
   1932     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   1933     // which is returned in RAX / RDX.
   1934     if (Subtarget->is64Bit()) {
   1935       if (ValVT == MVT::x86mmx) {
   1936         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   1937           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
   1938           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   1939                                   ValToCopy);
   1940           // If we don't have SSE2 available, convert to v4f32 so the generated
   1941           // register is legal.
   1942           if (!Subtarget->hasSSE2())
   1943             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
   1944         }
   1945       }
   1946     }
   1947 
   1948     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   1949     Flag = Chain.getValue(1);
   1950     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   1951   }
   1952 
   1953   // The x86-64 ABIs require that for returning structs by value we copy
   1954   // the sret argument into %rax/%eax (depending on ABI) for the return.
   1955   // Win32 requires us to put the sret argument to %eax as well.
   1956   // We saved the argument into a virtual register in the entry block,
   1957   // so now we copy the value out and into %rax/%eax.
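          // For example, for "define void @f(%struct.S* sret %out)" on x86-64 the
          // incoming sret pointer must be returned to the caller in RAX, as the ABI
          // requires.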
   1958   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
   1959       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
   1960     MachineFunction &MF = DAG.getMachineFunction();
   1961     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1962     unsigned Reg = FuncInfo->getSRetReturnReg();
   1963     assert(Reg &&
   1964            "SRetReturnReg should have been set in LowerFormalArguments().");
   1965     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
   1966 
   1967     unsigned RetValReg
   1968         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
   1969           X86::RAX : X86::EAX;
   1970     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
   1971     Flag = Chain.getValue(1);
   1972 
   1973     // RAX/EAX now acts like a return value.
   1974     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
   1975   }
   1976 
   1977   RetOps[0] = Chain;  // Update chain.
   1978 
   1979   // Add the flag if we have it.
   1980   if (Flag.getNode())
   1981     RetOps.push_back(Flag);
   1982 
   1983   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
   1984 }
   1985 
   1986 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   1987   if (N->getNumValues() != 1)
   1988     return false;
   1989   if (!N->hasNUsesOfValue(1, 0))
   1990     return false;
   1991 
   1992   SDValue TCChain = Chain;
   1993   SDNode *Copy = *N->use_begin();
   1994   if (Copy->getOpcode() == ISD::CopyToReg) {
   1995     // If the copy has a glue operand, we conservatively assume it isn't safe to
   1996     // perform a tail call.
   1997     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   1998       return false;
   1999     TCChain = Copy->getOperand(0);
   2000   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   2001     return false;
   2002 
   2003   bool HasRet = false;
   2004   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   2005        UI != UE; ++UI) {
   2006     if (UI->getOpcode() != X86ISD::RET_FLAG)
   2007       return false;
   2008     HasRet = true;
   2009   }
   2010 
   2011   if (!HasRet)
   2012     return false;
   2013 
   2014   Chain = TCChain;
   2015   return true;
   2016 }
   2017 
   2018 MVT
   2019 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
   2020                                             ISD::NodeType ExtendKind) const {
   2021   MVT ReturnMVT;
   2022   // TODO: Is this also valid on 32-bit?
   2023   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
   2024     ReturnMVT = MVT::i8;
   2025   else
   2026     ReturnMVT = MVT::i32;
   2027 
   2028   MVT MinVT = getRegisterType(ReturnMVT);
   2029   return VT.bitsLT(MinVT) ? MinVT : VT;
   2030 }
   2031 
   2032 /// LowerCallResult - Lower the result values of a call into the
   2033 /// appropriate copies out of appropriate physical registers.
   2034 ///
   2035 SDValue
   2036 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   2037                                    CallingConv::ID CallConv, bool isVarArg,
   2038                                    const SmallVectorImpl<ISD::InputArg> &Ins,
   2039                                    SDLoc dl, SelectionDAG &DAG,
   2040                                    SmallVectorImpl<SDValue> &InVals) const {
   2041 
   2042   // Assign locations to each value returned by this call.
   2043   SmallVector<CCValAssign, 16> RVLocs;
   2044   bool Is64Bit = Subtarget->is64Bit();
   2045   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   2046                  DAG.getTarget(), RVLocs, *DAG.getContext());
   2047   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2048 
   2049   // Copy all of the result registers out of their specified physreg.
   2050   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2051     CCValAssign &VA = RVLocs[i];
   2052     EVT CopyVT = VA.getValVT();
   2053 
   2054     // If this is x86-64, and we disabled SSE, we can't return FP values
   2055     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
   2056         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
   2057       report_fatal_error("SSE register return with SSE disabled");
   2058     }
   2059 
   2060     SDValue Val;
   2061 
   2062     // If this is a call to a function that returns an fp value on the floating
   2063     // point stack, we must guarantee the value is popped from the stack, so
   2064     // a CopyFromReg is not good enough - the copy instruction may be eliminated
   2065     // if the return value is not used. We use the FpPOP_RETVAL instruction
   2066     // instead.
   2067     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
   2068       // If we prefer to use the value in xmm registers, copy it out as f80 and
   2069       // use a truncate to move it from fp stack reg to xmm reg.
   2070       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
   2071       SDValue Ops[] = { Chain, InFlag };
   2072       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
   2073                                          MVT::Other, MVT::Glue, Ops), 1);
   2074       Val = Chain.getValue(0);
   2075 
   2076       // Round the f80 to the right size, which also moves it to the appropriate
   2077       // xmm register.
   2078       if (CopyVT != VA.getValVT())
   2079         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   2080                           // This truncation won't change the value.
   2081                           DAG.getIntPtrConstant(1));
   2082     } else {
   2083       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   2084                                  CopyVT, InFlag).getValue(1);
   2085       Val = Chain.getValue(0);
   2086     }
   2087     InFlag = Chain.getValue(2);
   2088     InVals.push_back(Val);
   2089   }
   2090 
   2091   return Chain;
   2092 }
   2093 
   2094 //===----------------------------------------------------------------------===//
   2095 //                C & StdCall & Fast Calling Convention implementation
   2096 //===----------------------------------------------------------------------===//
   2097 //  The StdCall calling convention is the standard convention for many Windows
   2098 //  API routines. It differs from the C calling convention only slightly: the
   2099 //  callee cleans up the stack rather than the caller, and symbols are
   2100 //  decorated (name-mangled). It does not support vector arguments.
   2101 //  For info on the fast calling convention see the Fast Calling Convention
   2102 //  (tail call) implementation in LowerX86_32FastCCCallTo.
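        //  For example, "int __stdcall foo(int, int, int)" is decorated as _foo@12
        //  and returns with "ret 12" so that the callee pops its own 12 bytes of
        //  arguments.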
   2103 
   2104 /// callIsStructReturn - Determines whether a call uses struct return
   2105 /// semantics.
   2106 enum StructReturnType {
   2107   NotStructReturn,
   2108   RegStructReturn,
   2109   StackStructReturn
   2110 };
   2111 static StructReturnType
   2112 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   2113   if (Outs.empty())
   2114     return NotStructReturn;
   2115 
   2116   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   2117   if (!Flags.isSRet())
   2118     return NotStructReturn;
   2119   if (Flags.isInReg())
   2120     return RegStructReturn;
   2121   return StackStructReturn;
   2122 }
   2123 
   2124 /// argsAreStructReturn - Determines whether a function uses struct
   2125 /// return semantics.
   2126 static StructReturnType
   2127 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   2128   if (Ins.empty())
   2129     return NotStructReturn;
   2130 
   2131   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   2132   if (!Flags.isSRet())
   2133     return NotStructReturn;
   2134   if (Flags.isInReg())
   2135     return RegStructReturn;
   2136   return StackStructReturn;
   2137 }
   2138 
   2139 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
   2140 /// specified by "Src" to the address "Dst" with the size and alignment
   2141 /// information specified by the parameter attribute. The copy will be passed
   2142 /// as a byval function parameter.
   2143 static SDValue
   2144 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   2145                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
   2146                           SDLoc dl) {
   2147   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   2148 
   2149   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   2150                        /*isVolatile*/false, /*AlwaysInline=*/true,
   2151                        MachinePointerInfo(), MachinePointerInfo());
   2152 }
   2153 
   2154 /// IsTailCallConvention - Return true if the calling convention is one that
   2155 /// supports tail call optimization.
   2156 static bool IsTailCallConvention(CallingConv::ID CC) {
   2157   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
   2158           CC == CallingConv::HiPE);
   2159 }
   2160 
   2161 /// \brief Return true if the calling convention is a C calling convention.
   2162 static bool IsCCallConvention(CallingConv::ID CC) {
   2163   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
   2164           CC == CallingConv::X86_64_SysV);
   2165 }
   2166 
   2167 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   2168   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
   2169     return false;
   2170 
   2171   CallSite CS(CI);
   2172   CallingConv::ID CalleeCC = CS.getCallingConv();
   2173   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
   2174     return false;
   2175 
   2176   return true;
   2177 }
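
         // For illustration only: this hook is consulted for IR calls already marked
         // 'tail', such as the hypothetical
         //
         //     %r = tail call i32 @callee(i32 %x)
         //
         // It only filters on the calling convention and the DisableTailCalls target
         // option; the full eligibility check is done later in
         // IsEligibleForTailCallOptimization.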
   2178 
   2179 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
   2180 /// a tailcall target by changing its ABI.
   2181 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
   2182                                    bool GuaranteedTailCallOpt) {
   2183   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
   2184 }
   2185 
   2186 SDValue
   2187 X86TargetLowering::LowerMemArgument(SDValue Chain,
   2188                                     CallingConv::ID CallConv,
   2189                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2190                                     SDLoc dl, SelectionDAG &DAG,
   2191                                     const CCValAssign &VA,
   2192                                     MachineFrameInfo *MFI,
   2193                                     unsigned i) const {
   2194   // Create the nodes corresponding to a load from this parameter slot.
   2195   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   2196   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
   2197       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   2198   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   2199   EVT ValVT;
   2200 
   2201   // If value is passed by pointer we have address passed instead of the value
   2202   // itself.
   2203   if (VA.getLocInfo() == CCValAssign::Indirect)
   2204     ValVT = VA.getLocVT();
   2205   else
   2206     ValVT = VA.getValVT();
   2207 
   2208   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   2209   // changed with more analysis.
    2210   // In case of tail call optimization, mark all arguments mutable, since they
    2211   // could be overwritten when the arguments of a tail call are lowered.
   2212   if (Flags.isByVal()) {
   2213     unsigned Bytes = Flags.getByValSize();
   2214     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   2215     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   2216     return DAG.getFrameIndex(FI, getPointerTy());
   2217   } else {
   2218     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   2219                                     VA.getLocMemOffset(), isImmutable);
   2220     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
   2221     return DAG.getLoad(ValVT, dl, Chain, FIN,
   2222                        MachinePointerInfo::getFixedStack(FI),
   2223                        false, false, false, 0);
   2224   }
   2225 }
   2226 
   2227 SDValue
   2228 X86TargetLowering::LowerFormalArguments(SDValue Chain,
   2229                                         CallingConv::ID CallConv,
   2230                                         bool isVarArg,
   2231                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   2232                                         SDLoc dl,
   2233                                         SelectionDAG &DAG,
   2234                                         SmallVectorImpl<SDValue> &InVals)
   2235                                           const {
   2236   MachineFunction &MF = DAG.getMachineFunction();
   2237   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2238 
   2239   const Function* Fn = MF.getFunction();
   2240   if (Fn->hasExternalLinkage() &&
   2241       Subtarget->isTargetCygMing() &&
   2242       Fn->getName() == "main")
   2243     FuncInfo->setForceFramePointer(true);
   2244 
   2245   MachineFrameInfo *MFI = MF.getFrameInfo();
   2246   bool Is64Bit = Subtarget->is64Bit();
   2247   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
   2248 
   2249   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2250          "Var args not supported with calling convention fastcc, ghc or hipe");
   2251 
   2252   // Assign locations to all of the incoming arguments.
   2253   SmallVector<CCValAssign, 16> ArgLocs;
   2254   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
   2255                  ArgLocs, *DAG.getContext());
   2256 
   2257   // Allocate shadow area for Win64
   2258   if (IsWin64)
   2259     CCInfo.AllocateStack(32, 8);
   2260 
   2261   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   2262 
   2263   unsigned LastVal = ~0U;
   2264   SDValue ArgValue;
   2265   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2266     CCValAssign &VA = ArgLocs[i];
   2267     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   2268     // places.
   2269     assert(VA.getValNo() != LastVal &&
   2270            "Don't support value assigned to multiple locs yet");
   2271     (void)LastVal;
   2272     LastVal = VA.getValNo();
   2273 
   2274     if (VA.isRegLoc()) {
   2275       EVT RegVT = VA.getLocVT();
   2276       const TargetRegisterClass *RC;
   2277       if (RegVT == MVT::i32)
   2278         RC = &X86::GR32RegClass;
   2279       else if (Is64Bit && RegVT == MVT::i64)
   2280         RC = &X86::GR64RegClass;
   2281       else if (RegVT == MVT::f32)
   2282         RC = &X86::FR32RegClass;
   2283       else if (RegVT == MVT::f64)
   2284         RC = &X86::FR64RegClass;
   2285       else if (RegVT.is512BitVector())
   2286         RC = &X86::VR512RegClass;
   2287       else if (RegVT.is256BitVector())
   2288         RC = &X86::VR256RegClass;
   2289       else if (RegVT.is128BitVector())
   2290         RC = &X86::VR128RegClass;
   2291       else if (RegVT == MVT::x86mmx)
   2292         RC = &X86::VR64RegClass;
   2293       else if (RegVT == MVT::i1)
   2294         RC = &X86::VK1RegClass;
   2295       else if (RegVT == MVT::v8i1)
   2296         RC = &X86::VK8RegClass;
   2297       else if (RegVT == MVT::v16i1)
   2298         RC = &X86::VK16RegClass;
   2299       else
   2300         llvm_unreachable("Unknown argument type!");
   2301 
   2302       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   2303       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   2304 
   2305       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   2306       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   2307       // right size.
   2308       if (VA.getLocInfo() == CCValAssign::SExt)
   2309         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   2310                                DAG.getValueType(VA.getValVT()));
   2311       else if (VA.getLocInfo() == CCValAssign::ZExt)
   2312         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   2313                                DAG.getValueType(VA.getValVT()));
   2314       else if (VA.getLocInfo() == CCValAssign::BCvt)
   2315         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
   2316 
   2317       if (VA.isExtInLoc()) {
   2318         // Handle MMX values passed in XMM regs.
   2319         if (RegVT.isVector())
   2320           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
   2321         else
   2322           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   2323       }
   2324     } else {
   2325       assert(VA.isMemLoc());
   2326       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   2327     }
   2328 
   2329     // If value is passed via pointer - do a load.
   2330     if (VA.getLocInfo() == CCValAssign::Indirect)
   2331       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   2332                              MachinePointerInfo(), false, false, false, 0);
   2333 
   2334     InVals.push_back(ArgValue);
   2335   }
   2336 
   2337   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
   2338     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2339       // The x86-64 ABIs require that for returning structs by value we copy
   2340       // the sret argument into %rax/%eax (depending on ABI) for the return.
    2341       // Win32 requires us to put the sret argument in %eax as well.
   2342       // Save the argument into a virtual register so that we can access it
   2343       // from the return points.
   2344       if (Ins[i].Flags.isSRet()) {
   2345         unsigned Reg = FuncInfo->getSRetReturnReg();
   2346         if (!Reg) {
   2347           MVT PtrTy = getPointerTy();
   2348           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
   2349           FuncInfo->setSRetReturnReg(Reg);
   2350         }
   2351         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
   2352         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   2353         break;
   2354       }
   2355     }
   2356   }
   2357 
   2358   unsigned StackSize = CCInfo.getNextStackOffset();
   2359   // Align stack specially for tail calls.
   2360   if (FuncIsMadeTailCallSafe(CallConv,
   2361                              MF.getTarget().Options.GuaranteedTailCallOpt))
   2362     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   2363 
    2364   // If the function takes a variable number of arguments, make a frame index for
   2365   // the start of the first vararg value... for expansion of llvm.va_start.
   2366   if (isVarArg) {
   2367     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   2368                     CallConv != CallingConv::X86_ThisCall)) {
   2369       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
   2370     }
   2371     if (Is64Bit) {
   2372       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
   2373 
   2374       // FIXME: We should really autogenerate these arrays
   2375       static const MCPhysReg GPR64ArgRegsWin64[] = {
   2376         X86::RCX, X86::RDX, X86::R8,  X86::R9
   2377       };
   2378       static const MCPhysReg GPR64ArgRegs64Bit[] = {
   2379         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   2380       };
   2381       static const MCPhysReg XMMArgRegs64Bit[] = {
   2382         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2383         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2384       };
   2385       const MCPhysReg *GPR64ArgRegs;
   2386       unsigned NumXMMRegs = 0;
   2387 
   2388       if (IsWin64) {
   2389         // The XMM registers which might contain var arg parameters are shadowed
   2390         // in their paired GPR.  So we only need to save the GPR to their home
   2391         // slots.
   2392         TotalNumIntRegs = 4;
   2393         GPR64ArgRegs = GPR64ArgRegsWin64;
   2394       } else {
   2395         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
   2396         GPR64ArgRegs = GPR64ArgRegs64Bit;
   2397 
   2398         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
   2399                                                 TotalNumXMMRegs);
   2400       }
   2401       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
   2402                                                        TotalNumIntRegs);
   2403 
   2404       bool NoImplicitFloatOps = Fn->getAttributes().
   2405         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
   2406       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
   2407              "SSE register cannot be used when SSE is disabled!");
   2408       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
   2409                NoImplicitFloatOps) &&
   2410              "SSE register cannot be used when SSE is disabled!");
   2411       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
   2412           !Subtarget->hasSSE1())
   2413         // Kernel mode asks for SSE to be disabled, so don't push them
   2414         // on the stack.
   2415         TotalNumXMMRegs = 0;
   2416 
   2417       if (IsWin64) {
   2418         const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
   2419         // Get to the caller-allocated home save location.  Add 8 to account
   2420         // for the return address.
   2421         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   2422         FuncInfo->setRegSaveFrameIndex(
   2423           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   2424         // Fixup to set vararg frame on shadow area (4 x i64).
   2425         if (NumIntRegs < 4)
   2426           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   2427       } else {
   2428         // For X86-64, if there are vararg parameters that are passed via
   2429         // registers, then we must store them to their spots on the stack so
   2430         // they may be loaded by deferencing the result of va_next.
   2431         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   2432         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
   2433         FuncInfo->setRegSaveFrameIndex(
   2434           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
   2435                                false));
   2436       }
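
               // Sizing sketch for the non-Win64 path above, shown for illustration and
               // derived from the constants in this function: with TotalNumIntRegs = 6 and
               // TotalNumXMMRegs = 8 the register save area is
               //   6 * 8 + 8 * 16 = 48 + 128 = 176 bytes,
               // with the GP portion at offsets [0, 48) and the XMM portion at offsets
               // [48, 176), matching the GP/FP offsets recorded just above.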
   2437 
   2438       // Store the integer parameter registers.
   2439       SmallVector<SDValue, 8> MemOps;
   2440       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2441                                         getPointerTy());
   2442       unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2443       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
   2444         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
   2445                                   DAG.getIntPtrConstant(Offset));
   2446         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
   2447                                      &X86::GR64RegClass);
   2448         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
   2449         SDValue Store =
   2450           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2451                        MachinePointerInfo::getFixedStack(
   2452                          FuncInfo->getRegSaveFrameIndex(), Offset),
   2453                        false, false, 0);
   2454         MemOps.push_back(Store);
   2455         Offset += 8;
   2456       }
   2457 
   2458       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
   2459         // Now store the XMM (fp + vector) parameter registers.
   2460         SmallVector<SDValue, 11> SaveXMMOps;
   2461         SaveXMMOps.push_back(Chain);
   2462 
   2463         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2464         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
   2465         SaveXMMOps.push_back(ALVal);
   2466 
   2467         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2468                                FuncInfo->getRegSaveFrameIndex()));
   2469         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2470                                FuncInfo->getVarArgsFPOffset()));
   2471 
   2472         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
   2473           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
   2474                                        &X86::VR128RegClass);
   2475           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
   2476           SaveXMMOps.push_back(Val);
   2477         }
   2478         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2479                                      MVT::Other, SaveXMMOps));
   2480       }
   2481 
   2482       if (!MemOps.empty())
   2483         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   2484     }
   2485   }
   2486 
   2487   // Some CCs need callee pop.
   2488   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2489                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2490     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2491   } else {
   2492     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2493     // If this is an sret function, the return should pop the hidden pointer.
   2494     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
   2495         !Subtarget->getTargetTriple().isOSMSVCRT() &&
   2496         argsAreStructReturn(Ins) == StackStructReturn)
   2497       FuncInfo->setBytesToPopOnReturn(4);
   2498   }
   2499 
   2500   if (!Is64Bit) {
   2501     // RegSaveFrameIndex is X86-64 only.
   2502     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   2503     if (CallConv == CallingConv::X86_FastCall ||
   2504         CallConv == CallingConv::X86_ThisCall)
   2505       // fastcc functions can't have varargs.
   2506       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2507   }
   2508 
   2509   FuncInfo->setArgumentStackSize(StackSize);
   2510 
   2511   return Chain;
   2512 }
   2513 
   2514 SDValue
   2515 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
   2516                                     SDValue StackPtr, SDValue Arg,
   2517                                     SDLoc dl, SelectionDAG &DAG,
   2518                                     const CCValAssign &VA,
   2519                                     ISD::ArgFlagsTy Flags) const {
   2520   unsigned LocMemOffset = VA.getLocMemOffset();
   2521   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   2522   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
   2523   if (Flags.isByVal())
   2524     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2525 
   2526   return DAG.getStore(Chain, dl, Arg, PtrOff,
   2527                       MachinePointerInfo::getStack(LocMemOffset),
   2528                       false, false, 0);
   2529 }
   2530 
   2531 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
   2532 /// optimization is performed and it is required.
   2533 SDValue
   2534 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   2535                                            SDValue &OutRetAddr, SDValue Chain,
   2536                                            bool IsTailCall, bool Is64Bit,
   2537                                            int FPDiff, SDLoc dl) const {
   2538   // Adjust the Return address stack slot.
   2539   EVT VT = getPointerTy();
   2540   OutRetAddr = getReturnAddressFrameIndex(DAG);
   2541 
   2542   // Load the "old" Return address.
   2543   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   2544                            false, false, false, 0);
   2545   return SDValue(OutRetAddr.getNode(), 1);
   2546 }
   2547 
   2548 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
   2549 /// optimization is performed and it is required (FPDiff!=0).
   2550 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
   2551                                         SDValue Chain, SDValue RetAddrFrIdx,
   2552                                         EVT PtrVT, unsigned SlotSize,
   2553                                         int FPDiff, SDLoc dl) {
   2554   // Store the return address to the appropriate stack slot.
   2555   if (!FPDiff) return Chain;
   2556   // Calculate the new stack slot for the return address.
   2557   int NewReturnAddrFI =
   2558     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
   2559                                          false);
   2560   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   2561   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   2562                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
   2563                        false, false, 0);
   2564   return Chain;
   2565 }
   2566 
   2567 SDValue
   2568 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   2569                              SmallVectorImpl<SDValue> &InVals) const {
   2570   SelectionDAG &DAG                     = CLI.DAG;
   2571   SDLoc &dl                             = CLI.DL;
   2572   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   2573   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
   2574   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   2575   SDValue Chain                         = CLI.Chain;
   2576   SDValue Callee                        = CLI.Callee;
   2577   CallingConv::ID CallConv              = CLI.CallConv;
   2578   bool &isTailCall                      = CLI.IsTailCall;
   2579   bool isVarArg                         = CLI.IsVarArg;
   2580 
   2581   MachineFunction &MF = DAG.getMachineFunction();
   2582   bool Is64Bit        = Subtarget->is64Bit();
   2583   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
   2584   StructReturnType SR = callIsStructReturn(Outs);
   2585   bool IsSibcall      = false;
   2586 
   2587   if (MF.getTarget().Options.DisableTailCalls)
   2588     isTailCall = false;
   2589 
   2590   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
   2591   if (IsMustTail) {
   2592     // Force this to be a tail call.  The verifier rules are enough to ensure
   2593     // that we can lower this successfully without moving the return address
   2594     // around.
   2595     isTailCall = true;
   2596   } else if (isTailCall) {
   2597     // Check if it's really possible to do a tail call.
   2598     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   2599                     isVarArg, SR != NotStructReturn,
   2600                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
   2601                     Outs, OutVals, Ins, DAG);
   2602 
   2603     // Sibcalls are automatically detected tailcalls which do not require
   2604     // ABI changes.
   2605     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   2606       IsSibcall = true;
   2607 
   2608     if (isTailCall)
   2609       ++NumTailCalls;
   2610   }
   2611 
   2612   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2613          "Var args not supported with calling convention fastcc, ghc or hipe");
   2614 
   2615   // Analyze operands of the call, assigning locations to each operand.
   2616   SmallVector<CCValAssign, 16> ArgLocs;
   2617   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
   2618                  ArgLocs, *DAG.getContext());
   2619 
   2620   // Allocate shadow area for Win64
   2621   if (IsWin64)
   2622     CCInfo.AllocateStack(32, 8);
   2623 
   2624   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2625 
   2626   // Get a count of how many bytes are to be pushed on the stack.
   2627   unsigned NumBytes = CCInfo.getNextStackOffset();
   2628   if (IsSibcall)
    2629     // This is a sibcall. The outgoing memory operands reuse the incoming
    2630     // argument area in our caller's stack, so no new stack space is needed.
   2631     NumBytes = 0;
   2632   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
   2633            IsTailCallConvention(CallConv))
   2634     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   2635 
   2636   int FPDiff = 0;
   2637   if (isTailCall && !IsSibcall && !IsMustTail) {
   2638     // Lower arguments at fp - stackoffset + fpdiff.
   2639     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   2640     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
   2641 
   2642     FPDiff = NumBytesCallerPushed - NumBytes;
   2643 
    2644     // Record the delta by which the return-address stack slot moves, but only
    2645     // if the movement is larger than any previously recorded delta.
   2646     if (FPDiff < X86Info->getTCReturnAddrDelta())
   2647       X86Info->setTCReturnAddrDelta(FPDiff);
   2648   }
   2649 
   2650   unsigned NumBytesToPush = NumBytes;
   2651   unsigned NumBytesToPop = NumBytes;
   2652 
   2653   // If we have an inalloca argument, all stack space has already been allocated
    2654   // for us and is right at the top of the stack.  We don't support multiple
   2655   // arguments passed in memory when using inalloca.
   2656   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
   2657     NumBytesToPush = 0;
   2658     assert(ArgLocs.back().getLocMemOffset() == 0 &&
   2659            "an inalloca argument must be the only memory argument");
   2660   }
   2661 
   2662   if (!IsSibcall)
   2663     Chain = DAG.getCALLSEQ_START(
   2664         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
   2665 
   2666   SDValue RetAddrFrIdx;
   2667   // Load return address for tail calls.
   2668   if (isTailCall && FPDiff)
   2669     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   2670                                     Is64Bit, FPDiff, dl);
   2671 
   2672   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2673   SmallVector<SDValue, 8> MemOpChains;
   2674   SDValue StackPtr;
   2675 
   2676   // Walk the register/memloc assignments, inserting copies/loads.  In the case
    2677   // of tail call optimization, arguments are handled later.
   2678   const X86RegisterInfo *RegInfo =
   2679     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   2680   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2681     // Skip inalloca arguments, they have already been written.
   2682     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2683     if (Flags.isInAlloca())
   2684       continue;
   2685 
   2686     CCValAssign &VA = ArgLocs[i];
   2687     EVT RegVT = VA.getLocVT();
   2688     SDValue Arg = OutVals[i];
   2689     bool isByVal = Flags.isByVal();
   2690 
   2691     // Promote the value if needed.
   2692     switch (VA.getLocInfo()) {
   2693     default: llvm_unreachable("Unknown loc info!");
   2694     case CCValAssign::Full: break;
   2695     case CCValAssign::SExt:
   2696       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   2697       break;
   2698     case CCValAssign::ZExt:
   2699       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   2700       break;
   2701     case CCValAssign::AExt:
   2702       if (RegVT.is128BitVector()) {
   2703         // Special case: passing MMX values in XMM registers.
   2704         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
   2705         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   2706         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   2707       } else
   2708         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   2709       break;
   2710     case CCValAssign::BCvt:
   2711       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
   2712       break;
   2713     case CCValAssign::Indirect: {
   2714       // Store the argument.
   2715       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   2716       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   2717       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
   2718                            MachinePointerInfo::getFixedStack(FI),
   2719                            false, false, 0);
   2720       Arg = SpillSlot;
   2721       break;
   2722     }
   2723     }
   2724 
   2725     if (VA.isRegLoc()) {
   2726       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   2727       if (isVarArg && IsWin64) {
    2728         // The Win64 ABI requires an argument in an XMM reg to also be copied to
    2729         // the corresponding shadow GPR if the callee is a varargs function.
   2730         unsigned ShadowReg = 0;
   2731         switch (VA.getLocReg()) {
   2732         case X86::XMM0: ShadowReg = X86::RCX; break;
   2733         case X86::XMM1: ShadowReg = X86::RDX; break;
   2734         case X86::XMM2: ShadowReg = X86::R8; break;
   2735         case X86::XMM3: ShadowReg = X86::R9; break;
   2736         }
   2737         if (ShadowReg)
   2738           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   2739       }
   2740     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   2741       assert(VA.isMemLoc());
   2742       if (!StackPtr.getNode())
   2743         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   2744                                       getPointerTy());
   2745       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   2746                                              dl, DAG, VA, Flags));
   2747     }
   2748   }
   2749 
   2750   if (!MemOpChains.empty())
   2751     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
   2752 
   2753   if (Subtarget->isPICStyleGOT()) {
   2754     // ELF / PIC requires GOT in the EBX register before function calls via PLT
   2755     // GOT pointer.
   2756     if (!isTailCall) {
   2757       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
   2758                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
   2759     } else {
   2760       // If we are tail calling and generating PIC/GOT style code load the
   2761       // address of the callee into ECX. The value in ecx is used as target of
   2762       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   2763       // for tail calls on PIC/GOT architectures. Normally we would just put the
   2764       // address of GOT into ebx and then call target@PLT. But for tail calls
   2765       // ebx would be restored (since ebx is callee saved) before jumping to the
   2766       // target@PLT.
   2767 
   2768       // Note: The actual moving to ECX is done further down.
   2769       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   2770       if (G && !G->getGlobal()->hasHiddenVisibility() &&
   2771           !G->getGlobal()->hasProtectedVisibility())
   2772         Callee = LowerGlobalAddress(Callee, DAG);
   2773       else if (isa<ExternalSymbolSDNode>(Callee))
   2774         Callee = LowerExternalSymbol(Callee, DAG);
   2775     }
   2776   }
   2777 
   2778   if (Is64Bit && isVarArg && !IsWin64) {
   2779     // From AMD64 ABI document:
   2780     // For calls that may call functions that use varargs or stdargs
   2781     // (prototype-less calls or calls to functions containing ellipsis (...) in
    2782     // the declaration) %al is used as a hidden argument to specify the number
    2783     // of SSE registers used. The contents of %al do not need to match exactly
    2784     // the number of registers, but must be an upper bound on the number of SSE
    2785     // registers used, and must be in the range 0 - 8 inclusive.
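             //
             // For example (illustrative only): a call like printf("%f\n", x) passing one
             // double in %xmm0 may set %al to 1; any value that is an upper bound (up to
             // 8) on the XMM registers actually used would be equally valid.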
   2786 
   2787     // Count the number of XMM registers allocated.
   2788     static const MCPhysReg XMMArgRegs[] = {
   2789       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2790       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2791     };
   2792     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
   2793     assert((Subtarget->hasSSE1() || !NumXMMRegs)
   2794            && "SSE registers cannot be used when SSE is disabled");
   2795 
   2796     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
   2797                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   2798   }
   2799 
   2800   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   2801   // don't need this because the eligibility check rejects calls that require
   2802   // shuffling arguments passed in memory.
   2803   if (!IsSibcall && isTailCall) {
   2804     // Force all the incoming stack arguments to be loaded from the stack
   2805     // before any new outgoing arguments are stored to the stack, because the
   2806     // outgoing stack slots may alias the incoming argument stack slots, and
   2807     // the alias isn't otherwise explicit. This is slightly more conservative
   2808     // than necessary, because it means that each store effectively depends
   2809     // on every argument instead of just those arguments it would clobber.
   2810     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   2811 
   2812     SmallVector<SDValue, 8> MemOpChains2;
   2813     SDValue FIN;
   2814     int FI = 0;
   2815     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2816       CCValAssign &VA = ArgLocs[i];
   2817       if (VA.isRegLoc())
   2818         continue;
   2819       assert(VA.isMemLoc());
   2820       SDValue Arg = OutVals[i];
   2821       ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2822       // Skip inalloca arguments.  They don't require any work.
   2823       if (Flags.isInAlloca())
   2824         continue;
   2825       // Create frame index.
   2826       int32_t Offset = VA.getLocMemOffset()+FPDiff;
   2827       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   2828       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   2829       FIN = DAG.getFrameIndex(FI, getPointerTy());
   2830 
   2831       if (Flags.isByVal()) {
   2832         // Copy relative to framepointer.
   2833         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
   2834         if (!StackPtr.getNode())
   2835           StackPtr = DAG.getCopyFromReg(Chain, dl,
   2836                                         RegInfo->getStackRegister(),
   2837                                         getPointerTy());
   2838         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
   2839 
   2840         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   2841                                                          ArgChain,
   2842                                                          Flags, DAG, dl));
   2843       } else {
   2844         // Store relative to framepointer.
   2845         MemOpChains2.push_back(
   2846           DAG.getStore(ArgChain, dl, Arg, FIN,
   2847                        MachinePointerInfo::getFixedStack(FI),
   2848                        false, false, 0));
   2849       }
   2850     }
   2851 
   2852     if (!MemOpChains2.empty())
   2853       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
   2854 
   2855     // Store the return address to the appropriate stack slot.
   2856     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
   2857                                      getPointerTy(), RegInfo->getSlotSize(),
   2858                                      FPDiff, dl);
   2859   }
   2860 
   2861   // Build a sequence of copy-to-reg nodes chained together with token chain
   2862   // and flag operands which copy the outgoing args into registers.
   2863   SDValue InFlag;
   2864   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2865     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2866                              RegsToPass[i].second, InFlag);
   2867     InFlag = Chain.getValue(1);
   2868   }
   2869 
   2870   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
   2871     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   2872     // In the 64-bit large code model, we have to make all calls
   2873     // through a register, since the call instruction's 32-bit
   2874     // pc-relative offset may not be large enough to hold the whole
   2875     // address.
   2876   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   2877     // If the callee is a GlobalAddress node (quite common, every direct call
   2878     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   2879     // it.
   2880 
   2881     // We should use extra load for direct calls to dllimported functions in
   2882     // non-JIT mode.
   2883     const GlobalValue *GV = G->getGlobal();
   2884     if (!GV->hasDLLImportStorageClass()) {
   2885       unsigned char OpFlags = 0;
   2886       bool ExtraLoad = false;
   2887       unsigned WrapperKind = ISD::DELETED_NODE;
   2888 
   2889       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
    2890       // external symbols must go through the PLT in PIC mode.  If the symbol
   2891       // has hidden or protected visibility, or if it is static or local, then
   2892       // we don't need to use the PLT - we can directly call it.
   2893       if (Subtarget->isTargetELF() &&
   2894           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
   2895           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
   2896         OpFlags = X86II::MO_PLT;
   2897       } else if (Subtarget->isPICStyleStubAny() &&
   2898                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
   2899                  (!Subtarget->getTargetTriple().isMacOSX() ||
   2900                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2901         // PC-relative references to external symbols should go through $stub,
   2902         // unless we're building with the leopard linker or later, which
   2903         // automatically synthesizes these stubs.
   2904         OpFlags = X86II::MO_DARWIN_STUB;
   2905       } else if (Subtarget->isPICStyleRIPRel() &&
   2906                  isa<Function>(GV) &&
   2907                  cast<Function>(GV)->getAttributes().
   2908                    hasAttribute(AttributeSet::FunctionIndex,
   2909                                 Attribute::NonLazyBind)) {
   2910         // If the function is marked as non-lazy, generate an indirect call
   2911         // which loads from the GOT directly. This avoids runtime overhead
   2912         // at the cost of eager binding (and one extra byte of encoding).
   2913         OpFlags = X86II::MO_GOTPCREL;
   2914         WrapperKind = X86ISD::WrapperRIP;
   2915         ExtraLoad = true;
   2916       }
   2917 
   2918       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
   2919                                           G->getOffset(), OpFlags);
   2920 
   2921       // Add a wrapper if needed.
   2922       if (WrapperKind != ISD::DELETED_NODE)
   2923         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
   2924       // Add extra indirection if needed.
   2925       if (ExtraLoad)
   2926         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
   2927                              MachinePointerInfo::getGOT(),
   2928                              false, false, false, 0);
   2929     }
   2930   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   2931     unsigned char OpFlags = 0;
   2932 
   2933     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
   2934     // external symbols should go through the PLT.
   2935     if (Subtarget->isTargetELF() &&
   2936         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
   2937       OpFlags = X86II::MO_PLT;
   2938     } else if (Subtarget->isPICStyleStubAny() &&
   2939                (!Subtarget->getTargetTriple().isMacOSX() ||
   2940                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2941       // PC-relative references to external symbols should go through $stub,
   2942       // unless we're building with the leopard linker or later, which
   2943       // automatically synthesizes these stubs.
   2944       OpFlags = X86II::MO_DARWIN_STUB;
   2945     }
   2946 
   2947     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
   2948                                          OpFlags);
   2949   }
   2950 
   2951   // Returns a chain & a flag for retval copy to use.
   2952   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   2953   SmallVector<SDValue, 8> Ops;
   2954 
   2955   if (!IsSibcall && isTailCall) {
   2956     Chain = DAG.getCALLSEQ_END(Chain,
   2957                                DAG.getIntPtrConstant(NumBytesToPop, true),
   2958                                DAG.getIntPtrConstant(0, true), InFlag, dl);
   2959     InFlag = Chain.getValue(1);
   2960   }
   2961 
   2962   Ops.push_back(Chain);
   2963   Ops.push_back(Callee);
   2964 
   2965   if (isTailCall)
   2966     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
   2967 
   2968   // Add argument registers to the end of the list so that they are known live
   2969   // into the call.
   2970   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   2971     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   2972                                   RegsToPass[i].second.getValueType()));
   2973 
   2974   // Add a register mask operand representing the call-preserved registers.
   2975   const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
   2976   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   2977   assert(Mask && "Missing call preserved mask for calling convention");
   2978   Ops.push_back(DAG.getRegisterMask(Mask));
   2979 
   2980   if (InFlag.getNode())
   2981     Ops.push_back(InFlag);
   2982 
   2983   if (isTailCall) {
   2984     // We used to do:
   2985     //// If this is the first return lowered for this function, add the regs
   2986     //// to the liveout set for the function.
   2987     // This isn't right, although it's probably harmless on x86; liveouts
   2988     // should be computed from returns not tail calls.  Consider a void
   2989     // function making a tail call to a function returning int.
   2990     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   2991   }
   2992 
   2993   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   2994   InFlag = Chain.getValue(1);
   2995 
   2996   // Create the CALLSEQ_END node.
   2997   unsigned NumBytesForCalleeToPop;
   2998   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2999                        DAG.getTarget().Options.GuaranteedTailCallOpt))
   3000     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
   3001   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
   3002            !Subtarget->getTargetTriple().isOSMSVCRT() &&
   3003            SR == StackStructReturn)
   3004     // If this is a call to a struct-return function, the callee
   3005     // pops the hidden struct pointer, so we have to push it back.
   3006     // This is common for Darwin/X86, Linux & Mingw32 targets.
   3007     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   3008     NumBytesForCalleeToPop = 4;
   3009   else
   3010     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
   3011 
   3012   // Returns a flag for retval copy to use.
   3013   if (!IsSibcall) {
   3014     Chain = DAG.getCALLSEQ_END(Chain,
   3015                                DAG.getIntPtrConstant(NumBytesToPop, true),
   3016                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
   3017                                                      true),
   3018                                InFlag, dl);
   3019     InFlag = Chain.getValue(1);
   3020   }
   3021 
   3022   // Handle result values, copying them out of physregs into vregs that we
   3023   // return.
   3024   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   3025                          Ins, dl, DAG, InVals);
   3026 }
   3027 
   3028 //===----------------------------------------------------------------------===//
   3029 //                Fast Calling Convention (tail call) implementation
   3030 //===----------------------------------------------------------------------===//
   3031 
    3032 //  Like StdCall, the callee cleans up the arguments, except that ECX is
    3033 //  reserved for storing the address of the tail-called function. Only two
    3034 //  registers are free for argument passing (inreg). Tail call optimization
    3035 //  is performed provided:
    3036 //                * tailcallopt is enabled
    3037 //                * caller/callee are fastcc
    3038 //  On X86_64 with GOT-style position independent code, only local
    3039 //  (within-module) calls are supported at the moment.
    3040 //  To keep the stack aligned according to the platform ABI, the function
    3041 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
    3042 //  multiple of the stack alignment (dynamic linkers need this - darwin's
    3043 //  dyld, for example).
    3044 //  If a tail-called callee has more arguments than the caller, the caller must
    3045 //  make room to move the RETADDR, by reserving an area the size of the argument
    3046 //  delta right after the original RETADDR, but before the saved frame pointer
    3047 //  or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
   3048 //  stack layout:
   3049 //    arg1
   3050 //    arg2
   3051 //    RETADDR
   3052 //    [ new RETADDR
   3053 //      move area ]
   3054 //    (possible EBP)
   3055 //    ESI
   3056 //    EDI
   3057 //    local1 ..
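         //
         //  A minimal numeric illustration (hypothetical sizes, x86-32 with -tailcallopt
         //  and fastcc): if the caller's own argument area is 12 bytes and the callee
         //  needs 28 bytes (both already of the 16n + 12 form produced by
         //  GetAlignedArgumentStackSize), then FPDiff = 12 - 28 = -16, so the RETADDR is
         //  moved 16 bytes further down the stack and a 16-byte move area is reserved.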
   3058 
    3059 /// GetAlignedArgumentStackSize - Align the stack argument size so that it is,
    3060 /// e.g., 16n + 12 for a 16-byte alignment requirement and a 4-byte slot size.
   3061 unsigned
   3062 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   3063                                                SelectionDAG& DAG) const {
   3064   MachineFunction &MF = DAG.getMachineFunction();
   3065   const TargetMachine &TM = MF.getTarget();
   3066   const X86RegisterInfo *RegInfo =
   3067     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
   3068   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   3069   unsigned StackAlignment = TFI.getStackAlignment();
   3070   uint64_t AlignMask = StackAlignment - 1;
   3071   int64_t Offset = StackSize;
   3072   unsigned SlotSize = RegInfo->getSlotSize();
   3073   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    3074     // Remainder fits below (StackAlignment - SlotSize); just add the difference.
   3075     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   3076   } else {
    3077     // Mask out the lower bits, add StackAlignment once plus (StackAlignment - SlotSize).
   3078     Offset = ((~AlignMask) & Offset) + StackAlignment +
   3079       (StackAlignment-SlotSize);
   3080   }
   3081   return Offset;
   3082 }
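
         // Worked example (illustrative, assuming StackAlignment = 16 and a 4-byte slot
         // size as on x86-32): AlignMask = 15 and StackAlignment - SlotSize = 12.
         //   StackSize = 20: 20 & 15 = 4  <= 12, so Offset = 20 + (12 - 4) = 28 = 16*1 + 12.
         //   StackSize = 30: 30 & 15 = 14 >  12, so Offset = (30 & ~15) + 16 + 12 = 44 = 16*2 + 12.
         // Either way the result has the form 16n + 12, leaving exactly one slot for the
         // return address below the next 16-byte boundary.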
   3083 
   3084 /// MatchingStackOffset - Return true if the given stack call argument is
    3085 /// already available in the same relative position in the caller's
   3086 /// incoming argument stack.
   3087 static
   3088 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   3089                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   3090                          const X86InstrInfo *TII) {
   3091   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   3092   int FI = INT_MAX;
   3093   if (Arg.getOpcode() == ISD::CopyFromReg) {
   3094     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   3095     if (!TargetRegisterInfo::isVirtualRegister(VR))
   3096       return false;
   3097     MachineInstr *Def = MRI->getVRegDef(VR);
   3098     if (!Def)
   3099       return false;
   3100     if (!Flags.isByVal()) {
   3101       if (!TII->isLoadFromStackSlot(Def, FI))
   3102         return false;
   3103     } else {
   3104       unsigned Opcode = Def->getOpcode();
   3105       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
   3106           Def->getOperand(1).isFI()) {
   3107         FI = Def->getOperand(1).getIndex();
   3108         Bytes = Flags.getByValSize();
   3109       } else
   3110         return false;
   3111     }
   3112   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   3113     if (Flags.isByVal())
   3114       // ByVal argument is passed in as a pointer but it's now being
   3115       // dereferenced. e.g.
   3116       // define @foo(%struct.X* %A) {
   3117       //   tail call @bar(%struct.X* byval %A)
   3118       // }
   3119       return false;
   3120     SDValue Ptr = Ld->getBasePtr();
   3121     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   3122     if (!FINode)
   3123       return false;
   3124     FI = FINode->getIndex();
   3125   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   3126     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   3127     FI = FINode->getIndex();
   3128     Bytes = Flags.getByValSize();
   3129   } else
   3130     return false;
   3131 
   3132   assert(FI != INT_MAX);
   3133   if (!MFI->isFixedObjectIndex(FI))
   3134     return false;
   3135   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
   3136 }
   3137 
   3138 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
   3139 /// for tail call optimization. Targets which want to do tail call
   3140 /// optimization should implement this function.
   3141 bool
   3142 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   3143                                                      CallingConv::ID CalleeCC,
   3144                                                      bool isVarArg,
   3145                                                      bool isCalleeStructRet,
   3146                                                      bool isCallerStructRet,
   3147                                                      Type *RetTy,
   3148                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
   3149                                     const SmallVectorImpl<SDValue> &OutVals,
   3150                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   3151                                                      SelectionDAG &DAG) const {
   3152   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
   3153     return false;
   3154 
   3155   // If -tailcallopt is specified, make fastcc functions tail-callable.
   3156   const MachineFunction &MF = DAG.getMachineFunction();
   3157   const Function *CallerF = MF.getFunction();
   3158 
   3159   // If the function return type is x86_fp80 and the callee return type is not,
   3160   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   3161   // perform a tailcall optimization here.
   3162   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
   3163     return false;
   3164 
   3165   CallingConv::ID CallerCC = CallerF->getCallingConv();
   3166   bool CCMatch = CallerCC == CalleeCC;
   3167   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
   3168   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
   3169 
   3170   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
   3171     if (IsTailCallConvention(CalleeCC) && CCMatch)
   3172       return true;
   3173     return false;
   3174   }
   3175 
   3176   // Look for obvious safe cases to perform tail call optimization that do not
   3177   // require ABI changes. This is what gcc calls sibcall.
   3178 
   3179   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   3180   // emit a special epilogue.
   3181   const X86RegisterInfo *RegInfo =
   3182     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   3183   if (RegInfo->needsStackRealignment(MF))
   3184     return false;
   3185 
   3186   // Also avoid sibcall optimization if either caller or callee uses struct
   3187   // return semantics.
   3188   if (isCalleeStructRet || isCallerStructRet)
   3189     return false;
   3190 
   3191   // An stdcall/thiscall caller is expected to clean up its arguments; the
   3192   // callee isn't going to do that.
   3193   // FIXME: this is more restrictive than needed. We could produce a tailcall
   3194   // when the stack adjustment matches. For example, with a thiscall that takes
   3195   // only one argument.
   3196   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
   3197                    CallerCC == CallingConv::X86_ThisCall))
   3198     return false;
   3199 
   3200   // Do not sibcall optimize vararg calls unless all arguments are passed via
   3201   // registers.
   3202   if (isVarArg && !Outs.empty()) {
   3203 
   3204     // Optimizing for varargs on Win64 is unlikely to be safe without
   3205     // additional testing.
   3206     if (IsCalleeWin64 || IsCallerWin64)
   3207       return false;
   3208 
   3209     SmallVector<CCValAssign, 16> ArgLocs;
   3210     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   3211                    DAG.getTarget(), ArgLocs, *DAG.getContext());
   3212 
   3213     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3214     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   3215       if (!ArgLocs[i].isRegLoc())
   3216         return false;
   3217   }
   3218 
   3219   // If the call result is in ST0 / ST1, it needs to be popped off the x87
   3220   // stack.  Therefore, if it's not used by the call it is not safe to optimize
   3221   // this into a sibcall.
   3222   bool Unused = false;
   3223   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   3224     if (!Ins[i].Used) {
   3225       Unused = true;
   3226       break;
   3227     }
   3228   }
   3229   if (Unused) {
   3230     SmallVector<CCValAssign, 16> RVLocs;
   3231     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
   3232                    DAG.getTarget(), RVLocs, *DAG.getContext());
   3233     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   3234     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   3235       CCValAssign &VA = RVLocs[i];
   3236       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
   3237         return false;
   3238     }
   3239   }
   3240 
   3241   // If the calling conventions do not match, then we'd better make sure the
   3242   // results are returned in the same way as what the caller expects.
   3243   if (!CCMatch) {
   3244     SmallVector<CCValAssign, 16> RVLocs1;
   3245     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
   3246                     DAG.getTarget(), RVLocs1, *DAG.getContext());
   3247     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
   3248 
   3249     SmallVector<CCValAssign, 16> RVLocs2;
   3250     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
   3251                     DAG.getTarget(), RVLocs2, *DAG.getContext());
   3252     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
   3253 
   3254     if (RVLocs1.size() != RVLocs2.size())
   3255       return false;
   3256     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   3257       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   3258         return false;
   3259       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   3260         return false;
   3261       if (RVLocs1[i].isRegLoc()) {
   3262         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   3263           return false;
   3264       } else {
   3265         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   3266           return false;
   3267       }
   3268     }
   3269   }
   3270 
   3271   // If the callee takes no arguments then go on to check the results of the
   3272   // call.
   3273   if (!Outs.empty()) {
   3274     // Check if stack adjustment is needed. For now, do not do this if any
   3275     // argument is passed on the stack.
   3276     SmallVector<CCValAssign, 16> ArgLocs;
   3277     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   3278                    DAG.getTarget(), ArgLocs, *DAG.getContext());
   3279 
   3280     // Allocate shadow area for Win64
   3281     if (IsCalleeWin64)
   3282       CCInfo.AllocateStack(32, 8);
   3283 
   3284     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3285     if (CCInfo.getNextStackOffset()) {
   3286       MachineFunction &MF = DAG.getMachineFunction();
   3287       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
   3288         return false;
   3289 
   3290       // Check if the arguments are already laid out in the right way as
   3291       // the caller's fixed stack objects.
   3292       MachineFrameInfo *MFI = MF.getFrameInfo();
   3293       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   3294       const X86InstrInfo *TII =
   3295           static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
   3296       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3297         CCValAssign &VA = ArgLocs[i];
   3298         SDValue Arg = OutVals[i];
   3299         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3300         if (VA.getLocInfo() == CCValAssign::Indirect)
   3301           return false;
   3302         if (!VA.isRegLoc()) {
   3303           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   3304                                    MFI, MRI, TII))
   3305             return false;
   3306         }
   3307       }
   3308     }
   3309 
   3310     // If the tailcall address may be in a register, then make sure it's
   3311     // possible to register allocate for it. In 32-bit, the call address can
   3312     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   3313     // callee-saved registers are restored. These happen to be the same
   3314     // registers used to pass 'inreg' arguments so watch out for those.
   3315     if (!Subtarget->is64Bit() &&
   3316         ((!isa<GlobalAddressSDNode>(Callee) &&
   3317           !isa<ExternalSymbolSDNode>(Callee)) ||
   3318          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
   3319       unsigned NumInRegs = 0;
   3320       // In PIC we need an extra register to formulate the address computation
   3321       // for the callee.
   3322       unsigned MaxInRegs =
   3323           (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
   3324 
   3325       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3326         CCValAssign &VA = ArgLocs[i];
   3327         if (!VA.isRegLoc())
   3328           continue;
   3329         unsigned Reg = VA.getLocReg();
   3330         switch (Reg) {
   3331         default: break;
   3332         case X86::EAX: case X86::EDX: case X86::ECX:
   3333           if (++NumInRegs == MaxInRegs)
   3334             return false;
   3335           break;
   3336         }
   3337       }
   3338     }
   3339   }
   3340 
   3341   return true;
   3342 }
   3343 
   3344 FastISel *
   3345 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   3346                                   const TargetLibraryInfo *libInfo) const {
   3347   return X86::createFastISel(funcInfo, libInfo);
   3348 }
   3349 
   3350 //===----------------------------------------------------------------------===//
   3351 //                           Other Lowering Hooks
   3352 //===----------------------------------------------------------------------===//
   3353 
   3354 static bool MayFoldLoad(SDValue Op) {
   3355   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   3356 }
   3357 
   3358 static bool MayFoldIntoStore(SDValue Op) {
   3359   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   3360 }
   3361 
   3362 static bool isTargetShuffle(unsigned Opcode) {
   3363   switch(Opcode) {
   3364   default: return false;
   3365   case X86ISD::PSHUFD:
   3366   case X86ISD::PSHUFHW:
   3367   case X86ISD::PSHUFLW:
   3368   case X86ISD::SHUFP:
   3369   case X86ISD::PALIGNR:
   3370   case X86ISD::MOVLHPS:
   3371   case X86ISD::MOVLHPD:
   3372   case X86ISD::MOVHLPS:
   3373   case X86ISD::MOVLPS:
   3374   case X86ISD::MOVLPD:
   3375   case X86ISD::MOVSHDUP:
   3376   case X86ISD::MOVSLDUP:
   3377   case X86ISD::MOVDDUP:
   3378   case X86ISD::MOVSS:
   3379   case X86ISD::MOVSD:
   3380   case X86ISD::UNPCKL:
   3381   case X86ISD::UNPCKH:
   3382   case X86ISD::VPERMILP:
   3383   case X86ISD::VPERM2X128:
   3384   case X86ISD::VPERMI:
   3385     return true;
   3386   }
   3387 }
   3388 
   3389 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3390                                     SDValue V1, SelectionDAG &DAG) {
   3391   switch(Opc) {
   3392   default: llvm_unreachable("Unknown x86 shuffle node");
   3393   case X86ISD::MOVSHDUP:
   3394   case X86ISD::MOVSLDUP:
   3395   case X86ISD::MOVDDUP:
   3396     return DAG.getNode(Opc, dl, VT, V1);
   3397   }
   3398 }
   3399 
   3400 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3401                                     SDValue V1, unsigned TargetMask,
   3402                                     SelectionDAG &DAG) {
   3403   switch(Opc) {
   3404   default: llvm_unreachable("Unknown x86 shuffle node");
   3405   case X86ISD::PSHUFD:
   3406   case X86ISD::PSHUFHW:
   3407   case X86ISD::PSHUFLW:
   3408   case X86ISD::VPERMILP:
   3409   case X86ISD::VPERMI:
   3410     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   3411   }
   3412 }
   3413 
   3414 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3415                                     SDValue V1, SDValue V2, unsigned TargetMask,
   3416                                     SelectionDAG &DAG) {
   3417   switch(Opc) {
   3418   default: llvm_unreachable("Unknown x86 shuffle node");
   3419   case X86ISD::PALIGNR:
   3420   case X86ISD::SHUFP:
   3421   case X86ISD::VPERM2X128:
   3422     return DAG.getNode(Opc, dl, VT, V1, V2,
   3423                        DAG.getConstant(TargetMask, MVT::i8));
   3424   }
   3425 }
   3426 
   3427 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3428                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   3429   switch(Opc) {
   3430   default: llvm_unreachable("Unknown x86 shuffle node");
   3431   case X86ISD::MOVLHPS:
   3432   case X86ISD::MOVLHPD:
   3433   case X86ISD::MOVHLPS:
   3434   case X86ISD::MOVLPS:
   3435   case X86ISD::MOVLPD:
   3436   case X86ISD::MOVSS:
   3437   case X86ISD::MOVSD:
   3438   case X86ISD::UNPCKL:
   3439   case X86ISD::UNPCKH:
   3440     return DAG.getNode(Opc, dl, VT, V1, V2);
   3441   }
   3442 }
   3443 
   3444 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   3445   MachineFunction &MF = DAG.getMachineFunction();
   3446   const X86RegisterInfo *RegInfo =
   3447     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   3448   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   3449   int ReturnAddrIndex = FuncInfo->getRAIndex();
   3450 
   3451   if (ReturnAddrIndex == 0) {
   3452     // Set up a frame object for the return address.
   3453     unsigned SlotSize = RegInfo->getSlotSize();
   3454     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
   3455                                                            -(int64_t)SlotSize,
   3456                                                            false);
   3457     FuncInfo->setRAIndex(ReturnAddrIndex);
   3458   }
   3459 
   3460   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
   3461 }
   3462 
   3463 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   3464                                        bool hasSymbolicDisplacement) {
   3465   // Offset should fit into 32 bit immediate field.
   3466   if (!isInt<32>(Offset))
   3467     return false;
   3468 
   3469   // If we don't have a symbolic displacement - we don't have any extra
   3470   // restrictions.
   3471   if (!hasSymbolicDisplacement)
   3472     return true;
   3473 
   3474   // FIXME: Some tweaks might be needed for medium code model.
   3475   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3476     return false;
   3477 
   3478   // For the small code model we assume the last object ends at least 16MB
   3479   // before the 31-bit boundary. We also accept large negative constants, since
   3480   // all objects lie in the positive half of the address space.
   3481   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3482     return true;
   3483 
   3484   // For the kernel code model we know that all objects reside in the negative
   3485   // half of the 32-bit address space. Negative offsets must be rejected, since
   3486   // they may push an address out of range, but large positive ones are fine.
   3487   if (M == CodeModel::Kernel && Offset > 0)
   3488     return true;
   3489 
   3490   return false;
   3491 }
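        // Illustrative behavior of the checks above: with a symbolic displacement,
        // the small code model accepts an offset of 15MB (0x00F00000) but rejects
        // 16MB (0x01000000), while the kernel code model accepts any positive
        // offset that fits in a 32-bit immediate.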
   3492 
   3493 /// isCalleePop - Determines whether the callee is required to pop its
   3494 /// own arguments. Callee pop is necessary to support tail calls.
   3495 bool X86::isCalleePop(CallingConv::ID CallingConv,
   3496                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
   3497   if (IsVarArg)
   3498     return false;
   3499 
   3500   switch (CallingConv) {
   3501   default:
   3502     return false;
   3503   case CallingConv::X86_StdCall:
   3504     return !is64Bit;
   3505   case CallingConv::X86_FastCall:
   3506     return !is64Bit;
   3507   case CallingConv::X86_ThisCall:
   3508     return !is64Bit;
   3509   case CallingConv::Fast:
   3510     return TailCallOpt;
   3511   case CallingConv::GHC:
   3512     return TailCallOpt;
   3513   case CallingConv::HiPE:
   3514     return TailCallOpt;
   3515   }
   3516 }
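        // For example, a 32-bit stdcall, fastcall or thiscall callee pops its own
        // arguments (it returns with "ret imm16"), vararg callees never do, and
        // fastcc, GHC and HiPE callees pop only when the guaranteed tail call
        // optimization flag (TailCallOpt) is set.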
   3517 
   3518 /// \brief Return true if the condition is an unsigned comparison operation.
   3519 static bool isX86CCUnsigned(unsigned X86CC) {
   3520   switch (X86CC) {
   3521   default: llvm_unreachable("Invalid integer condition!");
   3522   case X86::COND_E:     return true;
   3523   case X86::COND_G:     return false;
   3524   case X86::COND_GE:    return false;
   3525   case X86::COND_L:     return false;
   3526   case X86::COND_LE:    return false;
   3527   case X86::COND_NE:    return true;
   3528   case X86::COND_B:     return true;
   3529   case X86::COND_A:     return true;
   3530   case X86::COND_BE:    return true;
   3531   case X86::COND_AE:    return true;
   3532   }
   3533   llvm_unreachable("covered switch fell through?!");
   3534 }
   3535 
   3536 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
   3537 /// X86-specific condition code, returning the condition code and the LHS/RHS
   3538 /// of the comparison to make.
   3539 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
   3540                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
   3541   if (!isFP) {
   3542     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   3543       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
   3544         // X > -1   -> X == 0, jump !sign.
   3545         RHS = DAG.getConstant(0, RHS.getValueType());
   3546         return X86::COND_NS;
   3547       }
   3548       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
   3549         // X < 0   -> X == 0, jump on sign.
   3550         return X86::COND_S;
   3551       }
   3552       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   3553         // X < 1   -> X <= 0
   3554         RHS = DAG.getConstant(0, RHS.getValueType());
   3555         return X86::COND_LE;
   3556       }
   3557     }
   3558 
   3559     switch (SetCCOpcode) {
   3560     default: llvm_unreachable("Invalid integer condition!");
   3561     case ISD::SETEQ:  return X86::COND_E;
   3562     case ISD::SETGT:  return X86::COND_G;
   3563     case ISD::SETGE:  return X86::COND_GE;
   3564     case ISD::SETLT:  return X86::COND_L;
   3565     case ISD::SETLE:  return X86::COND_LE;
   3566     case ISD::SETNE:  return X86::COND_NE;
   3567     case ISD::SETULT: return X86::COND_B;
   3568     case ISD::SETUGT: return X86::COND_A;
   3569     case ISD::SETULE: return X86::COND_BE;
   3570     case ISD::SETUGE: return X86::COND_AE;
   3571     }
   3572   }
   3573 
   3574   // First determine if it is required or is profitable to flip the operands.
   3575 
   3576   // If LHS is a foldable load, but RHS is not, flip the condition.
   3577   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   3578       !ISD::isNON_EXTLoad(RHS.getNode())) {
   3579     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   3580     std::swap(LHS, RHS);
   3581   }
   3582 
   3583   switch (SetCCOpcode) {
   3584   default: break;
   3585   case ISD::SETOLT:
   3586   case ISD::SETOLE:
   3587   case ISD::SETUGT:
   3588   case ISD::SETUGE:
   3589     std::swap(LHS, RHS);
   3590     break;
   3591   }
   3592 
   3593   // On a floating point condition, the flags are set as follows:
   3594   // ZF  PF  CF   op
   3595   //  0 | 0 | 0 | X > Y
   3596   //  0 | 0 | 1 | X < Y
   3597   //  1 | 0 | 0 | X == Y
   3598   //  1 | 1 | 1 | unordered
   3599   switch (SetCCOpcode) {
   3600   default: llvm_unreachable("Condcode should be pre-legalized away");
   3601   case ISD::SETUEQ:
   3602   case ISD::SETEQ:   return X86::COND_E;
   3603   case ISD::SETOLT:              // flipped
   3604   case ISD::SETOGT:
   3605   case ISD::SETGT:   return X86::COND_A;
   3606   case ISD::SETOLE:              // flipped
   3607   case ISD::SETOGE:
   3608   case ISD::SETGE:   return X86::COND_AE;
   3609   case ISD::SETUGT:              // flipped
   3610   case ISD::SETULT:
   3611   case ISD::SETLT:   return X86::COND_B;
   3612   case ISD::SETUGE:              // flipped
   3613   case ISD::SETULE:
   3614   case ISD::SETLE:   return X86::COND_BE;
   3615   case ISD::SETONE:
   3616   case ISD::SETNE:   return X86::COND_NE;
   3617   case ISD::SETUO:   return X86::COND_P;
   3618   case ISD::SETO:    return X86::COND_NP;
   3619   case ISD::SETOEQ:
   3620   case ISD::SETUNE:  return X86::COND_INVALID;
   3621   }
   3622 }
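        // Worked examples of the translation above: (i32 X setgt -1) becomes a
        // compare of X against 0 with COND_NS, and (f64 X setolt Y) swaps the
        // operands and returns COND_A, so the unordered case (ZF=PF=CF=1 in the
        // table above) does not satisfy the condition.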
   3623 
   3624 /// hasFPCMov - Return true if there is a floating-point cmov for the specified
   3625 /// X86 condition code. The current x86 ISA includes these FP cmov instructions:
   3626 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   3627 static bool hasFPCMov(unsigned X86CC) {
   3628   switch (X86CC) {
   3629   default:
   3630     return false;
   3631   case X86::COND_B:
   3632   case X86::COND_BE:
   3633   case X86::COND_E:
   3634   case X86::COND_P:
   3635   case X86::COND_A:
   3636   case X86::COND_AE:
   3637   case X86::COND_NE:
   3638   case X86::COND_NP:
   3639     return true;
   3640   }
   3641 }
   3642 
   3643 /// isFPImmLegal - Returns true if the target can instruction select the
   3644 /// specified FP immediate natively. If false, the legalizer will
   3645 /// materialize the FP immediate as a load from a constant pool.
   3646 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   3647   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   3648     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   3649       return true;
   3650   }
   3651   return false;
   3652 }
   3653 
   3654 /// \brief Returns true if it is beneficial to convert a load of a constant
   3655 /// to just the constant itself.
   3656 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   3657                                                           Type *Ty) const {
   3658   assert(Ty->isIntegerTy());
   3659 
   3660   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   3661   if (BitSize == 0 || BitSize > 64)
   3662     return false;
   3663   return true;
   3664 }
   3665 
   3666 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
   3667 /// the specified range (L, H].
   3668 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   3669   return (Val < 0) || (Val >= Low && Val < Hi);
   3670 }
   3671 
   3672 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
   3673 /// specified value.
   3674 static bool isUndefOrEqual(int Val, int CmpVal) {
   3675   return (Val < 0 || Val == CmpVal);
   3676 }
   3677 
   3678 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
   3679 /// from position Pos and ending at Pos+Size, matches the sequential values
   3680 /// [Low, Low+Size) or is undef.
   3681 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   3682                                        unsigned Pos, unsigned Size, int Low) {
   3683   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   3684     if (!isUndefOrEqual(Mask[i], Low))
   3685       return false;
   3686   return true;
   3687 }
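        // For example, with Mask = <4, -1, 6, 7, 0, 1, 2, 3>,
        // isSequentialOrUndefInRange(Mask, 0, 4, 4) is true (positions 0..3 match
        // 4, 5, 6, 7 with an undef at position 1), while
        // isSequentialOrUndefInRange(Mask, 4, 4, 4) is false (position 4 is 0).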
   3688 
   3689 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
   3690 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
   3691 /// the second operand.
   3692 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
   3693   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
   3694     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   3695   if (VT == MVT::v2f64 || VT == MVT::v2i64)
   3696     return (Mask[0] < 2 && Mask[1] < 2);
   3697   return false;
   3698 }
   3699 
   3700 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
   3701 /// is suitable for input to PSHUFHW.
   3702 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   3703   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
   3704     return false;
   3705 
   3706   // Lower quadword copied in order or undef.
   3707   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
   3708     return false;
   3709 
   3710   // Upper quadword shuffled.
   3711   for (unsigned i = 4; i != 8; ++i)
   3712     if (!isUndefOrInRange(Mask[i], 4, 8))
   3713       return false;
   3714 
   3715   if (VT == MVT::v16i16) {
   3716     // Lower quadword copied in order or undef.
   3717     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
   3718       return false;
   3719 
   3720     // Upper quadword shuffled.
   3721     for (unsigned i = 12; i != 16; ++i)
   3722       if (!isUndefOrInRange(Mask[i], 12, 16))
   3723         return false;
   3724   }
   3725 
   3726   return true;
   3727 }
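        // For example, the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> is a valid PSHUFHW
        // mask: the low quadword is copied in order and every high-quadword index
        // stays within [4, 8). The v16i16 form additionally requires the same
        // property in the upper 128-bit lane.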
   3728 
   3729 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
   3730 /// is suitable for input to PSHUFLW.
   3731 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   3732   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
   3733     return false;
   3734 
   3735   // Upper quadword copied in order.
   3736   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
   3737     return false;
   3738 
   3739   // Lower quadword shuffled.
   3740   for (unsigned i = 0; i != 4; ++i)
   3741     if (!isUndefOrInRange(Mask[i], 0, 4))
   3742       return false;
   3743 
   3744   if (VT == MVT::v16i16) {
   3745     // Upper quadword copied in order.
   3746     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
   3747       return false;
   3748 
   3749     // Lower quadword shuffled.
   3750     for (unsigned i = 8; i != 12; ++i)
   3751       if (!isUndefOrInRange(Mask[i], 8, 12))
   3752         return false;
   3753   }
   3754 
   3755   return true;
   3756 }
   3757 
   3758 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
   3759 /// is suitable for input to PALIGNR.
   3760 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
   3761                           const X86Subtarget *Subtarget) {
   3762   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
   3763       (VT.is256BitVector() && !Subtarget->hasInt256()))
   3764     return false;
   3765 
   3766   unsigned NumElts = VT.getVectorNumElements();
   3767   unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
   3768   unsigned NumLaneElts = NumElts/NumLanes;
   3769 
   3770   // Do not handle 64-bit element shuffles with palignr.
   3771   if (NumLaneElts == 2)
   3772     return false;
   3773 
   3774   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
   3775     unsigned i;
   3776     for (i = 0; i != NumLaneElts; ++i) {
   3777       if (Mask[i+l] >= 0)
   3778         break;
   3779     }
   3780 
   3781     // Lane is all undef, go to next lane
   3782     if (i == NumLaneElts)
   3783       continue;
   3784 
   3785     int Start = Mask[i+l];
   3786 
   3787     // Make sure it's in this lane in one of the sources
   3788     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
   3789         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
   3790       return false;
   3791 
   3792     // If not lane 0, then we must match lane 0
   3793     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
   3794       return false;
   3795 
   3796     // Correct second source to be contiguous with first source
   3797     if (Start >= (int)NumElts)
   3798       Start -= NumElts - NumLaneElts;
   3799 
   3800     // Make sure we're shifting in the right direction.
   3801     if (Start <= (int)(i+l))
   3802       return false;
   3803 
   3804     Start -= i;
   3805 
   3806     // Check the rest of the elements to see if they are consecutive.
   3807     for (++i; i != NumLaneElts; ++i) {
   3808       int Idx = Mask[i+l];
   3809 
   3810       // Make sure it's in this lane
   3811       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
   3812           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
   3813         return false;
   3814 
   3815       // If not lane 0, then we must match lane 0
   3816       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
   3817         return false;
   3818 
   3819       if (Idx >= (int)NumElts)
   3820         Idx -= NumElts - NumLaneElts;
   3821 
   3822       if (!isUndefOrEqual(Idx, Start+i))
   3823         return false;
   3824 
   3825     }
   3826   }
   3827 
   3828   return true;
   3829 }
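        // For example, with SSSE3 available, the v16i8 mask
        // <5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20>
        // is accepted: it is a contiguous window starting at byte 5 that spans the
        // boundary between the two 16-byte sources.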
   3830 
   3831 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
   3832 /// the two vector operands have swapped position.
   3833 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
   3834                                      unsigned NumElems) {
   3835   for (unsigned i = 0; i != NumElems; ++i) {
   3836     int idx = Mask[i];
   3837     if (idx < 0)
   3838       continue;
   3839     else if (idx < (int)NumElems)
   3840       Mask[i] = idx + NumElems;
   3841     else
   3842       Mask[i] = idx - NumElems;
   3843   }
   3844 }
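        // For example, with NumElems == 4 the mask <0, 5, -1, 3> becomes
        // <4, 1, -1, 7>: indices into the first source now refer to the second
        // source and vice versa, while undef (-1) entries are left alone.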
   3845 
   3846 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
   3847 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
   3848 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for the sources to
   3849 /// be in the reverse order of what x86 shuffles want.
   3850 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
   3851 
   3852   unsigned NumElems = VT.getVectorNumElements();
   3853   unsigned NumLanes = VT.getSizeInBits()/128;
   3854   unsigned NumLaneElems = NumElems/NumLanes;
   3855 
   3856   if (NumLaneElems != 2 && NumLaneElems != 4)
   3857     return false;
   3858 
   3859   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   3860   bool symetricMaskRequired =
   3861     (VT.getSizeInBits() >= 256) && (EltSize == 32);
   3862 
   3863   // VSHUFPSY divides the resulting vector into 4 chunks.
   3864   // The sources are also split into 4 chunks, and each destination
   3865   // chunk must come from a different source chunk.
   3866   //
   3867   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
   3868   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
   3869   //
   3870   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
   3871   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
   3872   //
   3873   // VSHUFPDY divides the resulting vector into 4 chunks.
   3874   // The sources are also split into 4 chunks, and each destination
   3875   // chunk must come from a different source chunk.
   3876   //
   3877   //  SRC1 =>      X3       X2       X1       X0
   3878   //  SRC2 =>      Y3       Y2       Y1       Y0
   3879   //
   3880   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   3881   //
   3882   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
   3883   unsigned HalfLaneElems = NumLaneElems/2;
   3884   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
   3885     for (unsigned i = 0; i != NumLaneElems; ++i) {
   3886       int Idx = Mask[i+l];
   3887       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
   3888       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
   3889         return false;
   3890       // For VSHUFPSY, the mask of the second half must be the same as the
   3891       // first but with the appropriate offsets. This works in the same way as
   3892       // VPERMILPS works with masks.
   3893       if (!symetricMaskRequired || Idx < 0)
   3894         continue;
   3895       if (MaskVal[i] < 0) {
   3896         MaskVal[i] = Idx - l;
   3897         continue;
   3898       }
   3899       if ((signed)(Idx - l) != MaskVal[i])
   3900         return false;
   3901     }
   3902   }
   3903 
   3904   return true;
   3905 }
   3906 
   3907 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3908 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
   3909 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
   3910   if (!VT.is128BitVector())
   3911     return false;
   3912 
   3913   unsigned NumElems = VT.getVectorNumElements();
   3914 
   3915   if (NumElems != 4)
   3916     return false;
   3917 
   3918   // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3
   3919   return isUndefOrEqual(Mask[0], 6) &&
   3920          isUndefOrEqual(Mask[1], 7) &&
   3921          isUndefOrEqual(Mask[2], 2) &&
   3922          isUndefOrEqual(Mask[3], 3);
   3923 }
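        // For example, the v4f32 mask <6, 7, 2, 3> matches: the low half of the
        // result is the high half of V2 and the high half is the high half of V1,
        // which is exactly what MOVHLPS produces.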
   3924 
   3925 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
   3926 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
   3927 /// <2, 3, 2, 3>
   3928 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
   3929   if (!VT.is128BitVector())
   3930     return false;
   3931 
   3932   unsigned NumElems = VT.getVectorNumElements();
   3933 
   3934   if (NumElems != 4)
   3935     return false;
   3936 
   3937   return isUndefOrEqual(Mask[0], 2) &&
   3938          isUndefOrEqual(Mask[1], 3) &&
   3939          isUndefOrEqual(Mask[2], 2) &&
   3940          isUndefOrEqual(Mask[3], 3);
   3941 }
   3942 
   3943 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
   3944 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
   3945 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
   3946   if (!VT.is128BitVector())
   3947     return false;
   3948 
   3949   unsigned NumElems = VT.getVectorNumElements();
   3950 
   3951   if (NumElems != 2 && NumElems != 4)
   3952     return false;
   3953 
   3954   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3955     if (!isUndefOrEqual(Mask[i], i + NumElems))
   3956       return false;
   3957 
   3958   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
   3959     if (!isUndefOrEqual(Mask[i], i))
   3960       return false;
   3961 
   3962   return true;
   3963 }
   3964 
   3965 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3966 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
   3967 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
   3968   if (!VT.is128BitVector())
   3969     return false;
   3970 
   3971   unsigned NumElems = VT.getVectorNumElements();
   3972 
   3973   if (NumElems != 2 && NumElems != 4)
   3974     return false;
   3975 
   3976   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3977     if (!isUndefOrEqual(Mask[i], i))
   3978       return false;
   3979 
   3980   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3981     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
   3982       return false;
   3983 
   3984   return true;
   3985 }
   3986 
   3987 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3988 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
   3989 /// i.e. if all but one element come from the same vector.
   3990 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
   3991   // TODO: Deal with AVX's VINSERTPS
   3992   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
   3993     return false;
   3994 
   3995   unsigned CorrectPosV1 = 0;
   3996   unsigned CorrectPosV2 = 0;
   3997   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
   3998     if (Mask[i] == -1) {
   3999       ++CorrectPosV1;
   4000       ++CorrectPosV2;
   4001       continue;
   4002     }
   4003 
   4004     if (Mask[i] == i)
   4005       ++CorrectPosV1;
   4006     else if (Mask[i] == i + 4)
   4007       ++CorrectPosV2;
   4008   }
   4009 
   4010   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
   4011     // We have 3 elements (undefs count as elements from any vector) from one
   4012     // vector, and one from another.
   4013     return true;
   4014 
   4015   return false;
   4016 }
   4017 
   4018 //
   4019 // Some special combinations that can be optimized.
   4020 //
   4021 static
   4022 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
   4023                                SelectionDAG &DAG) {
   4024   MVT VT = SVOp->getSimpleValueType(0);
   4025   SDLoc dl(SVOp);
   4026 
   4027   if (VT != MVT::v8i32 && VT != MVT::v8f32)
   4028     return SDValue();
   4029 
   4030   ArrayRef<int> Mask = SVOp->getMask();
   4031 
   4032   // These are the special masks that may be optimized.
   4033   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
   4034   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
   4035   bool MatchEvenMask = true;
   4036   bool MatchOddMask  = true;
   4037   for (int i=0; i<8; ++i) {
   4038     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
   4039       MatchEvenMask = false;
   4040     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
   4041       MatchOddMask = false;
   4042   }
   4043 
   4044   if (!MatchEvenMask && !MatchOddMask)
   4045     return SDValue();
   4046 
   4047   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
   4048 
   4049   SDValue Op0 = SVOp->getOperand(0);
   4050   SDValue Op1 = SVOp->getOperand(1);
   4051 
   4052   if (MatchEvenMask) {
   4053     // Shift the second operand right by 32 bits.
   4054     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
   4055     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
   4056   } else {
   4057     // Shift the first operand left by 32 bits.
   4058     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
   4059     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
   4060   }
   4061   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
   4062   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
   4063 }
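        // For example, for the even mask <0, 8, 2, 10, 4, 12, 6, 14> the code above
        // shuffles Op1 with <-1, 0, -1, 2, -1, 4, -1, 6> and then blends with
        // <0, 9, 2, 11, 4, 13, 6, 15>, which yields
        // Op0[0], Op1[0], Op0[2], Op1[2], Op0[4], Op1[4], Op0[6], Op1[6].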
   4064 
   4065 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
   4066 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
   4067 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
   4068                          bool HasInt256, bool V2IsSplat = false) {
   4069 
   4070   assert(VT.getSizeInBits() >= 128 &&
   4071          "Unsupported vector type for unpckl");
   4072 
   4073   // AVX defines UNPCK* to operate independently on 128-bit lanes.
   4074   unsigned NumLanes;
   4075   unsigned NumOf256BitLanes;
   4076   unsigned NumElts = VT.getVectorNumElements();
   4077   if (VT.is256BitVector()) {
   4078     if (NumElts != 4 && NumElts != 8 &&
   4079         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4080     return false;
   4081     NumLanes = 2;
   4082     NumOf256BitLanes = 1;
   4083   } else if (VT.is512BitVector()) {
   4084     assert(VT.getScalarType().getSizeInBits() >= 32 &&
   4085            "Unsupported vector type for unpckl");
   4086     NumLanes = 2;
   4087     NumOf256BitLanes = 2;
   4088   } else {
   4089     NumLanes = 1;
   4090     NumOf256BitLanes = 1;
   4091   }
   4092 
   4093   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
   4094   unsigned NumLaneElts = NumEltsInStride/NumLanes;
   4095 
   4096   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
   4097     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
   4098       for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
   4099         int BitI  = Mask[l256*NumEltsInStride+l+i];
   4100         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
   4101         if (!isUndefOrEqual(BitI, j+l256*NumElts))
   4102           return false;
   4103         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
   4104           return false;
   4105         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
   4106           return false;
   4107       }
   4108     }
   4109   }
   4110   return true;
   4111 }
   4112 
   4113 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
   4114 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
   4115 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
   4116                          bool HasInt256, bool V2IsSplat = false) {
   4117   assert(VT.getSizeInBits() >= 128 &&
   4118          "Unsupported vector type for unpckh");
   4119 
   4120   // AVX defines UNPCK* to operate independently on 128-bit lanes.
   4121   unsigned NumLanes;
   4122   unsigned NumOf256BitLanes;
   4123   unsigned NumElts = VT.getVectorNumElements();
   4124   if (VT.is256BitVector()) {
   4125     if (NumElts != 4 && NumElts != 8 &&
   4126         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4127     return false;
   4128     NumLanes = 2;
   4129     NumOf256BitLanes = 1;
   4130   } else if (VT.is512BitVector()) {
   4131     assert(VT.getScalarType().getSizeInBits() >= 32 &&
   4132            "Unsupported vector type for unpckh");
   4133     NumLanes = 2;
   4134     NumOf256BitLanes = 2;
   4135   } else {
   4136     NumLanes = 1;
   4137     NumOf256BitLanes = 1;
   4138   }
   4139 
   4140   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
   4141   unsigned NumLaneElts = NumEltsInStride/NumLanes;
   4142 
   4143   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
   4144     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
   4145       for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
   4146         int BitI  = Mask[l256*NumEltsInStride+l+i];
   4147         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
   4148         if (!isUndefOrEqual(BitI, j+l256*NumElts))
   4149           return false;
   4150         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
   4151           return false;
   4152         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
   4153           return false;
   4154       }
   4155     }
   4156   }
   4157   return true;
   4158 }
   4159 
   4160 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
   4161 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
   4162 /// <0, 0, 1, 1>
   4163 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   4164   unsigned NumElts = VT.getVectorNumElements();
   4165   bool Is256BitVec = VT.is256BitVector();
   4166 
   4167   if (VT.is512BitVector())
   4168     return false;
   4169   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   4170          "Unsupported vector type for unpckl");
   4171 
   4172   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
   4173       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4174     return false;
   4175 
   4176   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern.
   4177   // FIXME: Need a better way to get rid of this; there's no latency difference
   4178   // between UNPCKLPD and MOVDDUP. The latter should always be checked first and
   4179   // the former later. We should also remove the "_undef" special mask.
   4180   if (NumElts == 4 && Is256BitVec)
   4181     return false;
   4182 
   4183   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   4184   // independently on 128-bit lanes.
   4185   unsigned NumLanes = VT.getSizeInBits()/128;
   4186   unsigned NumLaneElts = NumElts/NumLanes;
   4187 
   4188   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   4189     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
   4190       int BitI  = Mask[l+i];
   4191       int BitI1 = Mask[l+i+1];
   4192 
   4193       if (!isUndefOrEqual(BitI, j))
   4194         return false;
   4195       if (!isUndefOrEqual(BitI1, j))
   4196         return false;
   4197     }
   4198   }
   4199 
   4200   return true;
   4201 }
   4202 
   4203 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
   4204 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
   4205 /// <2, 2, 3, 3>
   4206 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   4207   unsigned NumElts = VT.getVectorNumElements();
   4208 
   4209   if (VT.is512BitVector())
   4210     return false;
   4211 
   4212   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   4213          "Unsupported vector type for unpckh");
   4214 
   4215   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
   4216       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4217     return false;
   4218 
   4219   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   4220   // independently on 128-bit lanes.
   4221   unsigned NumLanes = VT.getSizeInBits()/128;
   4222   unsigned NumLaneElts = NumElts/NumLanes;
   4223 
   4224   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   4225     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
   4226       int BitI  = Mask[l+i];
   4227       int BitI1 = Mask[l+i+1];
   4228       if (!isUndefOrEqual(BitI, j))
   4229         return false;
   4230       if (!isUndefOrEqual(BitI1, j))
   4231         return false;
   4232     }
   4233   }
   4234   return true;
   4235 }
   4236 
   4237 // Match for INSERTI64x4/INSERTF64x4 instructions: (src0[0], src1[0]) or
   4238 // (src1[0], src0[1]); these manipulate 256-bit sub-vectors.
   4239 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
   4240   if (!VT.is512BitVector())
   4241     return false;
   4242 
   4243   unsigned NumElts = VT.getVectorNumElements();
   4244   unsigned HalfSize = NumElts/2;
   4245   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
   4246     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
   4247       *Imm = 1;
   4248       return true;
   4249     }
   4250   }
   4251   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
   4252     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
   4253       *Imm = 0;
   4254       return true;
   4255     }
   4256   }
   4257   return false;
   4258 }
   4259 
   4260 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
   4261 /// specifies a shuffle of elements that is suitable for input to MOVSS,
   4262 /// MOVSD, and MOVD, i.e. setting the lowest element.
   4263 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
   4264   if (VT.getVectorElementType().getSizeInBits() < 32)
   4265     return false;
   4266   if (!VT.is128BitVector())
   4267     return false;
   4268 
   4269   unsigned NumElts = VT.getVectorNumElements();
   4270 
   4271   if (!isUndefOrEqual(Mask[0], NumElts))
   4272     return false;
   4273 
   4274   for (unsigned i = 1; i != NumElts; ++i)
   4275     if (!isUndefOrEqual(Mask[i], i))
   4276       return false;
   4277 
   4278   return true;
   4279 }
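        // For example, the v4i32 mask <4, 1, 2, 3> matches: element 0 of the result
        // is element 0 of V2 and the remaining elements come from V1 in order,
        // which is the MOVSS/MOVSD/MOVD "set the lowest element" pattern.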
   4280 
   4281 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
   4282 /// as permutations between 128-bit chunks or halves. As an example: this
   4283 /// shuffle below:
   4284 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
   4285 /// The first half comes from the second half of V1 and the second half comes
   4286 /// from the second half of V2.
   4287 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   4288   if (!HasFp256 || !VT.is256BitVector())
   4289     return false;
   4290 
   4291   // The shuffle result is divided into half A and half B. In total the two
   4292   // sources have 4 halves, namely: C, D, E, F. The final values of A and
   4293   // B must come from C, D, E or F.
   4294   unsigned HalfSize = VT.getVectorNumElements()/2;
   4295   bool MatchA = false, MatchB = false;
   4296 
   4297   // Check if A comes from one of C, D, E, F.
   4298   for (unsigned Half = 0; Half != 4; ++Half) {
   4299     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
   4300       MatchA = true;
   4301       break;
   4302     }
   4303   }
   4304 
   4305   // Check if B comes from one of C, D, E, F.
   4306   for (unsigned Half = 0; Half != 4; ++Half) {
   4307     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
   4308       MatchB = true;
   4309       break;
   4310     }
   4311   }
   4312 
   4313   return MatchA && MatchB;
   4314 }
   4315 
   4316 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
   4317 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
   4318 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   4319   MVT VT = SVOp->getSimpleValueType(0);
   4320 
   4321   unsigned HalfSize = VT.getVectorNumElements()/2;
   4322 
   4323   unsigned FstHalf = 0, SndHalf = 0;
   4324   for (unsigned i = 0; i < HalfSize; ++i) {
   4325     if (SVOp->getMaskElt(i) > 0) {
   4326       FstHalf = SVOp->getMaskElt(i)/HalfSize;
   4327       break;
   4328     }
   4329   }
   4330   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
   4331     if (SVOp->getMaskElt(i) > 0) {
   4332       SndHalf = SVOp->getMaskElt(i)/HalfSize;
   4333       break;
   4334     }
   4335   }
   4336 
   4337   return (FstHalf | (SndHalf << 4));
   4338 }
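        // For example, for the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> HalfSize is
        // 4, so FstHalf = 4/4 = 1 and SndHalf = 12/4 = 3, giving the immediate
        // 1 | (3 << 4) = 0x31 (the result's low half is V1's upper 128 bits and
        // its high half is V2's upper 128 bits).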
   4339 
   4340 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
   4341 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
   4342   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   4343   if (EltSize < 32)
   4344     return false;
   4345 
   4346   unsigned NumElts = VT.getVectorNumElements();
   4347   Imm8 = 0;
   4348   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
   4349     for (unsigned i = 0; i != NumElts; ++i) {
   4350       if (Mask[i] < 0)
   4351         continue;
   4352       Imm8 |= Mask[i] << (i*2);
   4353     }
   4354     return true;
   4355   }
   4356 
   4357   unsigned LaneSize = 4;
   4358   SmallVector<int, 4> MaskVal(LaneSize, -1);
   4359 
   4360   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   4361     for (unsigned i = 0; i != LaneSize; ++i) {
   4362       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   4363         return false;
   4364       if (Mask[i+l] < 0)
   4365         continue;
   4366       if (MaskVal[i] < 0) {
   4367         MaskVal[i] = Mask[i+l] - l;
   4368         Imm8 |= MaskVal[i] << (i*2);
   4369         continue;
   4370       }
   4371       if (Mask[i+l] != (signed)(MaskVal[i]+l))
   4372         return false;
   4373     }
   4374   }
   4375   return true;
   4376 }
   4377 
   4378 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
   4379 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
   4380 /// Note that VPERMIL mask matching is different depending whether theunderlying
   4381 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
   4382 /// to the same elements of the low, but to the higher half of the source.
   4383 /// In VPERMILPD the two lanes could be shuffled independently of each other
   4384 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
   4385 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
   4386   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   4387   if (VT.getSizeInBits() < 256 || EltSize < 32)
   4388     return false;
   4389   bool symetricMaskRequired = (EltSize == 32);
   4390   unsigned NumElts = VT.getVectorNumElements();
   4391 
   4392   unsigned NumLanes = VT.getSizeInBits()/128;
   4393   unsigned LaneSize = NumElts/NumLanes;
   4394   // 2 or 4 elements in one lane
   4395 
   4396   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
   4397   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   4398     for (unsigned i = 0; i != LaneSize; ++i) {
   4399       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   4400         return false;
   4401       if (symetricMaskRequired) {
   4402         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
   4403           ExpectedMaskVal[i] = Mask[i+l] - l;
   4404           continue;
   4405         }
   4406         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
   4407           return false;
   4408       }
   4409     }
   4410   }
   4411   return true;
   4412 }
   4413 
   4414 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
   4415 /// x86 movss wants: the lowest element of the result must be the lowest element
   4416 /// of vector 2 and the other elements must come from vector 1 in order.
   4417 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
   4418                                bool V2IsSplat = false, bool V2IsUndef = false) {
   4419   if (!VT.is128BitVector())
   4420     return false;
   4421 
   4422   unsigned NumOps = VT.getVectorNumElements();
   4423   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
   4424     return false;
   4425 
   4426   if (!isUndefOrEqual(Mask[0], 0))
   4427     return false;
   4428 
   4429   for (unsigned i = 1; i != NumOps; ++i)
   4430     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
   4431           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
   4432           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
   4433       return false;
   4434 
   4435   return true;
   4436 }
   4437 
   4438 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4439 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
   4440 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
   4441 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
   4442                            const X86Subtarget *Subtarget) {
   4443   if (!Subtarget->hasSSE3())
   4444     return false;
   4445 
   4446   unsigned NumElems = VT.getVectorNumElements();
   4447 
   4448   if ((VT.is128BitVector() && NumElems != 4) ||
   4449       (VT.is256BitVector() && NumElems != 8) ||
   4450       (VT.is512BitVector() && NumElems != 16))
   4451     return false;
   4452 
   4453   // "i+1" is the value the indexed mask element must have
   4454   for (unsigned i = 0; i != NumElems; i += 2)
   4455     if (!isUndefOrEqual(Mask[i], i+1) ||
   4456         !isUndefOrEqual(Mask[i+1], i+1))
   4457       return false;
   4458 
   4459   return true;
   4460 }
   4461 
   4462 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4463 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
   4464 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
   4465 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
   4466                            const X86Subtarget *Subtarget) {
   4467   if (!Subtarget->hasSSE3())
   4468     return false;
   4469 
   4470   unsigned NumElems = VT.getVectorNumElements();
   4471 
   4472   if ((VT.is128BitVector() && NumElems != 4) ||
   4473       (VT.is256BitVector() && NumElems != 8) ||
   4474       (VT.is512BitVector() && NumElems != 16))
   4475     return false;
   4476 
   4477   // "i" is the value the indexed mask element must have
   4478   for (unsigned i = 0; i != NumElems; i += 2)
   4479     if (!isUndefOrEqual(Mask[i], i) ||
   4480         !isUndefOrEqual(Mask[i+1], i))
   4481       return false;
   4482 
   4483   return true;
   4484 }
   4485 
   4486 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
   4487 /// specifies a shuffle of elements that is suitable for input to 256-bit
   4488 /// version of MOVDDUP.
   4489 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   4490   if (!HasFp256 || !VT.is256BitVector())
   4491     return false;
   4492 
   4493   unsigned NumElts = VT.getVectorNumElements();
   4494   if (NumElts != 4)
   4495     return false;
   4496 
   4497   for (unsigned i = 0; i != NumElts/2; ++i)
   4498     if (!isUndefOrEqual(Mask[i], 0))
   4499       return false;
   4500   for (unsigned i = NumElts/2; i != NumElts; ++i)
   4501     if (!isUndefOrEqual(Mask[i], NumElts/2))
   4502       return false;
   4503   return true;
   4504 }
   4505 
   4506 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4507 /// specifies a shuffle of elements that is suitable for input to 128-bit
   4508 /// version of MOVDDUP.
   4509 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
   4510   if (!VT.is128BitVector())
   4511     return false;
   4512 
   4513   unsigned e = VT.getVectorNumElements() / 2;
   4514   for (unsigned i = 0; i != e; ++i)
   4515     if (!isUndefOrEqual(Mask[i], i))
   4516       return false;
   4517   for (unsigned i = 0; i != e; ++i)
   4518     if (!isUndefOrEqual(Mask[e+i], i))
   4519       return false;
   4520   return true;
   4521 }
   4522 
   4523 /// isVEXTRACTIndex - Return true if the specified
   4524 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
   4525 /// suitable for instructions that extract 128- or 256-bit vectors.
   4526 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
   4527   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4528   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4529     return false;
   4530 
   4531   // The index should be aligned on a vecWidth-bit boundary.
   4532   uint64_t Index =
   4533     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4534 
   4535   MVT VT = N->getSimpleValueType(0);
   4536   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4537   bool Result = (Index * ElSize) % vecWidth == 0;
   4538 
   4539   return Result;
   4540 }
   4541 
   4542 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
   4543 /// operand specifies a subvector insert that is suitable for 128- or 256-bit
   4544 /// subvector insertion.
   4545 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
   4546   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4547   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4548     return false;
   4549   // The index should be aligned on a vecWidth-bit boundary.
   4550   uint64_t Index =
   4551     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4552 
   4553   MVT VT = N->getSimpleValueType(0);
   4554   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4555   bool Result = (Index * ElSize) % vecWidth == 0;
   4556 
   4557   return Result;
   4558 }
   4559 
   4560 bool X86::isVINSERT128Index(SDNode *N) {
   4561   return isVINSERTIndex(N, 128);
   4562 }
   4563 
   4564 bool X86::isVINSERT256Index(SDNode *N) {
   4565   return isVINSERTIndex(N, 256);
   4566 }
   4567 
   4568 bool X86::isVEXTRACT128Index(SDNode *N) {
   4569   return isVEXTRACTIndex(N, 128);
   4570 }
   4571 
   4572 bool X86::isVEXTRACT256Index(SDNode *N) {
   4573   return isVEXTRACTIndex(N, 256);
   4574 }
   4575 
   4576 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
   4577 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
   4578 /// Handles 128-bit and 256-bit.
   4579 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   4580   MVT VT = N->getSimpleValueType(0);
   4581 
   4582   assert((VT.getSizeInBits() >= 128) &&
   4583          "Unsupported vector type for PSHUF/SHUFP");
   4584 
   4585   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
   4586   // independently on 128-bit lanes.
   4587   unsigned NumElts = VT.getVectorNumElements();
   4588   unsigned NumLanes = VT.getSizeInBits()/128;
   4589   unsigned NumLaneElts = NumElts/NumLanes;
   4590 
   4591   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
   4592          "Only supports 2, 4 or 8 elements per lane");
   4593 
   4594   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
   4595   unsigned Mask = 0;
   4596   for (unsigned i = 0; i != NumElts; ++i) {
   4597     int Elt = N->getMaskElt(i);
   4598     if (Elt < 0) continue;
   4599     Elt &= NumLaneElts - 1;
   4600     unsigned ShAmt = (i << Shift) % 8;
   4601     Mask |= Elt << ShAmt;
   4602   }
   4603 
   4604   return Mask;
   4605 }
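        // Worked example: for a v4i32 PSHUFD with mask <3, 1, 2, 0>, Shift is 1 and
        // the immediate is 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27. For 256-bit
        // types the second 128-bit lane ORs in the same bit positions, so a
        // lane-symmetric mask produces the same 8-bit value.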
   4606 
   4607 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
   4608 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
   4609 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
   4610   MVT VT = N->getSimpleValueType(0);
   4611 
   4612   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
   4613          "Unsupported vector type for PSHUFHW");
   4614 
   4615   unsigned NumElts = VT.getVectorNumElements();
   4616 
   4617   unsigned Mask = 0;
   4618   for (unsigned l = 0; l != NumElts; l += 8) {
   4619     // 8 elements per lane, but we only care about the last 4.
   4620     for (unsigned i = 0; i < 4; ++i) {
   4621       int Elt = N->getMaskElt(l+i+4);
   4622       if (Elt < 0) continue;
   4623       Elt &= 0x3; // only 2-bits.
   4624       Mask |= Elt << (i * 2);
   4625     }
   4626   }
   4627 
   4628   return Mask;
   4629 }
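        // Worked example: for a v8i16 PSHUFHW with mask <0, 1, 2, 3, 7, 6, 5, 4>,
        // only elements 4..7 contribute, giving 3 | (2 << 2) | (1 << 4) | (0 << 6)
        // = 0x1B.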
   4630 
   4631 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
   4632 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
   4633 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
   4634   MVT VT = N->getSimpleValueType(0);
   4635 
   4636   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
   4637          "Unsupported vector type for PSHUFLW");
   4638 
   4639   unsigned NumElts = VT.getVectorNumElements();
   4640 
   4641   unsigned Mask = 0;
   4642   for (unsigned l = 0; l != NumElts; l += 8) {
   4643     // 8 elements per lane, but we only care about the first 4.
   4644     for (unsigned i = 0; i < 4; ++i) {
   4645       int Elt = N->getMaskElt(l+i);
   4646       if (Elt < 0) continue;
   4647       Elt &= 0x3; // only 2-bits
   4648       Mask |= Elt << (i * 2);
   4649     }
   4650   }
   4651 
   4652   return Mask;
   4653 }
   4654 
   4655 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
   4656 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
   4657 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   4658   MVT VT = SVOp->getSimpleValueType(0);
   4659   unsigned EltSize = VT.is512BitVector() ? 1 :
   4660     VT.getVectorElementType().getSizeInBits() >> 3;
   4661 
   4662   unsigned NumElts = VT.getVectorNumElements();
   4663   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   4664   unsigned NumLaneElts = NumElts/NumLanes;
   4665 
   4666   int Val = 0;
   4667   unsigned i;
   4668   for (i = 0; i != NumElts; ++i) {
   4669     Val = SVOp->getMaskElt(i);
   4670     if (Val >= 0)
   4671       break;
   4672   }
   4673   if (Val >= (int)NumElts)
   4674     Val -= NumElts - NumLaneElts;
   4675 
   4676   assert(Val - i > 0 && "PALIGNR imm should be positive");
   4677   return (Val - i) * EltSize;
   4678 }
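        // For example, for a v16i8 shuffle whose first defined mask element is 5 at
        // position 0 (a byte rotation of the two sources by 5), EltSize is 1 and
        // the returned immediate is (5 - 0) * 1 = 5.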
   4679 
   4680 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   4681   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4682   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4683     llvm_unreachable("Illegal extract subvector for VEXTRACT");
   4684 
   4685   uint64_t Index =
   4686     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4687 
   4688   MVT VecVT = N->getOperand(0).getSimpleValueType();
   4689   MVT ElVT = VecVT.getVectorElementType();
   4690 
   4691   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4692   return Index / NumElemsPerChunk;
   4693 }
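        // For example, extracting a 128-bit subvector that starts at element 4 of a
        // v8f32 source gives NumElemsPerChunk = 128 / 32 = 4 and an immediate of
        // 4 / 4 = 1, i.e. the upper 128-bit half.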
   4694 
   4695 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
   4696   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4697   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4698     llvm_unreachable("Illegal insert subvector for VINSERT");
   4699 
   4700   uint64_t Index =
   4701     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4702 
   4703   MVT VecVT = N->getSimpleValueType(0);
   4704   MVT ElVT = VecVT.getVectorElementType();
   4705 
   4706   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4707   return Index / NumElemsPerChunk;
   4708 }
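// Worked example (editor's illustration; the type and index are hypothetical):
// inserting a 128-bit subvector at element index 4 of a v8i32 result gives
// NumElemsPerChunk = 128 / 32 = 4 and an immediate of 4 / 4 = 1, i.e. the
// upper 128-bit position.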
   4709 
   4710 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
   4711 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
   4712 /// and VEXTRACTI128 instructions.
   4713 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
   4714   return getExtractVEXTRACTImmediate(N, 128);
   4715 }
   4716 
   4717 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
   4718 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
   4719 /// and VEXTRACTI64x4 instructions.
   4720 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
   4721   return getExtractVEXTRACTImmediate(N, 256);
   4722 }
   4723 
   4724 /// getInsertVINSERT128Immediate - Return the appropriate immediate
   4725 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
   4726 /// and VINSERTI128 instructions.
   4727 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
   4728   return getInsertVINSERTImmediate(N, 128);
   4729 }
   4730 
   4731 /// getInsertVINSERT256Immediate - Return the appropriate immediate
   4732 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
   4733 /// and VINSERTI64x4 instructions.
   4734 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   4735   return getInsertVINSERTImmediate(N, 256);
   4736 }
   4737 
   4738 /// isZero - Returns true if V is a constant integer zero.
   4739 static bool isZero(SDValue V) {
   4740   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   4741   return C && C->isNullValue();
   4742 }
   4743 
   4744 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
   4745 /// constant +0.0.
   4746 bool X86::isZeroNode(SDValue Elt) {
   4747   if (isZero(Elt))
   4748     return true;
   4749   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
   4750     return CFP->getValueAPF().isPosZero();
   4751   return false;
   4752 }
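// Editor's note with illustrative cases (hypothetical constants): an i32
// constant 0 and an f32 constant +0.0 both report true here, while an f32
// constant -0.0 reports false, since its bit pattern is not all zeros.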
   4753 
   4754 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
   4755 /// their permute mask.
   4756 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
   4757                                     SelectionDAG &DAG) {
   4758   MVT VT = SVOp->getSimpleValueType(0);
   4759   unsigned NumElems = VT.getVectorNumElements();
   4760   SmallVector<int, 8> MaskVec;
   4761 
   4762   for (unsigned i = 0; i != NumElems; ++i) {
   4763     int Idx = SVOp->getMaskElt(i);
   4764     if (Idx >= 0) {
   4765       if (Idx < (int)NumElems)
   4766         Idx += NumElems;
   4767       else
   4768         Idx -= NumElems;
   4769     }
   4770     MaskVec.push_back(Idx);
   4771   }
   4772   return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
   4773                               SVOp->getOperand(0), &MaskVec[0]);
   4774 }
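// Worked example (editor's illustration; operands and mask are hypothetical):
// a v4i32 shuffle of (A, B) with mask <0,5,2,7> becomes a shuffle of (B, A)
// with mask <4,1,6,3>; undef entries (negative mask values) are left as-is.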
   4775 
   4776 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
   4777 /// match movhlps. The lower half elements should come from the upper half of
   4778 /// V1 (and in order), and the upper half elements should come from the upper
   4779 /// half of V2 (and in order).
   4780 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
   4781   if (!VT.is128BitVector())
   4782     return false;
   4783   if (VT.getVectorNumElements() != 4)
   4784     return false;
   4785   for (unsigned i = 0, e = 2; i != e; ++i)
   4786     if (!isUndefOrEqual(Mask[i], i+2))
   4787       return false;
   4788   for (unsigned i = 2; i != 4; ++i)
   4789     if (!isUndefOrEqual(Mask[i], i+4))
   4790       return false;
   4791   return true;
   4792 }
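// Worked example (editor's illustration; the mask is hypothetical): for
// v4f32, the mask <2,3,6,7> (or <2,u,6,7> with an undef entry) satisfies the
// checks above: elements 2,3 of V1 feed the low half of the result and
// elements 2,3 of V2 feed the high half, which is what MOVHLPS produces.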
   4793 
   4794 /// isScalarLoadToVector - Returns true if the node is a scalar load that
   4795 /// is promoted to a vector. It also returns the LoadSDNode by reference if
   4796 /// required.
   4797 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
   4798   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
   4799     return false;
   4800   N = N->getOperand(0).getNode();
   4801   if (!ISD::isNON_EXTLoad(N))
   4802     return false;
   4803   if (LD)
   4804     *LD = cast<LoadSDNode>(N);
   4805   return true;
   4806 }
   4807 
   4808 // Test whether the given value is a vector value which will be legalized
   4809 // into a load.
   4810 static bool WillBeConstantPoolLoad(SDNode *N) {
   4811   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4812     return false;
   4813 
   4814   // Check for any non-constant elements.
   4815   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
   4816     switch (N->getOperand(i).getNode()->getOpcode()) {
   4817     case ISD::UNDEF:
   4818     case ISD::ConstantFP:
   4819     case ISD::Constant:
   4820       break;
   4821     default:
   4822       return false;
   4823     }
   4824 
   4825   // Vectors of all-zeros and all-ones are materialized with special
   4826   // instructions rather than being loaded.
   4827   return !ISD::isBuildVectorAllZeros(N) &&
   4828          !ISD::isBuildVectorAllOnes(N);
   4829 }
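// Editor's note with illustrative cases (hypothetical vectors): a
// BUILD_VECTOR such as <1.0, 2.0, 3.0, 4.0> returns true here because it is
// expected to be lowered to a constant-pool load, while all-zeros and
// all-ones vectors return false since they are synthesized with register
// idioms (e.g. pxor / pcmpeqd) rather than loaded.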
   4830 
   4831 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
   4832 /// match movlp{s|d}. The lower half elements should come from the lower half of
   4833 /// V1 (and in order), and the upper half elements should come from the upper
   4834 /// half of V2 (and in order). And since V1 will become the source of the
   4835 /// MOVLP, it must be either a vector load or a scalar load to vector.
   4836 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
   4837                                ArrayRef<int> Mask, MVT VT) {
   4838   if (!VT.is128BitVector())
   4839     return false;
   4840 
   4841   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
   4842     return false;
   4843   // If V2 is a vector load, don't do this transformation. We will try to use
   4844   // a shufps with the load folded into its memory operand instead.
   4845   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
   4846     return false;
   4847 
   4848   unsigned NumElems = VT.getVectorNumElements();
   4849 
   4850   if (NumElems != 2 && NumElems != 4)
   4851     return false;
   4852   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   4853     if (!isUndefOrEqual(Mask[i], i))
   4854       return false;
   4855   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
   4856     if (!isUndefOrEqual(Mask[i], i+NumElems))
   4857       return false;
   4858   return true;
   4859 }
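// Worked example (editor's illustration; operands and mask are hypothetical):
// for v4f32 with V1 a plain (non-extending) vector load and V2 not a load,
// the mask <0,1,6,7> passes the checks above: the low half comes from the low
// half of V1 and the high half from the high half of V2, matching MOVLPS with
// the load folded into its memory operand.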
   4860 
   4861 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
   4862 /// to a zero vector.
   4863 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
   4864 static bool isZeroShuffle(