      1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "X86ISelLowering.h"
     16 #include "Utils/X86ShuffleDecode.h"
     17 #include "X86CallingConv.h"
     18 #include "X86InstrBuilder.h"
     19 #include "X86MachineFunctionInfo.h"
     20 #include "X86TargetMachine.h"
     21 #include "X86TargetObjectFile.h"
     22 #include "llvm/ADT/SmallSet.h"
     23 #include "llvm/ADT/Statistic.h"
     24 #include "llvm/ADT/StringExtras.h"
     25 #include "llvm/ADT/StringSwitch.h"
     26 #include "llvm/ADT/VariadicFunction.h"
     27 #include "llvm/CodeGen/IntrinsicLowering.h"
     28 #include "llvm/CodeGen/MachineFrameInfo.h"
     29 #include "llvm/CodeGen/MachineFunction.h"
     30 #include "llvm/CodeGen/MachineInstrBuilder.h"
     31 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     32 #include "llvm/CodeGen/MachineModuleInfo.h"
     33 #include "llvm/CodeGen/MachineRegisterInfo.h"
     34 #include "llvm/IR/CallSite.h"
     35 #include "llvm/IR/CallingConv.h"
     36 #include "llvm/IR/Constants.h"
     37 #include "llvm/IR/DerivedTypes.h"
     38 #include "llvm/IR/Function.h"
     39 #include "llvm/IR/GlobalAlias.h"
     40 #include "llvm/IR/GlobalVariable.h"
     41 #include "llvm/IR/Instructions.h"
     42 #include "llvm/IR/Intrinsics.h"
     43 #include "llvm/MC/MCAsmInfo.h"
     44 #include "llvm/MC/MCContext.h"
     45 #include "llvm/MC/MCExpr.h"
     46 #include "llvm/MC/MCSymbol.h"
     47 #include "llvm/Support/CommandLine.h"
     48 #include "llvm/Support/Debug.h"
     49 #include "llvm/Support/ErrorHandling.h"
     50 #include "llvm/Support/MathExtras.h"
     51 #include "llvm/Target/TargetOptions.h"
     52 #include <bitset>
     53 #include <numeric>
     54 #include <cctype>
     55 using namespace llvm;
     56 
     57 #define DEBUG_TYPE "x86-isel"
     58 
     59 STATISTIC(NumTailCalls, "Number of tail calls");
     60 
     61 static cl::opt<bool> ExperimentalVectorWideningLegalization(
     62     "x86-experimental-vector-widening-legalization", cl::init(false),
     63     cl::desc("Enable an experimental vector type legalization through widening "
     64              "rather than promotion."),
     65     cl::Hidden);
     66 
     67 static cl::opt<bool> ExperimentalVectorShuffleLowering(
     68     "x86-experimental-vector-shuffle-lowering", cl::init(false),
     69     cl::desc("Enable an experimental vector shuffle lowering code path."),
     70     cl::Hidden);
     71 
     72 // Forward declarations.
     73 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
     74                        SDValue V2);
     75 
     76 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
     77                                 SelectionDAG &DAG, SDLoc dl,
     78                                 unsigned vectorWidth) {
     79   assert((vectorWidth == 128 || vectorWidth == 256) &&
     80          "Unsupported vector width");
     81   EVT VT = Vec.getValueType();
     82   EVT ElVT = VT.getVectorElementType();
     83   unsigned Factor = VT.getSizeInBits()/vectorWidth;
     84   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
     85                                   VT.getVectorNumElements()/Factor);
     86 
     87   // Extract from UNDEF is UNDEF.
     88   if (Vec.getOpcode() == ISD::UNDEF)
     89     return DAG.getUNDEF(ResultVT);
     90 
     91   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
     92   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
     93 
     94   // This is the index of the first element of the vectorWidth-bit chunk
     95   // we want.
     96   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
     97                                * ElemsPerChunk);
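  // A worked example (editor's illustration, values chosen arbitrarily):
  // extracting element 5 of a 256-bit v8i32 with vectorWidth == 128 gives
  // ElemsPerChunk = 128/32 = 4 and NormalizedIdxVal = ((5*32)/128)*4 = 4,
  // i.e. the returned v4i32 chunk starts at element 4 (the upper half).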
     98 
     99   // If the input is a buildvector, just emit a smaller one.
    100   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    101     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
    102                        makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
    103                                     ElemsPerChunk));
    104 
    105   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
    106   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
    107                                VecIdx);
    108 
    109   return Result;
    110 
    111 }
    112 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
    113 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
    114 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
    115 /// instructions or a simple subregister reference. Idx is an index in the
    116 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
    117 /// lowering EXTRACT_VECTOR_ELT operations easier.
    118 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
    119                                    SelectionDAG &DAG, SDLoc dl) {
    120   assert((Vec.getValueType().is256BitVector() ||
    121           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
    122   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
    123 }
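
// Editor's usage sketch (not from the original file; the variable name V8f32
// is hypothetical): pulling the upper 128-bit lane out of a 256-bit vector,
//
//   SDValue Upper = Extract128BitVector(V8f32, 4, DAG, dl);
//
// produces an EXTRACT_SUBVECTOR that instruction selection can match to
// "vextractf128 $1, %ymm, %xmm", or to a plain subregister copy when the
// index falls in the low lane.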
    124 
    125 /// Generate a DAG to grab 256-bits from a 512-bit vector.
    126 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
    127                                    SelectionDAG &DAG, SDLoc dl) {
    128   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
    129   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
    130 }
    131 
    132 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
    133                                unsigned IdxVal, SelectionDAG &DAG,
    134                                SDLoc dl, unsigned vectorWidth) {
    135   assert((vectorWidth == 128 || vectorWidth == 256) &&
    136          "Unsupported vector width");
    137   // Inserting UNDEF leaves Result unchanged.
    138   if (Vec.getOpcode() == ISD::UNDEF)
    139     return Result;
    140   EVT VT = Vec.getValueType();
    141   EVT ElVT = VT.getVectorElementType();
    142   EVT ResultVT = Result.getValueType();
    143 
    144   // Insert the relevant vectorWidth bits.
    145   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
    146 
    147   // This is the index of the first element of the vectorWidth-bit chunk
    148   // we want.
    149   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
    150                                * ElemsPerChunk);
    151 
    152   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
    153   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
    154                      VecIdx);
    155 }
    156 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
    157 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
    158 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
    159 /// simple superregister reference.  Idx is an index in the 128 bits
    160 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
    161 /// lowering INSERT_VECTOR_ELT operations easier.
    162 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
    163                                   unsigned IdxVal, SelectionDAG &DAG,
    164                                   SDLoc dl) {
    165   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
    166   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
    167 }
    168 
    169 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
    170                                   unsigned IdxVal, SelectionDAG &DAG,
    171                                   SDLoc dl) {
    172   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
    173   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
    174 }
    175 
    176 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
    177 /// instructions. This is used because creating CONCAT_VECTORS nodes of
    178 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
    179 /// large BUILD_VECTORS.
    180 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
    181                                    unsigned NumElems, SelectionDAG &DAG,
    182                                    SDLoc dl) {
    183   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
    184   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
    185 }
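
// Editor's illustrative helper, not part of the original lowering code: it
// shows how the extract/insert helpers above compose.  The name is
// hypothetical and the function is deliberately unused (LLVM_ATTRIBUTE_UNUSED
// comes from Support/Compiler.h, which the headers above pull in
// transitively).  Vec is assumed to be a 256-bit vector.
LLVM_ATTRIBUTE_UNUSED
static SDValue SplitAndConcat256BitVector(SDValue Vec, SelectionDAG &DAG,
                                          SDLoc dl) {
  EVT VT = Vec.getValueType();                    // e.g. MVT::v8i32
  unsigned NumElems = VT.getVectorNumElements();
  SDValue Lo = Extract128BitVector(Vec, 0, DAG, dl);            // low lane
  SDValue Hi = Extract128BitVector(Vec, NumElems / 2, DAG, dl); // high lane
  return Concat128BitVectors(Lo, Hi, VT, NumElems, DAG, dl);
}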
    186 
    187 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
    188                                    unsigned NumElems, SelectionDAG &DAG,
    189                                    SDLoc dl) {
    190   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
    191   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
    192 }
    193 
    194 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
    195   if (TT.isOSBinFormatMachO()) {
    196     if (TT.getArch() == Triple::x86_64)
    197       return new X86_64MachoTargetObjectFile();
    198     return new TargetLoweringObjectFileMachO();
    199   }
    200 
    201   if (TT.isOSLinux())
    202     return new X86LinuxTargetObjectFile();
    203   if (TT.isOSBinFormatELF())
    204     return new TargetLoweringObjectFileELF();
    205   if (TT.isKnownWindowsMSVCEnvironment())
    206     return new X86WindowsTargetObjectFile();
    207   if (TT.isOSBinFormatCOFF())
    208     return new TargetLoweringObjectFileCOFF();
    209   llvm_unreachable("unknown subtarget type");
    210 }
    211 
    212 // FIXME: This should stop caching the target machine as soon as
    213 // we can remove resetOperationActions et al.
    214 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    215   : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
    216   Subtarget = &TM.getSubtarget<X86Subtarget>();
    217   X86ScalarSSEf64 = Subtarget->hasSSE2();
    218   X86ScalarSSEf32 = Subtarget->hasSSE1();
    219   TD = getDataLayout();
    220 
    221   resetOperationActions();
    222 }
    223 
    224 void X86TargetLowering::resetOperationActions() {
    225   const TargetMachine &TM = getTargetMachine();
    226   static bool FirstTimeThrough = true;
    227 
    228   // If none of the target options have changed, then we don't need to reset the
    229   // operation actions.
    230   if (!FirstTimeThrough && TO == TM.Options) return;
    231 
    232   if (!FirstTimeThrough) {
    233     // Reinitialize the actions.
    234     initActions();
    235     FirstTimeThrough = false;
    236   }
    237 
    238   TO = TM.Options;
    239 
    240   // Set up the TargetLowering object.
    241   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
    242 
    243   // X86 is weird: it always uses i8 for shift amounts and setcc results.
    244   setBooleanContents(ZeroOrOneBooleanContent);
    245   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    246   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
    247 
    248   // For 64-bit, since we have so many registers, use the ILP scheduler; for
    249   // 32-bit code, use register-pressure-specific scheduling.
    250   // For Atom, always use ILP scheduling.
    251   if (Subtarget->isAtom())
    252     setSchedulingPreference(Sched::ILP);
    253   else if (Subtarget->is64Bit())
    254     setSchedulingPreference(Sched::ILP);
    255   else
    256     setSchedulingPreference(Sched::RegPressure);
    257   const X86RegisterInfo *RegInfo =
    258     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
    259   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
    260 
    261   // Bypass expensive divides on Atom when compiling with O2
    262   if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
    263     addBypassSlowDiv(32, 8);
    264     if (Subtarget->is64Bit())
    265       addBypassSlowDiv(64, 16);
    266   }
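
  // A rough sketch of what the bypass means (editor's illustration; the IR
  // below is schematic, not what the pass literally emits):
  //
  //   %q = udiv i32 %a, %b
  //     =>
  //   if ((%a | %b) >> 8 == 0)                        ; operands fit in 8 bits
  //     %q = zext i8 (udiv i8 (trunc %a), (trunc %b)) ; fast narrow divide
  //   else
  //     %q = udiv i32 %a, %b                          ; original slow divide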
    267 
    268   if (Subtarget->isTargetKnownWindowsMSVC()) {
    269     // Set up Windows compiler runtime calls.
    270     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    271     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    272     setLibcallName(RTLIB::SREM_I64, "_allrem");
    273     setLibcallName(RTLIB::UREM_I64, "_aullrem");
    274     setLibcallName(RTLIB::MUL_I64, "_allmul");
    275     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    276     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    277     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    278     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    279     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    280 
    281     // The _ftol2 runtime function has an unusual calling conv, which
    282     // is modeled by a special pseudo-instruction.
    283     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
    284     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
    285     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
    286     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
    287   }
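
  // For example (editor's note): with the mappings above, a 64-bit signed
  // division in 32-bit MSVC-targeted code is emitted as a call to the CRT
  // helper _alldiv using the stdcall convention, rather than being expanded
  // inline.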
    288 
    289   if (Subtarget->isTargetDarwin()) {
    290     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    291     setUseUnderscoreSetJmp(false);
    292     setUseUnderscoreLongJmp(false);
    293   } else if (Subtarget->isTargetWindowsGNU()) {
    294     // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
    295     setUseUnderscoreSetJmp(true);
    296     setUseUnderscoreLongJmp(false);
    297   } else {
    298     setUseUnderscoreSetJmp(true);
    299     setUseUnderscoreLongJmp(true);
    300   }
    301 
    302   // Set up the register classes.
    303   addRegisterClass(MVT::i8, &X86::GR8RegClass);
    304   addRegisterClass(MVT::i16, &X86::GR16RegClass);
    305   addRegisterClass(MVT::i32, &X86::GR32RegClass);
    306   if (Subtarget->is64Bit())
    307     addRegisterClass(MVT::i64, &X86::GR64RegClass);
    308 
    309   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
    310 
    311   // We don't accept any truncstore of integer registers.
    312   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    313   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    314   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    315   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    316   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
    317   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
    318 
    319   // SETOEQ and SETUNE require checking two conditions.
    320   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
    321   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
    322   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
    323   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
    324   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
    325   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
    326 
    327   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    328   // operation.
    329   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
    330   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
    331   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
    332 
    333   if (Subtarget->is64Bit()) {
    334     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    335     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    336   } else if (!TM.Options.UseSoftFloat) {
    337     // We have an algorithm for SSE2->double, and we turn this into a
    338     // 64-bit FILD followed by conditional FADD for other targets.
    339     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    340     // We have an algorithm for SSE2, and we turn this into a 64-bit
    341     // FILD for other targets.
    342     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    343   }
    344 
    345   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    346   // this operation.
    347   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    348   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
    349 
    350   if (!TM.Options.UseSoftFloat) {
    351     // SSE has no i16 to fp conversion, only i32
    352     if (X86ScalarSSEf32) {
    353       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    354       // f32 and f64 cases are Legal, f80 case is not
    355       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    356     } else {
    357       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    358       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    359     }
    360   } else {
    361     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    362     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
    363   }
    364 
    365   // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
    366   // are Legal, f80 is custom lowered.
    367   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    368   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    369 
    370   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
    371   // this operation.
    372   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
    373   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
    374 
    375   if (X86ScalarSSEf32) {
    376     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    377     // f32 and f64 cases are Legal, f80 case is not
    378     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    379   } else {
    380     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    381     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    382   }
    383 
    384   // Handle FP_TO_UINT by promoting the destination to a larger signed
    385   // conversion.
    386   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
    387   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
    388   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
    389 
    390   if (Subtarget->is64Bit()) {
    391     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    392     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
    393   } else if (!TM.Options.UseSoftFloat) {
    394     // Since AVX is a superset of SSE3, only check for SSE here.
    395     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
    396       // Expand FP_TO_UINT into a select.
    397       // FIXME: We would like to use a Custom expander here eventually to do
    398       // the optimal thing for SSE vs. the default expansion in the legalizer.
    399       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    400     else
    401       // With SSE3 we can use fisttpll to convert to a signed i64; without
    402       // SSE, we're stuck with a fistpll.
    403       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    404   }
    405 
    406   if (isTargetFTOL()) {
    407     // Use the _ftol2 runtime function, which has a pseudo-instruction
    408     // to handle its weird calling convention.
    409     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
    410   }
    411 
    412   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
    413   if (!X86ScalarSSEf64) {
    414     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    415     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    416     if (Subtarget->is64Bit()) {
    417       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
    418       // Without SSE, i64->f64 goes through memory.
    419       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    420     }
    421   }
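
  // Concretely (editor's sketch of what Expand does here): without SSE there
  // is no register-to-register path between an FP value and a same-sized
  // integer (e.g. i32 <-> f32), so the bitcast is expanded into a store to a
  // stack temporary followed by a reload in the other register class.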
    422 
    423   // Scalar integer divide and remainder are lowered to use operations that
    424   // produce two results, to match the available instructions. This exposes
    425   // the two-result form to trivial CSE, which is able to combine x/y and x%y
    426   // into a single instruction.
    427   //
    428   // Scalar integer multiply-high is also lowered to use two-result
    429   // operations, to match the available instructions. However, plain multiply
    430   // (low) operations are left as Legal, as there are single-result
    431   // instructions for this in x86. Using the two-result multiply instructions
    432   // when both high and low results are needed must be arranged by dagcombine.
    433   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    434     MVT VT = IntVTs[i];
    435     setOperationAction(ISD::MULHS, VT, Expand);
    436     setOperationAction(ISD::MULHU, VT, Expand);
    437     setOperationAction(ISD::SDIV, VT, Expand);
    438     setOperationAction(ISD::UDIV, VT, Expand);
    439     setOperationAction(ISD::SREM, VT, Expand);
    440     setOperationAction(ISD::UREM, VT, Expand);
    441 
    442     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    443     setOperationAction(ISD::ADDC, VT, Custom);
    444     setOperationAction(ISD::ADDE, VT, Custom);
    445     setOperationAction(ISD::SUBC, VT, Custom);
    446     setOperationAction(ISD::SUBE, VT, Custom);
    447   }
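
  // Editor's illustration of the two-result point above: for source such as
  //
  //   int q = x / y;
  //   int r = x % y;
  //
  // expanding SDIV and SREM lets CSE merge both into one ISD::SDIVREM node,
  // which matches a single idiv that leaves the quotient in EAX and the
  // remainder in EDX.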
    448 
    449   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
    450   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
    451   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
    452   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
    453   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
    454   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
    455   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
    456   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
    457   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
    458   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
    459   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
    460   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
    461   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
    462   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
    463   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
    464   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
    465   if (Subtarget->is64Bit())
    466     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    467   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
    468   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
    469   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
    470   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
    471   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
    472   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
    473   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    474   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
    475 
    476   // Promote the i8 variants and force them up to i32, which has a shorter
    477   // encoding.
    478   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
    479   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
    480   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
    481   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
    482   if (Subtarget->hasBMI()) {
    483     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    484     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    485     if (Subtarget->is64Bit())
    486       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
    487   } else {
    488     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    489     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    490     if (Subtarget->is64Bit())
    491       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    492   }
    493 
    494   if (Subtarget->hasLZCNT()) {
    495     // When promoting the i8 variants, force them to i32 for a shorter
    496     // encoding.
    497     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    498     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    499     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    500     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    501     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    502     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    503     if (Subtarget->is64Bit())
    504       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
    505   } else {
    506     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    507     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    508     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    509     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    510     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    511     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    512     if (Subtarget->is64Bit()) {
    513       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
    514       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    515     }
    516   }
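
  // Editor's note on what promoting the i8 variants to i32 amounts to
  // semantically (a sketch, not the legalizer's literal output): for ctlz,
  // the count is taken on the zero-extended value and the 24 extra leading
  // zeros are subtracted off, i.e. ctlz.i8(x) == ctlz.i32(zext x) - 24,
  // which holds even for x == 0 (32 - 24 == 8).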
    517 
    518   // Special handling for half-precision floating point conversions.
    519   // If we don't have F16C support, then lower half float conversions
    520   // into library calls.
    521   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
    522     setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
    523     setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
    524   }
    525 
    526   if (Subtarget->hasPOPCNT()) {
    527     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    528   } else {
    529     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    530     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    531     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    532     if (Subtarget->is64Bit())
    533       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    534   }
    535 
    536   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    537 
    538   if (!Subtarget->hasMOVBE())
    539     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
    540 
    541   // These should be promoted to a larger select which is supported.
    542   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
    543   // X86 wants to expand cmov itself.
    544   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
    545   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
    546   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
    547   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
    548   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
    549   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
    550   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
    551   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
    552   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
    553   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
    554   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
    555   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
    556   if (Subtarget->is64Bit()) {
    557     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    558     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
    559   }
    560   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
    561   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here are NOT intended to support
    562   // SjLj exception handling, but rather to provide a light-weight
    563   // setjmp/longjmp replacement for continuations, user-level threading, etc.
    564   // As a result, no other SjLj exception interfaces are implemented, so
    565   // please don't build your own exception handling on top of them.
    566   // LLVM/Clang supports zero-cost DWARF exception handling.
    567   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
    568   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
    569 
    570   // Darwin ABI issue.
    571   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
    572   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
    573   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
    574   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
    575   if (Subtarget->is64Bit())
    576     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
    577   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
    578   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
    579   if (Subtarget->is64Bit()) {
    580     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    581     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    582     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    583     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    584     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
    585   }
    586   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
    587   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
    588   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
    589   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
    590   if (Subtarget->is64Bit()) {
    591     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    592     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    593     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
    594   }
    595 
    596   if (Subtarget->hasSSE1())
    597     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
    598 
    599   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
    600 
    601   // Expand certain atomics
    602   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    603     MVT VT = IntVTs[i];
    604     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    605     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    606     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    607   }
    608 
    609   if (Subtarget->hasCmpxchg16b()) {
    610     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
    611   }
    612 
    613   // FIXME - use subtarget debug flags
    614   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
    615       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    616     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
    617   }
    618 
    619   if (Subtarget->is64Bit()) {
    620     setExceptionPointerRegister(X86::RAX);
    621     setExceptionSelectorRegister(X86::RDX);
    622   } else {
    623     setExceptionPointerRegister(X86::EAX);
    624     setExceptionSelectorRegister(X86::EDX);
    625   }
    626   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
    627   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
    628 
    629   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
    630   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
    631 
    632   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    633   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
    634 
    635   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    636   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    637   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    638   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    639     // TargetInfo::X86_64ABIBuiltinVaList
    640     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    641     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
    642   } else {
    643     // TargetInfo::CharPtrBuiltinVaList
    644     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    645     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
    646   }
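
  // For reference (editor's sketch of the two va_list flavours named above,
  // not code from this file): the SysV x86-64 va_list that makes VAARG and
  // VACOPY worth custom lowering is the four-field record
  //
  //   struct __va_list_tag {
  //     unsigned gp_offset;       // next unused GP register slot
  //     unsigned fp_offset;       // next unused XMM register slot
  //     void *overflow_arg_area;  // stack-passed arguments
  //     void *reg_save_area;      // spilled register arguments
  //   };
  //
  // whereas Win64 and 32-bit x86 use a plain char* cursor, so the generic
  // Expand handling suffices there.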
    647 
    648   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    649   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
    650 
    651   setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
    652                      MVT::i64 : MVT::i32, Custom);
    653 
    654   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    655     // f32 and f64 use SSE.
    656     // Set up the FP register classes.
    657     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    658     addRegisterClass(MVT::f64, &X86::FR64RegClass);
    659 
    660     // Use ANDPD to simulate FABS.
    661     setOperationAction(ISD::FABS , MVT::f64, Custom);
    662     setOperationAction(ISD::FABS , MVT::f32, Custom);
    663 
    664     // Use XORP to simulate FNEG.
    665     setOperationAction(ISD::FNEG , MVT::f64, Custom);
    666     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    667 
    668     // Use ANDPD and ORPD to simulate FCOPYSIGN.
    669     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    670     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    671 
    672     // Lower this to FGETSIGNx86 plus an AND.
    673     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    674     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    675 
    676     // We don't support sin/cos/fmod
    677     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    678     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    679     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    680     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    681     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    682     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    683 
    684     // Expand FP immediates into loads from the stack, except for the special
    685     // cases we handle.
    686     addLegalFPImmediate(APFloat(+0.0)); // xorpd
    687     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    688   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    689     // Use SSE for f32, x87 for f64.
    690     // Set up the FP register classes.
    691     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    692     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    693 
    694     // Use ANDPS to simulate FABS.
    695     setOperationAction(ISD::FABS , MVT::f32, Custom);
    696 
    697     // Use XORP to simulate FNEG.
    698     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    699 
    700     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    701 
    702     // Use ANDPS and ORPS to simulate FCOPYSIGN.
    703     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    704     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    705 
    706     // We don't support sin/cos/fmod
    707     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    708     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    709     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    710 
    711     // Special cases we handle for FP constants.
    712     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    713     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    714     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    715     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    716     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    717 
    718     if (!TM.Options.UnsafeFPMath) {
    719       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    720       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    721       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    722     }
    723   } else if (!TM.Options.UseSoftFloat) {
    724     // f32 and f64 in x87.
    725     // Set up the FP register classes.
    726     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    727     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
    728 
    729     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    730     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    731     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    732     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    733 
    734     if (!TM.Options.UnsafeFPMath) {
    735       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    736       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    737       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    738       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    739       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    740       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    741     }
    742     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    743     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    744     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    745     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    746     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    747     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    748     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    749     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    750   }
    751 
    752   // We don't support FMA.
    753   setOperationAction(ISD::FMA, MVT::f64, Expand);
    754   setOperationAction(ISD::FMA, MVT::f32, Expand);
    755 
    756   // Long double always uses X87.
    757   if (!TM.Options.UseSoftFloat) {
    758     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    759     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    760     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    761     {
    762       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    763       addLegalFPImmediate(TmpFlt);  // FLD0
    764       TmpFlt.changeSign();
    765       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
    766 
    767       bool ignored;
    768       APFloat TmpFlt2(+1.0);
    769       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
    770                       &ignored);
    771       addLegalFPImmediate(TmpFlt2);  // FLD1
    772       TmpFlt2.changeSign();
    773       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    774     }
    775 
    776     if (!TM.Options.UnsafeFPMath) {
    777       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
    778       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
    779       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    780     }
    781 
    782     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    783     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    784     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    785     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    786     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    787     setOperationAction(ISD::FMA, MVT::f80, Expand);
    788   }
    789 
    790   // Always use a library call for pow.
    791   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
    792   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    793   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
    794 
    795   setOperationAction(ISD::FLOG, MVT::f80, Expand);
    796   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
    797   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    798   setOperationAction(ISD::FEXP, MVT::f80, Expand);
    799   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
    800 
    801   // First set operation action for all vector types to either promote
    802   // (for widening) or expand (for scalarization). Then we will selectively
    803   // turn on ones that can be effectively codegen'd.
    804   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
    805            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    806     MVT VT = (MVT::SimpleValueType)i;
    807     setOperationAction(ISD::ADD , VT, Expand);
    808     setOperationAction(ISD::SUB , VT, Expand);
    809     setOperationAction(ISD::FADD, VT, Expand);
    810     setOperationAction(ISD::FNEG, VT, Expand);
    811     setOperationAction(ISD::FSUB, VT, Expand);
    812     setOperationAction(ISD::MUL , VT, Expand);
    813     setOperationAction(ISD::FMUL, VT, Expand);
    814     setOperationAction(ISD::SDIV, VT, Expand);
    815     setOperationAction(ISD::UDIV, VT, Expand);
    816     setOperationAction(ISD::FDIV, VT, Expand);
    817     setOperationAction(ISD::SREM, VT, Expand);
    818     setOperationAction(ISD::UREM, VT, Expand);
    819     setOperationAction(ISD::LOAD, VT, Expand);
    820     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    821     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    822     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    823     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    824     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    825     setOperationAction(ISD::FABS, VT, Expand);
    826     setOperationAction(ISD::FSIN, VT, Expand);
    827     setOperationAction(ISD::FSINCOS, VT, Expand);
    828     setOperationAction(ISD::FCOS, VT, Expand);
    829     setOperationAction(ISD::FSINCOS, VT, Expand);
    830     setOperationAction(ISD::FREM, VT, Expand);
    831     setOperationAction(ISD::FMA,  VT, Expand);
    832     setOperationAction(ISD::FPOWI, VT, Expand);
    833     setOperationAction(ISD::FSQRT, VT, Expand);
    834     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    835     setOperationAction(ISD::FFLOOR, VT, Expand);
    836     setOperationAction(ISD::FCEIL, VT, Expand);
    837     setOperationAction(ISD::FTRUNC, VT, Expand);
    838     setOperationAction(ISD::FRINT, VT, Expand);
    839     setOperationAction(ISD::FNEARBYINT, VT, Expand);
    840     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    841     setOperationAction(ISD::MULHS, VT, Expand);
    842     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    843     setOperationAction(ISD::MULHU, VT, Expand);
    844     setOperationAction(ISD::SDIVREM, VT, Expand);
    845     setOperationAction(ISD::UDIVREM, VT, Expand);
    846     setOperationAction(ISD::FPOW, VT, Expand);
    847     setOperationAction(ISD::CTPOP, VT, Expand);
    848     setOperationAction(ISD::CTTZ, VT, Expand);
    849     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    850     setOperationAction(ISD::CTLZ, VT, Expand);
    851     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    852     setOperationAction(ISD::SHL, VT, Expand);
    853     setOperationAction(ISD::SRA, VT, Expand);
    854     setOperationAction(ISD::SRL, VT, Expand);
    855     setOperationAction(ISD::ROTL, VT, Expand);
    856     setOperationAction(ISD::ROTR, VT, Expand);
    857     setOperationAction(ISD::BSWAP, VT, Expand);
    858     setOperationAction(ISD::SETCC, VT, Expand);
    859     setOperationAction(ISD::FLOG, VT, Expand);
    860     setOperationAction(ISD::FLOG2, VT, Expand);
    861     setOperationAction(ISD::FLOG10, VT, Expand);
    862     setOperationAction(ISD::FEXP, VT, Expand);
    863     setOperationAction(ISD::FEXP2, VT, Expand);
    864     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    865     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    866     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    867     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    868     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    869     setOperationAction(ISD::TRUNCATE, VT, Expand);
    870     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    871     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    872     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    873     setOperationAction(ISD::VSELECT, VT, Expand);
    874     setOperationAction(ISD::SELECT_CC, VT, Expand);
    875     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
    876              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
    877       setTruncStoreAction(VT,
    878                           (MVT::SimpleValueType)InnerVT, Expand);
    879     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    880     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
    881     setLoadExtAction(ISD::EXTLOAD, VT, Expand);
    882   }
    883 
    884   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
    885   // with -msoft-float, disable use of MMX as well.
    886   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    887     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    888     // No operations on x86mmx are supported; everything uses intrinsics.
    889   }
    890 
    891   // MMX-sized vectors (other than x86mmx) are expected to be expanded
    892   // into smaller operations.
    893   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
    894   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
    895   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
    896   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
    897   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
    898   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
    899   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
    900   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
    901   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
    902   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
    903   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
    904   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
    905   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
    906   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
    907   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
    908   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
    909   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
    910   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
    911   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
    912   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
    913   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
    914   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
    915   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
    916   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
    917   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
    918   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
    919   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
    920   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
    921   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
    922 
    923   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    924     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
    925 
    926     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    927     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    928     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    929     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    930     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    931     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    932     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    933     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    934     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    935     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    936     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    937     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    938   }
    939 
    940   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    941     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
    942 
    943     // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    944     // registers cannot be used even for integer operations.
    945     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    946     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    947     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    948     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
    949 
    950     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    951     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    952     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    953     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    954     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    955     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    956     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
    957     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
    958     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    959     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    960     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    961     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    962     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    963     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    964     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    965     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    966     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    967     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    968     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    969     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    970     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    971     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    972 
    973     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    974     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    975     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    976     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
    977 
    978     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    979     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    980     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    981     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    982     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    983 
    984     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    985     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
    986       MVT VT = (MVT::SimpleValueType)i;
    987       // Do not attempt to custom lower non-power-of-2 vectors
    988       if (!isPowerOf2_32(VT.getVectorNumElements()))
    989         continue;
    990       // Do not attempt to custom lower non-128-bit vectors
    991       if (!VT.is128BitVector())
    992         continue;
    993       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
    994       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
    995       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    996     }
    997 
    998     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    999     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
   1000     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
   1001     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
   1002     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
   1003     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
   1004 
   1005     if (Subtarget->is64Bit()) {
   1006       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
   1007       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
   1008     }
   1009 
   1010     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
   1011     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
   1012       MVT VT = (MVT::SimpleValueType)i;
   1013 
   1014       // Do not attempt to promote non-128-bit vectors
   1015       if (!VT.is128BitVector())
   1016         continue;
   1017 
   1018       setOperationAction(ISD::AND,    VT, Promote);
   1019       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
   1020       setOperationAction(ISD::OR,     VT, Promote);
   1021       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
   1022       setOperationAction(ISD::XOR,    VT, Promote);
   1023       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
   1024       setOperationAction(ISD::LOAD,   VT, Promote);
   1025       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
   1026       setOperationAction(ISD::SELECT, VT, Promote);
   1027       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
   1028     }
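
    // Editor's sketch of what the Promote entries above mean (illustrative
    // DAG, not lifted from the legalizer): a v4i32 AND is rewritten to use
    // the v2i64 form of the same 128-bit register,
    //
    //   (and v4i32 %a, %b)
    //     ~~> (bitcast v4i32 (and v2i64 (bitcast v2i64 %a), (bitcast v2i64 %b)))
    //
    // which costs nothing because PAND operates on all 128 bits regardless of
    // element type.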
   1029 
   1030     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   1031 
   1032     // Custom lower v2i64 and v2f64 selects.
   1033     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
   1034     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
   1035     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
   1036     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
   1037 
   1038     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
   1039     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
   1040 
   1041     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
   1042     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
   1043     // As there is no 64-bit GPR available, we need to build a special custom
   1044     // sequence to convert from v2i32 to v2f32.
   1045     if (!Subtarget->is64Bit())
   1046       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
   1047 
   1048     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
   1049     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
   1050 
   1051     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
   1052 
   1053     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
   1054     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
   1055     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
   1056   }
   1057 
   1058   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
   1059     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
   1060     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
   1061     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
   1062     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
   1063     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
   1064     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
   1065     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
   1066     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
   1067     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
   1068     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
   1069 
   1070     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
   1071     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
   1072     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
   1073     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
   1074     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
   1075     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
   1076     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
   1077     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
   1078     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
   1079     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
   1080 
   1081     // FIXME: Do we need to handle scalar-to-vector here?
   1082     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
   1083 
   1084     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
   1085     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
   1086     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
   1087     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
   1088     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
   1089     // There is no BLENDI for byte vectors, but PBLENDVB covers them, so v16i8
   1090     // vselects stay Legal and need no custom lowering for now.
   1091     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
   1092 
   1093     // i8 and i16 vectors are custom because the source register and
   1094     // memory operand types are not the same width.  f32 vectors are
   1095     // custom since the immediate controlling the insert encodes additional
   1096     // information.
   1097     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
   1098     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
   1099     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
   1100     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
   1101 
   1102     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
   1103     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
   1104     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
   1105     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
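            // For illustration: (insert_vector_elt v4f32:$dst, f32:$elt, 2) is matched
            // to INSERTPS, whose immediate selects the source lane, the destination
            // lane and an optional zero mask, so a simple one-to-one isel pattern is
            // not sufficient and the node is marked Custom.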
   1106 
   1107     // FIXME: These should be Legal, but that's only for the case where the
   1108     // index is constant.  For now, custom lower to handle variable indices.
   1109     if (Subtarget->is64Bit()) {
   1110       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
   1111       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
   1112     }
   1113   }
   1114 
   1115   if (Subtarget->hasSSE2()) {
   1116     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
   1117     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
   1118 
   1119     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
   1120     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
   1121 
   1122     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
   1123     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
   1124 
   1125     // In the customized shift lowering, the legal cases in AVX2 will be
   1126     // recognized.
   1127     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
   1128     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
   1129 
   1130     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
   1131     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
   1132 
   1133     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
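            // For example, a v4i32 shift by a uniform (splat) amount can use a single
            // PSLLD/PSRLD/PSRAD with the count held in an XMM register, while SSE2 has
            // no instruction for per-element variable shift amounts, so the custom
            // lowering decomposes those cases.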
   1134   }
   1135 
   1136   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
   1137     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
   1138     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
   1139     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
   1140     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
   1141     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
   1142     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
   1143 
   1144     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
   1145     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
   1146     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
   1147 
   1148     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
   1149     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
   1150     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
   1151     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
   1152     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
   1153     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
   1154     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
   1155     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
   1156     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
   1157     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
   1158     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
   1159     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
   1160 
   1161     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
   1162     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
   1163     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
   1164     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
   1165     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
   1166     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
   1167     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
   1168     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
   1169     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
   1170     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
   1171     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
   1172     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
   1173 
   1174     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
   1175     // even though v8i16 is a legal type.
   1176     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
   1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
   1178     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
   1179 
   1180     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
   1181     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1182     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
   1183 
   1184     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
   1185     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
   1186 
   1187     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
   1188 
   1189     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
   1190     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
   1191 
   1192     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
   1193     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
   1194 
   1195     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
   1196     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
   1197 
   1198     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
   1199     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
   1200     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
   1201     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
   1202 
   1203     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1204     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1205     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1206 
   1207     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
   1208     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
   1209     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
   1210     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
   1211 
   1212     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
   1213     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
   1214     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
   1215     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
   1216     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
   1217     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
   1218     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
   1219     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
   1220     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
   1221     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
   1222     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
   1223     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
   1224 
   1225     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
   1226       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
   1227       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
   1228       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
   1229       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
   1230       setOperationAction(ISD::FMA,             MVT::f32, Legal);
   1231       setOperationAction(ISD::FMA,             MVT::f64, Legal);
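              // This lets (fadd (fmul a, b), c) be contracted into a single VFMADD*
              // instruction when FMA formation is permitted.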
   1232     }
   1233 
   1234     if (Subtarget->hasInt256()) {
   1235       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
   1236       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
   1237       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
   1238       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
   1239 
   1240       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
   1241       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
   1242       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
   1243       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
   1244 
   1245       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1246       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
   1247       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
   1248       // Don't lower v32i8 because there is no 128-bit byte mul
   1249 
   1250       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
   1251       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
   1252       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
   1253       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
   1254 
   1255       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
   1256       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1257     } else {
   1258       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
   1259       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
   1260       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
   1261       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
   1262 
   1263       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
   1264       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
   1265       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
   1266       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
   1267 
   1268       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1269       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
   1270       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
   1271       // Don't lower v32i8 because there is no 128-bit byte mul
   1272     }
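            // Note: even with AVX2 there is no 64x64->64 vector multiply instruction;
            // the custom v4i64 MUL lowering builds the product from 32x32->64
            // multiplies (PMULUDQ/VPMULUDQ) plus shifts and adds.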
   1273 
   1274     // In the customized shift lowering, the legal cases in AVX2 will be
   1275     // recognized.
   1276     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
   1277     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
   1278 
   1279     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
   1280     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
   1281 
   1282     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
   1283 
   1284     // Custom lower several nodes for 256-bit types.
   1285     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
   1286              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1287       MVT VT = (MVT::SimpleValueType)i;
   1288 
   1289       // Extract subvector is special because the value type
   1290       // (result) is 128-bit but the source is 256-bit wide.
   1291       if (VT.is128BitVector())
   1292         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1293 
   1294       // Do not attempt to custom lower other non-256-bit vectors
   1295       if (!VT.is256BitVector())
   1296         continue;
   1297 
   1298       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1299       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1300       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1301       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1302       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
   1303       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
   1304       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
   1305     }
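            // For example, EXTRACT_SUBVECTOR of the upper 128-bit half of a 256-bit
            // vector is matched to VEXTRACTF128 (VEXTRACTI128 with AVX2), and
            // INSERT_SUBVECTOR to VINSERTF128/VINSERTI128.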
   1306 
   1307     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
   1308     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
   1309       MVT VT = (MVT::SimpleValueType)i;
   1310 
   1311       // Do not attempt to promote non-256-bit vectors
   1312       if (!VT.is256BitVector())
   1313         continue;
   1314 
   1315       setOperationAction(ISD::AND,    VT, Promote);
   1316       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
   1317       setOperationAction(ISD::OR,     VT, Promote);
   1318       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
   1319       setOperationAction(ISD::XOR,    VT, Promote);
   1320       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
   1321       setOperationAction(ISD::LOAD,   VT, Promote);
   1322       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
   1323       setOperationAction(ISD::SELECT, VT, Promote);
   1324       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
   1325     }
   1326   }
   1327 
   1328   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
   1329     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
   1330     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
   1331     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
   1332     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
   1333 
   1334     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
   1335     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
   1336     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
   1337 
   1338     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
   1339     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
   1340     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
   1341     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
   1342     setOperationAction(ISD::AND,                MVT::i1,    Legal);
   1343     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
   1344     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
   1345     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
   1346     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
   1347     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
   1348     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
   1349 
   1350     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
   1351     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
   1352     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
   1353     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
   1354     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
   1355     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
   1356 
   1357     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
   1358     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
   1359     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
   1360     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
   1361     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
   1362     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
   1363     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
   1364     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
   1365 
   1366     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
   1367     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
   1368     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
   1369     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
   1370     if (Subtarget->is64Bit()) {
   1371       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
   1372       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
   1373       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
   1374       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
   1375     }
   1376     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
   1377     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
   1378     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
   1379     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
   1380     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
   1381     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
   1382     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
   1383     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
   1384     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
   1385     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
   1386 
   1387     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
   1388     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
   1389     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
   1390     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
   1391     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
   1392     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
   1393     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
   1394     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
   1395     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
   1396     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
   1397     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
   1398     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
   1399     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
   1400 
   1401     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
   1402     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
   1403     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
   1404     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
   1405     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
   1406     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
   1407 
   1408     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
   1409     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
   1410 
   1411     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
   1412 
   1413     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
   1414     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
   1415     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
   1416     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
   1417     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
   1418     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
   1419     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
   1420     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
   1421     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
   1422 
   1423     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
   1424     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
   1425 
   1426     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
   1427     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
   1428 
   1429     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
   1430 
   1431     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
   1432     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
   1433 
   1434     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
   1435     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
   1436 
   1437     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
   1438     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
   1439 
   1440     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
   1441     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
   1442     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
   1443     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
   1444     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
   1445     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
   1446 
   1447     if (Subtarget->hasCDI()) {
   1448       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
   1449       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
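              // VPLZCNTD/VPLZCNTQ compute the per-element leading-zero count directly,
              // so no expansion is needed.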
   1450     }
   1451 
   1452     // Custom lower several nodes.
   1453     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
   1454              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
   1455       MVT VT = (MVT::SimpleValueType)i;
   1456 
   1457       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   1458       // Extract subvector is special because the value type
   1459       // (result) is 256/128-bit but the source is 512-bit wide.
   1460       if (VT.is128BitVector() || VT.is256BitVector())
   1461         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1462 
   1463       if (VT.getVectorElementType() == MVT::i1)
   1464         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
   1465 
   1466       // Do not attempt to custom lower other non-512-bit vectors
   1467       if (!VT.is512BitVector())
   1468         continue;
   1469 
   1470       if (EltSize >= 32) {
   1471         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
   1472         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
   1473         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1474         setOperationAction(ISD::VSELECT,             VT, Legal);
   1475         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
   1476         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
   1477         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
   1478       }
   1479     }
   1480     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
   1481       MVT VT = (MVT::SimpleValueType)i;
   1482 
   1483       // Do not attempt to promote non-512-bit vectors
   1484       if (!VT.is512BitVector())
   1485         continue;
   1486 
   1487       setOperationAction(ISD::SELECT, VT, Promote);
   1488       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
   1489     }
   1490   } // has AVX-512
   1491 
   1492   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
   1493   // of this type with custom code.
   1494   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
   1495            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
   1496     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
   1497                        Custom);
   1498   }
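          // A sketch of the usual technique (the exact sequence emitted may differ):
          // sign-extending the low 16 bits of each i32 lane can be done with a left
          // shift by 16 followed by an arithmetic right shift by 16.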
   1499 
   1500   // We want to custom lower some of our intrinsics.
   1501   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1502   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   1503   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   1504   if (!Subtarget->is64Bit())
   1505     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
   1506 
   1507   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1508   // handle type legalization for these operations here.
   1509   //
   1510   // FIXME: We really should do custom legalization for addition and
   1511   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1512   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1513   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
   1514     // Add/Sub/Mul with overflow operations are custom lowered.
   1515     MVT VT = IntVTs[i];
   1516     setOperationAction(ISD::SADDO, VT, Custom);
   1517     setOperationAction(ISD::UADDO, VT, Custom);
   1518     setOperationAction(ISD::SSUBO, VT, Custom);
   1519     setOperationAction(ISD::USUBO, VT, Custom);
   1520     setOperationAction(ISD::SMULO, VT, Custom);
   1521     setOperationAction(ISD::UMULO, VT, Custom);
   1522   }
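          // For example, (saddo i32 %a, %b) is lowered so that the ADD also produces
          // EFLAGS and the overflow result is materialized from OF via SETO; UADDO
          // uses the carry flag via SETB instead.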
   1523 
   1524   // There are no 8-bit 3-address imul/mul instructions
   1525   setOperationAction(ISD::SMULO, MVT::i8, Expand);
   1526   setOperationAction(ISD::UMULO, MVT::i8, Expand);
   1527 
   1528   if (!Subtarget->is64Bit()) {
   1529     // These libcalls are not available in 32-bit.
   1530     setLibcallName(RTLIB::SHL_I128, nullptr);
   1531     setLibcallName(RTLIB::SRL_I128, nullptr);
   1532     setLibcallName(RTLIB::SRA_I128, nullptr);
   1533   }
   1534 
   1535   // Combine sin / cos into one node or libcall if possible.
   1536   if (Subtarget->hasSinCos()) {
   1537     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
   1538     setLibcallName(RTLIB::SINCOS_F64, "sincos");
   1539     if (Subtarget->isTargetDarwin()) {
   1540       // For MacOSX, we don't want the normal expansion of a libcall to
   1541       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
   1542       // traffic.
   1543       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
   1544       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
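              // __sincos_stret returns both results in registers (for double on
              // x86-64, sin in XMM0 and cos in XMM1) instead of through out-pointers,
              // which is what avoids the memory traffic.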
   1545     }
   1546   }
   1547 
   1548   if (Subtarget->isTargetWin64()) {
   1549     setOperationAction(ISD::SDIV, MVT::i128, Custom);
   1550     setOperationAction(ISD::UDIV, MVT::i128, Custom);
   1551     setOperationAction(ISD::SREM, MVT::i128, Custom);
   1552     setOperationAction(ISD::UREM, MVT::i128, Custom);
   1553     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
   1554     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
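            // There is no native 128-bit divide; the custom lowering turns these into
            // calls to the compiler runtime (__divti3 and friends), with the i128
            // arguments passed by reference as the Win64 ABI requires.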
   1555   }
   1556 
   1557   // We have target-specific dag combine patterns for the following nodes:
   1558   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1559   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1560   setTargetDAGCombine(ISD::VSELECT);
   1561   setTargetDAGCombine(ISD::SELECT);
   1562   setTargetDAGCombine(ISD::SHL);
   1563   setTargetDAGCombine(ISD::SRA);
   1564   setTargetDAGCombine(ISD::SRL);
   1565   setTargetDAGCombine(ISD::OR);
   1566   setTargetDAGCombine(ISD::AND);
   1567   setTargetDAGCombine(ISD::ADD);
   1568   setTargetDAGCombine(ISD::FADD);
   1569   setTargetDAGCombine(ISD::FSUB);
   1570   setTargetDAGCombine(ISD::FMA);
   1571   setTargetDAGCombine(ISD::SUB);
   1572   setTargetDAGCombine(ISD::LOAD);
   1573   setTargetDAGCombine(ISD::STORE);
   1574   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1575   setTargetDAGCombine(ISD::ANY_EXTEND);
   1576   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1577   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   1578   setTargetDAGCombine(ISD::TRUNCATE);
   1579   setTargetDAGCombine(ISD::SINT_TO_FP);
   1580   setTargetDAGCombine(ISD::SETCC);
   1581   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   1582   setTargetDAGCombine(ISD::BUILD_VECTOR);
   1583   if (Subtarget->is64Bit())
   1584     setTargetDAGCombine(ISD::MUL);
   1585   setTargetDAGCombine(ISD::XOR);
   1586 
   1587   computeRegisterProperties();
   1588 
   1589   // On Darwin, -Os means optimize for size without hurting performance, so
   1590   // do not reduce the limit.
   1591   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1592   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
   1593   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1594   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1595   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1596   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
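          // For example, with these limits and the vector types chosen by
          // getOptimalMemOpType below, a 64-byte @llvm.memset can be expanded into
          // four 16-byte stores (or two 32-byte stores with AVX) instead of a call.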
   1597   setPrefLoopAlignment(4); // 2^4 bytes.
   1598 
   1599   // Predictable cmovs don't hurt on Atom because it's in-order.
   1600   PredictableSelectIsExpensive = !Subtarget->isAtom();
   1601 
   1602   setPrefFunctionAlignment(4); // 2^4 bytes.
   1603 }
   1604 
   1605 TargetLoweringBase::LegalizeTypeAction
   1606 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
   1607   if (ExperimentalVectorWideningLegalization &&
   1608       VT.getVectorNumElements() != 1 &&
   1609       VT.getVectorElementType().getSimpleVT() != MVT::i1)
   1610     return TypeWidenVector;
   1611 
   1612   return TargetLoweringBase::getPreferredVectorAction(VT);
   1613 }
   1614 
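        // For example, with AVX-512 a compare of two v16f32 values produces a v16i1
        // mask (held in a mask register); without AVX-512 a v4f32 compare instead
        // yields v4i32.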
   1615 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   1616   if (!VT.isVector())
   1617     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
   1618 
   1619   if (Subtarget->hasAVX512())
   1620     switch (VT.getVectorNumElements()) {
   1621     case  8: return MVT::v8i1;
   1622     case 16: return MVT::v16i1;
   1623     }
   1624 
   1625   return VT.changeVectorElementTypeToInteger();
   1626 }
   1627 
   1628 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
   1629 /// the desired ByVal argument alignment.
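        /// For example, a struct containing a <4 x float> member has a 128-bit vector
        /// field, so MaxAlign becomes 16 and the byval argument is placed on a 16-byte
        /// boundary; an aggregate of plain scalars keeps the smaller default.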
   1630 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1631   if (MaxAlign == 16)
   1632     return;
   1633   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1634     if (VTy->getBitWidth() == 128)
   1635       MaxAlign = 16;
   1636   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1637     unsigned EltAlign = 0;
   1638     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1639     if (EltAlign > MaxAlign)
   1640       MaxAlign = EltAlign;
   1641   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1642     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
   1643       unsigned EltAlign = 0;
   1644       getMaxByValAlign(STy->getElementType(i), EltAlign);
   1645       if (EltAlign > MaxAlign)
   1646         MaxAlign = EltAlign;
   1647       if (MaxAlign == 16)
   1648         break;
   1649     }
   1650   }
   1651 }
   1652 
   1653 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
   1654 /// function arguments in the caller parameter area. For X86, aggregates
   1655 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1656 /// are at 4-byte boundaries.
   1657 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   1658   if (Subtarget->is64Bit()) {
   1659     // Max of 8 and alignment of type.
   1660     unsigned TyAlign = TD->getABITypeAlignment(Ty);
   1661     if (TyAlign > 8)
   1662       return TyAlign;
   1663     return 8;
   1664   }
   1665 
   1666   unsigned Align = 4;
   1667   if (Subtarget->hasSSE1())
   1668     getMaxByValAlign(Ty, Align);
   1669   return Align;
   1670 }
   1671 
   1672 /// getOptimalMemOpType - Returns the target specific optimal type for load
   1673 /// and store operations as a result of memset, memcpy, and memmove
   1674 /// lowering. If DstAlign is zero, the destination alignment can be chosen
   1675 /// freely, so any constraint can be satisfied. Similarly, if SrcAlign is zero
   1676 /// there is no need to check it against an alignment requirement,
   1677 /// probably because the source does not need to be loaded. If 'IsMemset' is
   1678 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
   1679 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
   1680 /// source is constant so it does not need to be loaded.
   1681 /// It returns EVT::Other if the type should be determined using generic
   1682 /// target-independent logic.
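        /// For example, a 32-byte memcpy with unknown alignments on a target with fast
        /// unaligned accesses and AVX2 gets MVT::v8i32, so the copy becomes a single
        /// 32-byte load/store pair; without SSE (or under NoImplicitFloat) it falls
        /// back to i64/i32 chunks.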
   1683 EVT
   1684 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1685                                        unsigned DstAlign, unsigned SrcAlign,
   1686                                        bool IsMemset, bool ZeroMemset,
   1687                                        bool MemcpyStrSrc,
   1688                                        MachineFunction &MF) const {
   1689   const Function *F = MF.getFunction();
   1690   if ((!IsMemset || ZeroMemset) &&
   1691       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
   1692                                        Attribute::NoImplicitFloat)) {
   1693     if (Size >= 16 &&
   1694         (Subtarget->isUnalignedMemAccessFast() ||
   1695          ((DstAlign == 0 || DstAlign >= 16) &&
   1696           (SrcAlign == 0 || SrcAlign >= 16)))) {
   1697       if (Size >= 32) {
   1698         if (Subtarget->hasInt256())
   1699           return MVT::v8i32;
   1700         if (Subtarget->hasFp256())
   1701           return MVT::v8f32;
   1702       }
   1703       if (Subtarget->hasSSE2())
   1704         return MVT::v4i32;
   1705       if (Subtarget->hasSSE1())
   1706         return MVT::v4f32;
   1707     } else if (!MemcpyStrSrc && Size >= 8 &&
   1708                !Subtarget->is64Bit() &&
   1709                Subtarget->hasSSE2()) {
   1710       // Do not use f64 to lower memcpy if source is string constant. It's
   1711       // better to use i32 to avoid the loads.
   1712       return MVT::f64;
   1713     }
   1714   }
   1715   if (Subtarget->is64Bit() && Size >= 8)
   1716     return MVT::i64;
   1717   return MVT::i32;
   1718 }
   1719 
   1720 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   1721   if (VT == MVT::f32)
   1722     return X86ScalarSSEf32;
   1723   else if (VT == MVT::f64)
   1724     return X86ScalarSSEf64;
   1725   return true;
   1726 }
   1727 
   1728 bool
   1729 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
   1730                                                  unsigned,
   1731                                                  bool *Fast) const {
   1732   if (Fast)
   1733     *Fast = Subtarget->isUnalignedMemAccessFast();
   1734   return true;
   1735 }
   1736 
   1737 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
   1738 /// current function.  The returned value is a member of the
   1739 /// MachineJumpTableInfo::JTEntryKind enum.
   1740 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1741   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1742   // symbol.
   1743   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1744       Subtarget->isPICStyleGOT())
   1745     return MachineJumpTableInfo::EK_Custom32;
   1746 
   1747   // Otherwise, use the normal jump table encoding heuristics.
   1748   return TargetLowering::getJumpTableEncoding();
   1749 }
   1750 
   1751 const MCExpr *
   1752 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   1753                                              const MachineBasicBlock *MBB,
   1754                                              unsigned uid,MCContext &Ctx) const{
   1755   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
   1756          Subtarget->isPICStyleGOT());
   1757   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   1758   // entries.
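          // The emitted entry then looks roughly like ".long .LBB0_7@GOTOFF" (label
          // name illustrative), i.e. the block address relative to the GOT base.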
   1759   return MCSymbolRefExpr::Create(MBB->getSymbol(),
   1760                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   1761 }
   1762 
   1763 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
   1764 /// jumptable.
   1765 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1766                                                     SelectionDAG &DAG) const {
   1767   if (!Subtarget->is64Bit())
   1768     // This doesn't have SDLoc associated with it, but is not really the
   1769     // same as a Register.
   1770     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
   1771   return Table;
   1772 }
   1773 
   1774 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
   1775 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
   1776 /// MCExpr.
   1777 const MCExpr *X86TargetLowering::
   1778 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   1779                              MCContext &Ctx) const {
   1780   // X86-64 uses RIP relative addressing based on the jump table label.
   1781   if (Subtarget->isPICStyleRIPRel())
   1782     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   1783 
   1784   // Otherwise, the reference is relative to the PIC base.
   1785   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
   1786 }
   1787 
   1788 // FIXME: Why is this routine here? Move to RegInfo!
   1789 std::pair<const TargetRegisterClass*, uint8_t>
   1790 X86TargetLowering::findRepresentativeClass(MVT VT) const{
   1791   const TargetRegisterClass *RRC = nullptr;
   1792   uint8_t Cost = 1;
   1793   switch (VT.SimpleTy) {
   1794   default:
   1795     return TargetLowering::findRepresentativeClass(VT);
   1796   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   1797     RRC = Subtarget->is64Bit() ?
   1798       (const TargetRegisterClass*)&X86::GR64RegClass :
   1799       (const TargetRegisterClass*)&X86::GR32RegClass;
   1800     break;
   1801   case MVT::x86mmx:
   1802     RRC = &X86::VR64RegClass;
   1803     break;
   1804   case MVT::f32: case MVT::f64:
   1805   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   1806   case MVT::v4f32: case MVT::v2f64:
   1807   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   1808   case MVT::v4f64:
   1809     RRC = &X86::VR128RegClass;
   1810     break;
   1811   }
   1812   return std::make_pair(RRC, Cost);
   1813 }
   1814 
   1815 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   1816                                                unsigned &Offset) const {
   1817   if (!Subtarget->isTargetLinux())
   1818     return false;
   1819 
   1820   if (Subtarget->is64Bit()) {
   1821     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
   1822     Offset = 0x28;
   1823     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   1824       AddressSpace = 256;
   1825     else
   1826       AddressSpace = 257;
   1827   } else {
   1828     // %gs:0x14 on i386
   1829     Offset = 0x14;
   1830     AddressSpace = 256;
   1831   }
   1832   return true;
   1833 }
   1834 
   1835 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
   1836                                             unsigned DestAS) const {
   1837   assert(SrcAS != DestAS && "Expected different address spaces!");
   1838 
   1839   return SrcAS < 256 && DestAS < 256;
   1840 }
   1841 
   1842 //===----------------------------------------------------------------------===//
   1843 //               Return Value Calling Convention Implementation
   1844 //===----------------------------------------------------------------------===//
   1845 
   1846 #include "X86GenCallingConv.inc"
   1847 
   1848 bool
   1849 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   1850                                   MachineFunction &MF, bool isVarArg,
   1851                         const SmallVectorImpl<ISD::OutputArg> &Outs,
   1852                         LLVMContext &Context) const {
   1853   SmallVector<CCValAssign, 16> RVLocs;
   1854   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
   1855                  RVLocs, Context);
   1856   return CCInfo.CheckReturn(Outs, RetCC_X86);
   1857 }
   1858 
   1859 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
   1860   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   1861   return ScratchRegs;
   1862 }
   1863 
   1864 SDValue
   1865 X86TargetLowering::LowerReturn(SDValue Chain,
   1866                                CallingConv::ID CallConv, bool isVarArg,
   1867                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1868                                const SmallVectorImpl<SDValue> &OutVals,
   1869                                SDLoc dl, SelectionDAG &DAG) const {
   1870   MachineFunction &MF = DAG.getMachineFunction();
   1871   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1872 
   1873   SmallVector<CCValAssign, 16> RVLocs;
   1874   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
   1875                  RVLocs, *DAG.getContext());
   1876   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   1877 
   1878   SDValue Flag;
   1879   SmallVector<SDValue, 6> RetOps;
   1880   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   1881   // Operand #1 = Bytes To Pop
   1882   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
   1883                    MVT::i16));
   1884 
   1885   // Copy the result values into the output registers.
   1886   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1887     CCValAssign &VA = RVLocs[i];
   1888     assert(VA.isRegLoc() && "Can only return in registers!");
   1889     SDValue ValToCopy = OutVals[i];
   1890     EVT ValVT = ValToCopy.getValueType();
   1891 
   1892     // Promote values to the appropriate types
   1893     if (VA.getLocInfo() == CCValAssign::SExt)
   1894       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1895     else if (VA.getLocInfo() == CCValAssign::ZExt)
   1896       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1897     else if (VA.getLocInfo() == CCValAssign::AExt)
   1898       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1899     else if (VA.getLocInfo() == CCValAssign::BCvt)
   1900       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
   1901 
   1902     assert(VA.getLocInfo() != CCValAssign::FPExt &&
   1903            "Unexpected FP-extend for return value.");
   1904 
   1905     // If this is x86-64, and we disabled SSE, we can't return FP values,
   1906     // or SSE or MMX vectors.
   1907     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   1908          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   1909           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
   1910       report_fatal_error("SSE register return with SSE disabled");
   1911     }
   1912     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   1913     // llvm-gcc has never done it right and no one has noticed, so this
   1914     // should be OK for now.
   1915     if (ValVT == MVT::f64 &&
   1916         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
   1917       report_fatal_error("SSE2 register return with SSE2 disabled");
   1918 
   1919     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   1920     // the RET instruction and handled by the FP Stackifier.
   1921     if (VA.getLocReg() == X86::ST0 ||
   1922         VA.getLocReg() == X86::ST1) {
   1923       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   1924       // change the value to the FP stack register class.
   1925       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   1926         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   1927       RetOps.push_back(ValToCopy);
   1928       // Don't emit a copytoreg.
   1929       continue;
   1930     }
   1931 
   1932     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   1933     // which is returned in RAX / RDX.
   1934     if (Subtarget->is64Bit()) {
   1935       if (ValVT == MVT::x86mmx) {
   1936         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   1937           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
   1938           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   1939                                   ValToCopy);
   1940           // If we don't have SSE2 available, convert to v4f32 so the generated
   1941           // register is legal.
   1942           if (!Subtarget->hasSSE2())
   1943             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
   1944         }
   1945       }
   1946     }
   1947 
   1948     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   1949     Flag = Chain.getValue(1);
   1950     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   1951   }
   1952 
   1953   // The x86-64 ABIs require that for returning structs by value we copy
   1954   // the sret argument into %rax/%eax (depending on ABI) for the return.
   1955   // Win32 requires us to put the sret argument to %eax as well.
   1956   // We saved the argument into a virtual register in the entry block,
   1957   // so now we copy the value out and into %rax/%eax.
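          // For example, for "define void @f(%struct.S* sret %out)" on x86-64 the
          // incoming sret pointer must be returned to the caller in RAX, as the ABI
          // requires.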
   1958   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
   1959       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
   1960     MachineFunction &MF = DAG.getMachineFunction();
   1961     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1962     unsigned Reg = FuncInfo->getSRetReturnReg();
   1963     assert(Reg &&
   1964            "SRetReturnReg should have been set in LowerFormalArguments().");
   1965     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
   1966 
   1967     unsigned RetValReg
   1968         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
   1969           X86::RAX : X86::EAX;
   1970     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
   1971     Flag = Chain.getValue(1);
   1972 
   1973     // RAX/EAX now acts like a return value.
   1974     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
   1975   }
   1976 
   1977   RetOps[0] = Chain;  // Update chain.
   1978 
   1979   // Add the flag if we have it.
   1980   if (Flag.getNode())
   1981     RetOps.push_back(Flag);
   1982 
   1983   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
   1984 }
   1985 
   1986 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   1987   if (N->getNumValues() != 1)
   1988     return false;
   1989   if (!N->hasNUsesOfValue(1, 0))
   1990     return false;
   1991 
   1992   SDValue TCChain = Chain;
   1993   SDNode *Copy = *N->use_begin();
   1994   if (Copy->getOpcode() == ISD::CopyToReg) {
   1995     // If the copy has a glue operand, we conservatively assume it isn't safe to
   1996     // perform a tail call.
   1997     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   1998       return false;
   1999     TCChain = Copy->getOperand(0);
   2000   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   2001     return false;
   2002 
   2003   bool HasRet = false;
   2004   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   2005        UI != UE; ++UI) {
   2006     if (UI->getOpcode() != X86ISD::RET_FLAG)
   2007       return false;
   2008     HasRet = true;
   2009   }
   2010 
   2011   if (!HasRet)
   2012     return false;
   2013 
   2014   Chain = TCChain;
   2015   return true;
   2016 }
   2017 
   2018 MVT
   2019 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
   2020                                             ISD::NodeType ExtendKind) const {
   2021   MVT ReturnMVT;
   2022   // TODO: Is this also valid on 32-bit?
   2023   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
   2024     ReturnMVT = MVT::i8;
   2025   else
   2026     ReturnMVT = MVT::i32;
   2027 
   2028   MVT MinVT = getRegisterType(ReturnMVT);
   2029   return VT.bitsLT(MinVT) ? MinVT : VT;
   2030 }
   2031 
   2032 /// LowerCallResult - Lower the result values of a call into the
   2033 /// appropriate copies out of appropriate physical registers.
   2034 ///
   2035 SDValue
   2036 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   2037                                    CallingConv::ID CallConv, bool isVarArg,
   2038                                    const SmallVectorImpl<ISD::InputArg> &Ins,
   2039                                    SDLoc dl, SelectionDAG &DAG,
   2040                                    SmallVectorImpl<SDValue> &InVals) const {
   2041 
   2042   // Assign locations to each value returned by this call.
   2043   SmallVector<CCValAssign, 16> RVLocs;
   2044   bool Is64Bit = Subtarget->is64Bit();
   2045   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   2046                  DAG.getTarget(), RVLocs, *DAG.getContext());
   2047   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2048 
   2049   // Copy all of the result registers out of their specified physreg.
   2050   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2051     CCValAssign &VA = RVLocs[i];
   2052     EVT CopyVT = VA.getValVT();
   2053 
   2054     // If this is x86-64, and we disabled SSE, we can't return FP values
   2055     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
   2056         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
   2057       report_fatal_error("SSE register return with SSE disabled");
   2058     }
   2059 
   2060     SDValue Val;
   2061 
   2062     // If this is a call to a function that returns an fp value on the floating
   2063     // point stack, we must guarantee the value is popped from the stack, so
   2064     // a CopyFromReg is not good enough - the copy instruction may be eliminated
   2065     // if the return value is not used. We use the FpPOP_RETVAL instruction
   2066     // instead.
   2067     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
   2068       // If we prefer to use the value in xmm registers, copy it out as f80 and
   2069       // use a truncate to move it from fp stack reg to xmm reg.
   2070       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
   2071       SDValue Ops[] = { Chain, InFlag };
   2072       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
   2073                                          MVT::Other, MVT::Glue, Ops), 1);
   2074       Val = Chain.getValue(0);
   2075 
   2076       // Round the f80 to the right size, which also moves it to the appropriate
   2077       // xmm register.
   2078       if (CopyVT != VA.getValVT())
   2079         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   2080                           // This truncation won't change the value.
   2081                           DAG.getIntPtrConstant(1));
   2082     } else {
   2083       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   2084                                  CopyVT, InFlag).getValue(1);
   2085       Val = Chain.getValue(0);
   2086     }
   2087     InFlag = Chain.getValue(2);
   2088     InVals.push_back(Val);
   2089   }
   2090 
   2091   return Chain;
   2092 }
   2093 
   2094 //===----------------------------------------------------------------------===//
   2095 //                C & StdCall & Fast Calling Convention implementation
   2096 //===----------------------------------------------------------------------===//
   2097 //  The StdCall calling convention is the standard convention for many Windows
   2098 //  API routines. It differs from the C calling convention only slightly: the
   2099 //  callee cleans up the stack rather than the caller, and symbols are
   2100 //  decorated (name-mangled). It does not support vector arguments.
   2101 //  For info on the fast calling convention see the Fast Calling Convention
   2102 //  (tail call) implementation in LowerX86_32FastCCCallTo.
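        //  For example, "int __stdcall foo(int, int, int)" is decorated as _foo@12
        //  and returns with "ret 12" so that the callee pops its own 12 bytes of
        //  arguments.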
   2103 
   2104 /// callIsStructReturn - Determines whether a call uses struct return
   2105 /// semantics.
   2106 enum StructReturnType {
   2107   NotStructReturn,
   2108   RegStructReturn,
   2109   StackStructReturn
   2110 };
   2111 static StructReturnType
   2112 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   2113   if (Outs.empty())
   2114     return NotStructReturn;
   2115 
   2116   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   2117   if (!Flags.isSRet())
   2118     return NotStructReturn;
   2119   if (Flags.isInReg())
   2120     return RegStructReturn;
   2121   return StackStructReturn;
   2122 }
   2123 
   2124 /// argsAreStructReturn - Determines whether a function uses struct
   2125 /// return semantics.
   2126 static StructReturnType
   2127 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   2128   if (Ins.empty())
   2129     return NotStructReturn;
   2130 
   2131   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   2132   if (!Flags.isSRet())
   2133     return NotStructReturn;
   2134   if (Flags.isInReg())
   2135     return RegStructReturn;
   2136   return StackStructReturn;
   2137 }
   2138 
   2139 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
   2140 /// specified by "Src" to the address "Dst" with the size and alignment
   2141 /// information specified by the parameter attribute. The copy will be passed
   2142 /// as a byval function parameter.
   2143 static SDValue
   2144 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   2145                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
   2146                           SDLoc dl) {
   2147   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   2148 
   2149   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   2150                        /*isVolatile*/false, /*AlwaysInline=*/true,
   2151                        MachinePointerInfo(), MachinePointerInfo());
   2152 }
   2153 
   2154 /// IsTailCallConvention - Return true if the calling convention is one that
   2155 /// supports tail call optimization.
   2156 static bool IsTailCallConvention(CallingConv::ID CC) {
   2157   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
   2158           CC == CallingConv::HiPE);
   2159 }
   2160 
   2161 /// \brief Return true if the calling convention is a C calling convention.
   2162 static bool IsCCallConvention(CallingConv::ID CC) {
   2163   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
   2164           CC == CallingConv::X86_64_SysV);
   2165 }
   2166 
   2167 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   2168   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
   2169     return false;
   2170 
   2171   CallSite CS(CI);
   2172   CallingConv::ID CalleeCC = CS.getCallingConv();
   2173   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
   2174     return false;
   2175 
   2176   return true;
   2177 }
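
         // For illustration only: this hook is consulted for IR calls already marked
         // 'tail', such as the hypothetical
         //
         //     %r = tail call i32 @callee(i32 %x)
         //
         // It only filters on the calling convention and the DisableTailCalls target
         // option; the full eligibility check is done later in
         // IsEligibleForTailCallOptimization.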
   2178 
   2179 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
   2180 /// a tailcall target by changing its ABI.
   2181 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
   2182                                    bool GuaranteedTailCallOpt) {
   2183   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
   2184 }
   2185 
   2186 SDValue
   2187 X86TargetLowering::LowerMemArgument(SDValue Chain,
   2188                                     CallingConv::ID CallConv,
   2189                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2190                                     SDLoc dl, SelectionDAG &DAG,
   2191                                     const CCValAssign &VA,
   2192                                     MachineFrameInfo *MFI,
   2193                                     unsigned i) const {
   2194   // Create the nodes corresponding to a load from this parameter slot.
   2195   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   2196   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
   2197       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   2198   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   2199   EVT ValVT;
   2200 
   2201   // If value is passed by pointer we have address passed instead of the value
   2202   // itself.
   2203   if (VA.getLocInfo() == CCValAssign::Indirect)
   2204     ValVT = VA.getLocVT();
   2205   else
   2206     ValVT = VA.getValVT();
   2207 
   2208   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   2209   // changed with more analysis.
    2210   // In case of tail call optimization, mark all arguments mutable, since they
    2211   // could be overwritten when the arguments of a tail call are lowered.
   2212   if (Flags.isByVal()) {
   2213     unsigned Bytes = Flags.getByValSize();
   2214     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   2215     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   2216     return DAG.getFrameIndex(FI, getPointerTy());
   2217   } else {
   2218     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   2219                                     VA.getLocMemOffset(), isImmutable);
   2220     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
   2221     return DAG.getLoad(ValVT, dl, Chain, FIN,
   2222                        MachinePointerInfo::getFixedStack(FI),
   2223                        false, false, false, 0);
   2224   }
   2225 }
   2226 
   2227 SDValue
   2228 X86TargetLowering::LowerFormalArguments(SDValue Chain,
   2229                                         CallingConv::ID CallConv,
   2230                                         bool isVarArg,
   2231                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   2232                                         SDLoc dl,
   2233                                         SelectionDAG &DAG,
   2234                                         SmallVectorImpl<SDValue> &InVals)
   2235                                           const {
   2236   MachineFunction &MF = DAG.getMachineFunction();
   2237   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2238 
   2239   const Function* Fn = MF.getFunction();
   2240   if (Fn->hasExternalLinkage() &&
   2241       Subtarget->isTargetCygMing() &&
   2242       Fn->getName() == "main")
   2243     FuncInfo->setForceFramePointer(true);
   2244 
   2245   MachineFrameInfo *MFI = MF.getFrameInfo();
   2246   bool Is64Bit = Subtarget->is64Bit();
   2247   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
   2248 
   2249   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2250          "Var args not supported with calling convention fastcc, ghc or hipe");
   2251 
   2252   // Assign locations to all of the incoming arguments.
   2253   SmallVector<CCValAssign, 16> ArgLocs;
   2254   CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
   2255                  ArgLocs, *DAG.getContext());
   2256 
   2257   // Allocate shadow area for Win64
   2258   if (IsWin64)
   2259     CCInfo.AllocateStack(32, 8);
   2260 
   2261   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   2262 
   2263   unsigned LastVal = ~0U;
   2264   SDValue ArgValue;
   2265   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2266     CCValAssign &VA = ArgLocs[i];
   2267     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   2268     // places.
   2269     assert(VA.getValNo() != LastVal &&
   2270            "Don't support value assigned to multiple locs yet");
   2271     (void)LastVal;
   2272     LastVal = VA.getValNo();
   2273 
   2274     if (VA.isRegLoc()) {
   2275       EVT RegVT = VA.getLocVT();
   2276       const TargetRegisterClass *RC;
   2277       if (RegVT == MVT::i32)
   2278         RC = &X86::GR32RegClass;
   2279       else if (Is64Bit && RegVT == MVT::i64)
   2280         RC = &X86::GR64RegClass;
   2281       else if (RegVT == MVT::f32)
   2282         RC = &X86::FR32RegClass;
   2283       else if (RegVT == MVT::f64)
   2284         RC = &X86::FR64RegClass;
   2285       else if (RegVT.is512BitVector())
   2286         RC = &X86::VR512RegClass;
   2287       else if (RegVT.is256BitVector())
   2288         RC = &X86::VR256RegClass;
   2289       else if (RegVT.is128BitVector())
   2290         RC = &X86::VR128RegClass;
   2291       else if (RegVT == MVT::x86mmx)
   2292         RC = &X86::VR64RegClass;
   2293       else if (RegVT == MVT::i1)
   2294         RC = &X86::VK1RegClass;
   2295       else if (RegVT == MVT::v8i1)
   2296         RC = &X86::VK8RegClass;
   2297       else if (RegVT == MVT::v16i1)
   2298         RC = &X86::VK16RegClass;
   2299       else
   2300         llvm_unreachable("Unknown argument type!");
   2301 
   2302       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   2303       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   2304 
   2305       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   2306       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   2307       // right size.
   2308       if (VA.getLocInfo() == CCValAssign::SExt)
   2309         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   2310                                DAG.getValueType(VA.getValVT()));
   2311       else if (VA.getLocInfo() == CCValAssign::ZExt)
   2312         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   2313                                DAG.getValueType(VA.getValVT()));
   2314       else if (VA.getLocInfo() == CCValAssign::BCvt)
   2315         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
   2316 
   2317       if (VA.isExtInLoc()) {
   2318         // Handle MMX values passed in XMM regs.
   2319         if (RegVT.isVector())
   2320           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
   2321         else
   2322           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   2323       }
   2324     } else {
   2325       assert(VA.isMemLoc());
   2326       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   2327     }
   2328 
   2329     // If value is passed via pointer - do a load.
   2330     if (VA.getLocInfo() == CCValAssign::Indirect)
   2331       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   2332                              MachinePointerInfo(), false, false, false, 0);
   2333 
   2334     InVals.push_back(ArgValue);
   2335   }
   2336 
   2337   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
   2338     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2339       // The x86-64 ABIs require that for returning structs by value we copy
   2340       // the sret argument into %rax/%eax (depending on ABI) for the return.
    2341       // Win32 requires us to put the sret argument in %eax as well.
   2342       // Save the argument into a virtual register so that we can access it
   2343       // from the return points.
   2344       if (Ins[i].Flags.isSRet()) {
   2345         unsigned Reg = FuncInfo->getSRetReturnReg();
   2346         if (!Reg) {
   2347           MVT PtrTy = getPointerTy();
   2348           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
   2349           FuncInfo->setSRetReturnReg(Reg);
   2350         }
   2351         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
   2352         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   2353         break;
   2354       }
   2355     }
   2356   }
   2357 
   2358   unsigned StackSize = CCInfo.getNextStackOffset();
   2359   // Align stack specially for tail calls.
   2360   if (FuncIsMadeTailCallSafe(CallConv,
   2361                              MF.getTarget().Options.GuaranteedTailCallOpt))
   2362     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   2363 
    2364   // If the function takes a variable number of arguments, make a frame index for
   2365   // the start of the first vararg value... for expansion of llvm.va_start.
   2366   if (isVarArg) {
   2367     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   2368                     CallConv != CallingConv::X86_ThisCall)) {
   2369       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
   2370     }
   2371     if (Is64Bit) {
   2372       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
   2373 
   2374       // FIXME: We should really autogenerate these arrays
   2375       static const MCPhysReg GPR64ArgRegsWin64[] = {
   2376         X86::RCX, X86::RDX, X86::R8,  X86::R9
   2377       };
   2378       static const MCPhysReg GPR64ArgRegs64Bit[] = {
   2379         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   2380       };
   2381       static const MCPhysReg XMMArgRegs64Bit[] = {
   2382         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2383         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2384       };
   2385       const MCPhysReg *GPR64ArgRegs;
   2386       unsigned NumXMMRegs = 0;
   2387 
   2388       if (IsWin64) {
   2389         // The XMM registers which might contain var arg parameters are shadowed
   2390         // in their paired GPR.  So we only need to save the GPR to their home
   2391         // slots.
   2392         TotalNumIntRegs = 4;
   2393         GPR64ArgRegs = GPR64ArgRegsWin64;
   2394       } else {
   2395         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
   2396         GPR64ArgRegs = GPR64ArgRegs64Bit;
   2397 
   2398         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
   2399                                                 TotalNumXMMRegs);
   2400       }
   2401       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
   2402                                                        TotalNumIntRegs);
   2403 
   2404       bool NoImplicitFloatOps = Fn->getAttributes().
   2405         hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
   2406       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
   2407              "SSE register cannot be used when SSE is disabled!");
   2408       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
   2409                NoImplicitFloatOps) &&
   2410              "SSE register cannot be used when SSE is disabled!");
   2411       if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
   2412           !Subtarget->hasSSE1())
   2413         // Kernel mode asks for SSE to be disabled, so don't push them
   2414         // on the stack.
   2415         TotalNumXMMRegs = 0;
   2416 
   2417       if (IsWin64) {
   2418         const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
   2419         // Get to the caller-allocated home save location.  Add 8 to account
   2420         // for the return address.
   2421         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   2422         FuncInfo->setRegSaveFrameIndex(
   2423           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   2424         // Fixup to set vararg frame on shadow area (4 x i64).
   2425         if (NumIntRegs < 4)
   2426           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   2427       } else {
   2428         // For X86-64, if there are vararg parameters that are passed via
   2429         // registers, then we must store them to their spots on the stack so
   2430         // they may be loaded by deferencing the result of va_next.
   2431         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   2432         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
   2433         FuncInfo->setRegSaveFrameIndex(
   2434           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
   2435                                false));
   2436       }
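
               // Sizing sketch for the non-Win64 path above, shown for illustration and
               // derived from the constants in this function: with TotalNumIntRegs = 6 and
               // TotalNumXMMRegs = 8 the register save area is
               //   6 * 8 + 8 * 16 = 48 + 128 = 176 bytes,
               // with the GP portion at offsets [0, 48) and the XMM portion at offsets
               // [48, 176), matching the GP/FP offsets recorded just above.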
   2437 
   2438       // Store the integer parameter registers.
   2439       SmallVector<SDValue, 8> MemOps;
   2440       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2441                                         getPointerTy());
   2442       unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2443       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
   2444         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
   2445                                   DAG.getIntPtrConstant(Offset));
   2446         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
   2447                                      &X86::GR64RegClass);
   2448         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
   2449         SDValue Store =
   2450           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2451                        MachinePointerInfo::getFixedStack(
   2452                          FuncInfo->getRegSaveFrameIndex(), Offset),
   2453                        false, false, 0);
   2454         MemOps.push_back(Store);
   2455         Offset += 8;
   2456       }
   2457 
   2458       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
   2459         // Now store the XMM (fp + vector) parameter registers.
   2460         SmallVector<SDValue, 11> SaveXMMOps;
   2461         SaveXMMOps.push_back(Chain);
   2462 
   2463         unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2464         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
   2465         SaveXMMOps.push_back(ALVal);
   2466 
   2467         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2468                                FuncInfo->getRegSaveFrameIndex()));
   2469         SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2470                                FuncInfo->getVarArgsFPOffset()));
   2471 
   2472         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
   2473           unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
   2474                                        &X86::VR128RegClass);
   2475           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
   2476           SaveXMMOps.push_back(Val);
   2477         }
   2478         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2479                                      MVT::Other, SaveXMMOps));
   2480       }
   2481 
   2482       if (!MemOps.empty())
   2483         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   2484     }
   2485   }
   2486 
   2487   // Some CCs need callee pop.
   2488   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2489                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2490     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2491   } else {
   2492     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2493     // If this is an sret function, the return should pop the hidden pointer.
   2494     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
   2495         !Subtarget->getTargetTriple().isOSMSVCRT() &&
   2496         argsAreStructReturn(Ins) == StackStructReturn)
   2497       FuncInfo->setBytesToPopOnReturn(4);
   2498   }
   2499 
   2500   if (!Is64Bit) {
   2501     // RegSaveFrameIndex is X86-64 only.
   2502     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   2503     if (CallConv == CallingConv::X86_FastCall ||
   2504         CallConv == CallingConv::X86_ThisCall)
   2505       // fastcc functions can't have varargs.
   2506       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2507   }
   2508 
   2509   FuncInfo->setArgumentStackSize(StackSize);
   2510 
   2511   return Chain;
   2512 }
   2513 
   2514 SDValue
   2515 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
   2516                                     SDValue StackPtr, SDValue Arg,
   2517                                     SDLoc dl, SelectionDAG &DAG,
   2518                                     const CCValAssign &VA,
   2519                                     ISD::ArgFlagsTy Flags) const {
   2520   unsigned LocMemOffset = VA.getLocMemOffset();
   2521   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   2522   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
   2523   if (Flags.isByVal())
   2524     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2525 
   2526   return DAG.getStore(Chain, dl, Arg, PtrOff,
   2527                       MachinePointerInfo::getStack(LocMemOffset),
   2528                       false, false, 0);
   2529 }
   2530 
   2531 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
   2532 /// optimization is performed and it is required.
   2533 SDValue
   2534 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   2535                                            SDValue &OutRetAddr, SDValue Chain,
   2536                                            bool IsTailCall, bool Is64Bit,
   2537                                            int FPDiff, SDLoc dl) const {
   2538   // Adjust the Return address stack slot.
   2539   EVT VT = getPointerTy();
   2540   OutRetAddr = getReturnAddressFrameIndex(DAG);
   2541 
   2542   // Load the "old" Return address.
   2543   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   2544                            false, false, false, 0);
   2545   return SDValue(OutRetAddr.getNode(), 1);
   2546 }
   2547 
   2548 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
   2549 /// optimization is performed and it is required (FPDiff!=0).
   2550 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
   2551                                         SDValue Chain, SDValue RetAddrFrIdx,
   2552                                         EVT PtrVT, unsigned SlotSize,
   2553                                         int FPDiff, SDLoc dl) {
   2554   // Store the return address to the appropriate stack slot.
   2555   if (!FPDiff) return Chain;
   2556   // Calculate the new stack slot for the return address.
   2557   int NewReturnAddrFI =
   2558     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
   2559                                          false);
   2560   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   2561   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   2562                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
   2563                        false, false, 0);
   2564   return Chain;
   2565 }
   2566 
   2567 SDValue
   2568 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   2569                              SmallVectorImpl<SDValue> &InVals) const {
   2570   SelectionDAG &DAG                     = CLI.DAG;
   2571   SDLoc &dl                             = CLI.DL;
   2572   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   2573   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
   2574   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   2575   SDValue Chain                         = CLI.Chain;
   2576   SDValue Callee                        = CLI.Callee;
   2577   CallingConv::ID CallConv              = CLI.CallConv;
   2578   bool &isTailCall                      = CLI.IsTailCall;
   2579   bool isVarArg                         = CLI.IsVarArg;
   2580 
   2581   MachineFunction &MF = DAG.getMachineFunction();
   2582   bool Is64Bit        = Subtarget->is64Bit();
   2583   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
   2584   StructReturnType SR = callIsStructReturn(Outs);
   2585   bool IsSibcall      = false;
   2586 
   2587   if (MF.getTarget().Options.DisableTailCalls)
   2588     isTailCall = false;
   2589 
   2590   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
   2591   if (IsMustTail) {
   2592     // Force this to be a tail call.  The verifier rules are enough to ensure
   2593     // that we can lower this successfully without moving the return address
   2594     // around.
   2595     isTailCall = true;
   2596   } else if (isTailCall) {
   2597     // Check if it's really possible to do a tail call.
   2598     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   2599                     isVarArg, SR != NotStructReturn,
   2600                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
   2601                     Outs, OutVals, Ins, DAG);
   2602 
   2603     // Sibcalls are automatically detected tailcalls which do not require
   2604     // ABI changes.
   2605     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   2606       IsSibcall = true;
   2607 
   2608     if (isTailCall)
   2609       ++NumTailCalls;
   2610   }
   2611 
   2612   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2613          "Var args not supported with calling convention fastcc, ghc or hipe");
   2614 
   2615   // Analyze operands of the call, assigning locations to each operand.
   2616   SmallVector<CCValAssign, 16> ArgLocs;
   2617   CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
   2618                  ArgLocs, *DAG.getContext());
   2619 
   2620   // Allocate shadow area for Win64
   2621   if (IsWin64)
   2622     CCInfo.AllocateStack(32, 8);
   2623 
   2624   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2625 
   2626   // Get a count of how many bytes are to be pushed on the stack.
   2627   unsigned NumBytes = CCInfo.getNextStackOffset();
   2628   if (IsSibcall)
    2629     // This is a sibcall. The outgoing memory operands reuse the incoming
    2630     // argument area in our caller's stack, so no new stack space is needed.
   2631     NumBytes = 0;
   2632   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
   2633            IsTailCallConvention(CallConv))
   2634     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   2635 
   2636   int FPDiff = 0;
   2637   if (isTailCall && !IsSibcall && !IsMustTail) {
   2638     // Lower arguments at fp - stackoffset + fpdiff.
   2639     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   2640     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
   2641 
   2642     FPDiff = NumBytesCallerPushed - NumBytes;
   2643 
    2644     // Record the delta by which the return-address stack slot moves, but only
    2645     // if the movement is larger than any previously recorded delta.
   2646     if (FPDiff < X86Info->getTCReturnAddrDelta())
   2647       X86Info->setTCReturnAddrDelta(FPDiff);
   2648   }
   2649 
   2650   unsigned NumBytesToPush = NumBytes;
   2651   unsigned NumBytesToPop = NumBytes;
   2652 
   2653   // If we have an inalloca argument, all stack space has already been allocated
    2654   // for us and is right at the top of the stack.  We don't support multiple
   2655   // arguments passed in memory when using inalloca.
   2656   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
   2657     NumBytesToPush = 0;
   2658     assert(ArgLocs.back().getLocMemOffset() == 0 &&
   2659            "an inalloca argument must be the only memory argument");
   2660   }
   2661 
   2662   if (!IsSibcall)
   2663     Chain = DAG.getCALLSEQ_START(
   2664         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
   2665 
   2666   SDValue RetAddrFrIdx;
   2667   // Load return address for tail calls.
   2668   if (isTailCall && FPDiff)
   2669     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   2670                                     Is64Bit, FPDiff, dl);
   2671 
   2672   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2673   SmallVector<SDValue, 8> MemOpChains;
   2674   SDValue StackPtr;
   2675 
   2676   // Walk the register/memloc assignments, inserting copies/loads.  In the case
    2677   // of tail call optimization, arguments are handled later.
   2678   const X86RegisterInfo *RegInfo =
   2679     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   2680   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2681     // Skip inalloca arguments, they have already been written.
   2682     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2683     if (Flags.isInAlloca())
   2684       continue;
   2685 
   2686     CCValAssign &VA = ArgLocs[i];
   2687     EVT RegVT = VA.getLocVT();
   2688     SDValue Arg = OutVals[i];
   2689     bool isByVal = Flags.isByVal();
   2690 
   2691     // Promote the value if needed.
   2692     switch (VA.getLocInfo()) {
   2693     default: llvm_unreachable("Unknown loc info!");
   2694     case CCValAssign::Full: break;
   2695     case CCValAssign::SExt:
   2696       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   2697       break;
   2698     case CCValAssign::ZExt:
   2699       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   2700       break;
   2701     case CCValAssign::AExt:
   2702       if (RegVT.is128BitVector()) {
   2703         // Special case: passing MMX values in XMM registers.
   2704         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
   2705         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   2706         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   2707       } else
   2708         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   2709       break;
   2710     case CCValAssign::BCvt:
   2711       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
   2712       break;
   2713     case CCValAssign::Indirect: {
   2714       // Store the argument.
   2715       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   2716       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   2717       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
   2718                            MachinePointerInfo::getFixedStack(FI),
   2719                            false, false, 0);
   2720       Arg = SpillSlot;
   2721       break;
   2722     }
   2723     }
   2724 
   2725     if (VA.isRegLoc()) {
   2726       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   2727       if (isVarArg && IsWin64) {
    2728         // The Win64 ABI requires an argument in an XMM reg to also be copied to
    2729         // the corresponding shadow GPR if the callee is a varargs function.
   2730         unsigned ShadowReg = 0;
   2731         switch (VA.getLocReg()) {
   2732         case X86::XMM0: ShadowReg = X86::RCX; break;
   2733         case X86::XMM1: ShadowReg = X86::RDX; break;
   2734         case X86::XMM2: ShadowReg = X86::R8; break;
   2735         case X86::XMM3: ShadowReg = X86::R9; break;
   2736         }
   2737         if (ShadowReg)
   2738           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   2739       }
   2740     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   2741       assert(VA.isMemLoc());
   2742       if (!StackPtr.getNode())
   2743         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   2744                                       getPointerTy());
   2745       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   2746                                              dl, DAG, VA, Flags));
   2747     }
   2748   }
   2749 
   2750   if (!MemOpChains.empty())
   2751     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
   2752 
   2753   if (Subtarget->isPICStyleGOT()) {
   2754     // ELF / PIC requires GOT in the EBX register before function calls via PLT
   2755     // GOT pointer.
   2756     if (!isTailCall) {
   2757       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
   2758                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
   2759     } else {
   2760       // If we are tail calling and generating PIC/GOT style code load the
   2761       // address of the callee into ECX. The value in ecx is used as target of
   2762       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   2763       // for tail calls on PIC/GOT architectures. Normally we would just put the
   2764       // address of GOT into ebx and then call target@PLT. But for tail calls
   2765       // ebx would be restored (since ebx is callee saved) before jumping to the
   2766       // target@PLT.
   2767 
   2768       // Note: The actual moving to ECX is done further down.
   2769       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   2770       if (G && !G->getGlobal()->hasHiddenVisibility() &&
   2771           !G->getGlobal()->hasProtectedVisibility())
   2772         Callee = LowerGlobalAddress(Callee, DAG);
   2773       else if (isa<ExternalSymbolSDNode>(Callee))
   2774         Callee = LowerExternalSymbol(Callee, DAG);
   2775     }
   2776   }
   2777 
   2778   if (Is64Bit && isVarArg && !IsWin64) {
   2779     // From AMD64 ABI document:
   2780     // For calls that may call functions that use varargs or stdargs
   2781     // (prototype-less calls or calls to functions containing ellipsis (...) in
    2782     // the declaration) %al is used as a hidden argument to specify the number
    2783     // of SSE registers used. The contents of %al do not need to match exactly
    2784     // the number of registers, but must be an upper bound on the number of SSE
    2785     // registers used, and must be in the range 0 - 8 inclusive.
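             //
             // For example (illustrative only): a call like printf("%f\n", x) passing one
             // double in %xmm0 may set %al to 1; any value that is an upper bound (up to
             // 8) on the XMM registers actually used would be equally valid.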
   2786 
   2787     // Count the number of XMM registers allocated.
   2788     static const MCPhysReg XMMArgRegs[] = {
   2789       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2790       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2791     };
   2792     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
   2793     assert((Subtarget->hasSSE1() || !NumXMMRegs)
   2794            && "SSE registers cannot be used when SSE is disabled");
   2795 
   2796     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
   2797                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   2798   }
   2799 
   2800   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   2801   // don't need this because the eligibility check rejects calls that require
   2802   // shuffling arguments passed in memory.
   2803   if (!IsSibcall && isTailCall) {
   2804     // Force all the incoming stack arguments to be loaded from the stack
   2805     // before any new outgoing arguments are stored to the stack, because the
   2806     // outgoing stack slots may alias the incoming argument stack slots, and
   2807     // the alias isn't otherwise explicit. This is slightly more conservative
   2808     // than necessary, because it means that each store effectively depends
   2809     // on every argument instead of just those arguments it would clobber.
   2810     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   2811 
   2812     SmallVector<SDValue, 8> MemOpChains2;
   2813     SDValue FIN;
   2814     int FI = 0;
   2815     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2816       CCValAssign &VA = ArgLocs[i];
   2817       if (VA.isRegLoc())
   2818         continue;
   2819       assert(VA.isMemLoc());
   2820       SDValue Arg = OutVals[i];
   2821       ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2822       // Skip inalloca arguments.  They don't require any work.
   2823       if (Flags.isInAlloca())
   2824         continue;
   2825       // Create frame index.
   2826       int32_t Offset = VA.getLocMemOffset()+FPDiff;
   2827       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   2828       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   2829       FIN = DAG.getFrameIndex(FI, getPointerTy());
   2830 
   2831       if (Flags.isByVal()) {
   2832         // Copy relative to framepointer.
   2833         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
   2834         if (!StackPtr.getNode())
   2835           StackPtr = DAG.getCopyFromReg(Chain, dl,
   2836                                         RegInfo->getStackRegister(),
   2837                                         getPointerTy());
   2838         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
   2839 
   2840         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   2841                                                          ArgChain,
   2842                                                          Flags, DAG, dl));
   2843       } else {
   2844         // Store relative to framepointer.
   2845         MemOpChains2.push_back(
   2846           DAG.getStore(ArgChain, dl, Arg, FIN,
   2847                        MachinePointerInfo::getFixedStack(FI),
   2848                        false, false, 0));
   2849       }
   2850     }
   2851 
   2852     if (!MemOpChains2.empty())
   2853       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
   2854 
   2855     // Store the return address to the appropriate stack slot.
   2856     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
   2857                                      getPointerTy(), RegInfo->getSlotSize(),
   2858                                      FPDiff, dl);
   2859   }
   2860 
   2861   // Build a sequence of copy-to-reg nodes chained together with token chain
   2862   // and flag operands which copy the outgoing args into registers.
   2863   SDValue InFlag;
   2864   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2865     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2866                              RegsToPass[i].second, InFlag);
   2867     InFlag = Chain.getValue(1);
   2868   }
   2869 
   2870   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
   2871     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   2872     // In the 64-bit large code model, we have to make all calls
   2873     // through a register, since the call instruction's 32-bit
   2874     // pc-relative offset may not be large enough to hold the whole
   2875     // address.
   2876   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   2877     // If the callee is a GlobalAddress node (quite common, every direct call
   2878     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   2879     // it.
   2880 
   2881     // We should use extra load for direct calls to dllimported functions in
   2882     // non-JIT mode.
   2883     const GlobalValue *GV = G->getGlobal();
   2884     if (!GV->hasDLLImportStorageClass()) {
   2885       unsigned char OpFlags = 0;
   2886       bool ExtraLoad = false;
   2887       unsigned WrapperKind = ISD::DELETED_NODE;
   2888 
   2889       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
    2890       // external symbols must go through the PLT in PIC mode.  If the symbol
   2891       // has hidden or protected visibility, or if it is static or local, then
   2892       // we don't need to use the PLT - we can directly call it.
   2893       if (Subtarget->isTargetELF() &&
   2894           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
   2895           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
   2896         OpFlags = X86II::MO_PLT;
   2897       } else if (Subtarget->isPICStyleStubAny() &&
   2898                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
   2899                  (!Subtarget->getTargetTriple().isMacOSX() ||
   2900                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2901         // PC-relative references to external symbols should go through $stub,
   2902         // unless we're building with the leopard linker or later, which
   2903         // automatically synthesizes these stubs.
   2904         OpFlags = X86II::MO_DARWIN_STUB;
   2905       } else if (Subtarget->isPICStyleRIPRel() &&
   2906                  isa<Function>(GV) &&
   2907                  cast<Function>(GV)->getAttributes().
   2908                    hasAttribute(AttributeSet::FunctionIndex,
   2909                                 Attribute::NonLazyBind)) {
   2910         // If the function is marked as non-lazy, generate an indirect call
   2911         // which loads from the GOT directly. This avoids runtime overhead
   2912         // at the cost of eager binding (and one extra byte of encoding).
   2913         OpFlags = X86II::MO_GOTPCREL;
   2914         WrapperKind = X86ISD::WrapperRIP;
   2915         ExtraLoad = true;
   2916       }
   2917 
   2918       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
   2919                                           G->getOffset(), OpFlags);
   2920 
   2921       // Add a wrapper if needed.
   2922       if (WrapperKind != ISD::DELETED_NODE)
   2923         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
   2924       // Add extra indirection if needed.
   2925       if (ExtraLoad)
   2926         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
   2927                              MachinePointerInfo::getGOT(),
   2928                              false, false, false, 0);
   2929     }
   2930   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   2931     unsigned char OpFlags = 0;
   2932 
   2933     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
   2934     // external symbols should go through the PLT.
   2935     if (Subtarget->isTargetELF() &&
   2936         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
   2937       OpFlags = X86II::MO_PLT;
   2938     } else if (Subtarget->isPICStyleStubAny() &&
   2939                (!Subtarget->getTargetTriple().isMacOSX() ||
   2940                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   2941       // PC-relative references to external symbols should go through $stub,
   2942       // unless we're building with the leopard linker or later, which
   2943       // automatically synthesizes these stubs.
   2944       OpFlags = X86II::MO_DARWIN_STUB;
   2945     }
   2946 
   2947     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
   2948                                          OpFlags);
   2949   }
   2950 
   2951   // Returns a chain & a flag for retval copy to use.
   2952   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   2953   SmallVector<SDValue, 8> Ops;
   2954 
   2955   if (!IsSibcall && isTailCall) {
   2956     Chain = DAG.getCALLSEQ_END(Chain,
   2957                                DAG.getIntPtrConstant(NumBytesToPop, true),
   2958                                DAG.getIntPtrConstant(0, true), InFlag, dl);
   2959     InFlag = Chain.getValue(1);
   2960   }
   2961 
   2962   Ops.push_back(Chain);
   2963   Ops.push_back(Callee);
   2964 
   2965   if (isTailCall)
   2966     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
   2967 
   2968   // Add argument registers to the end of the list so that they are known live
   2969   // into the call.
   2970   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   2971     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   2972                                   RegsToPass[i].second.getValueType()));
   2973 
   2974   // Add a register mask operand representing the call-preserved registers.
   2975   const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
   2976   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   2977   assert(Mask && "Missing call preserved mask for calling convention");
   2978   Ops.push_back(DAG.getRegisterMask(Mask));
   2979 
   2980   if (InFlag.getNode())
   2981     Ops.push_back(InFlag);
   2982 
   2983   if (isTailCall) {
   2984     // We used to do:
   2985     //// If this is the first return lowered for this function, add the regs
   2986     //// to the liveout set for the function.
   2987     // This isn't right, although it's probably harmless on x86; liveouts
   2988     // should be computed from returns not tail calls.  Consider a void
   2989     // function making a tail call to a function returning int.
   2990     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   2991   }
   2992 
   2993   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   2994   InFlag = Chain.getValue(1);
   2995 
   2996   // Create the CALLSEQ_END node.
   2997   unsigned NumBytesForCalleeToPop;
   2998   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2999                        DAG.getTarget().Options.GuaranteedTailCallOpt))
   3000     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
   3001   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
   3002            !Subtarget->getTargetTriple().isOSMSVCRT() &&
   3003            SR == StackStructReturn)
   3004     // If this is a call to a struct-return function, the callee
   3005     // pops the hidden struct pointer, so we have to push it back.
   3006     // This is common for Darwin/X86, Linux & Mingw32 targets.
   3007     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   3008     NumBytesForCalleeToPop = 4;
   3009   else
   3010     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
   3011 
   3012   // Returns a flag for retval copy to use.
   3013   if (!IsSibcall) {
   3014     Chain = DAG.getCALLSEQ_END(Chain,
   3015                                DAG.getIntPtrConstant(NumBytesToPop, true),
   3016                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
   3017                                                      true),
   3018                                InFlag, dl);
   3019     InFlag = Chain.getValue(1);
   3020   }
   3021 
   3022   // Handle result values, copying them out of physregs into vregs that we
   3023   // return.
   3024   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   3025                          Ins, dl, DAG, InVals);
   3026 }
   3027 
   3028 //===----------------------------------------------------------------------===//
   3029 //                Fast Calling Convention (tail call) implementation
   3030 //===----------------------------------------------------------------------===//
   3031 
    3032 //  Like StdCall, the callee cleans up the arguments, except that ECX is
    3033 //  reserved for storing the address of the tail-called function. Only two
    3034 //  registers are free for argument passing (inreg). Tail call optimization
    3035 //  is performed provided:
    3036 //                * tailcallopt is enabled
    3037 //                * caller/callee are fastcc
    3038 //  On X86_64 with GOT-style position independent code, only local
    3039 //  (within-module) calls are supported at the moment.
    3040 //  To keep the stack aligned according to the platform ABI, the function
    3041 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
    3042 //  multiple of the stack alignment (dynamic linkers need this - darwin's
    3043 //  dyld, for example).
    3044 //  If a tail-called callee has more arguments than the caller, the caller must
    3045 //  make room to move the RETADDR, by reserving an area the size of the argument
    3046 //  delta right after the original RETADDR, but before the saved frame pointer
    3047 //  or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
   3048 //  stack layout:
   3049 //    arg1
   3050 //    arg2
   3051 //    RETADDR
   3052 //    [ new RETADDR
   3053 //      move area ]
   3054 //    (possible EBP)
   3055 //    ESI
   3056 //    EDI
   3057 //    local1 ..
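         //
         //  A minimal numeric illustration (hypothetical sizes, x86-32 with -tailcallopt
         //  and fastcc): if the caller's own argument area is 12 bytes and the callee
         //  needs 28 bytes (both already of the 16n + 12 form produced by
         //  GetAlignedArgumentStackSize), then FPDiff = 12 - 28 = -16, so the RETADDR is
         //  moved 16 bytes further down the stack and a 16-byte move area is reserved.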
   3058 
    3059 /// GetAlignedArgumentStackSize - Align the stack argument size so that it is,
    3060 /// e.g., 16n + 12 for a 16-byte alignment requirement and a 4-byte slot size.
   3061 unsigned
   3062 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   3063                                                SelectionDAG& DAG) const {
   3064   MachineFunction &MF = DAG.getMachineFunction();
   3065   const TargetMachine &TM = MF.getTarget();
   3066   const X86RegisterInfo *RegInfo =
   3067     static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
   3068   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   3069   unsigned StackAlignment = TFI.getStackAlignment();
   3070   uint64_t AlignMask = StackAlignment - 1;
   3071   int64_t Offset = StackSize;
   3072   unsigned SlotSize = RegInfo->getSlotSize();
   3073   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    3074     // Remainder fits below (StackAlignment - SlotSize); just add the difference.
   3075     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   3076   } else {
    3077     // Mask out the lower bits, add StackAlignment once plus (StackAlignment - SlotSize).
   3078     Offset = ((~AlignMask) & Offset) + StackAlignment +
   3079       (StackAlignment-SlotSize);
   3080   }
   3081   return Offset;
   3082 }
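
         // Worked example (illustrative, assuming StackAlignment = 16 and a 4-byte slot
         // size as on x86-32): AlignMask = 15 and StackAlignment - SlotSize = 12.
         //   StackSize = 20: 20 & 15 = 4  <= 12, so Offset = 20 + (12 - 4) = 28 = 16*1 + 12.
         //   StackSize = 30: 30 & 15 = 14 >  12, so Offset = (30 & ~15) + 16 + 12 = 44 = 16*2 + 12.
         // Either way the result has the form 16n + 12, leaving exactly one slot for the
         // return address below the next 16-byte boundary.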
   3083 
   3084 /// MatchingStackOffset - Return true if the given stack call argument is
    3085 /// already available in the same relative position in the caller's
   3086 /// incoming argument stack.
   3087 static
   3088 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   3089                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   3090                          const X86InstrInfo *TII) {
   3091   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   3092   int FI = INT_MAX;
   3093   if (Arg.getOpcode() == ISD::CopyFromReg) {
   3094     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   3095     if (!TargetRegisterInfo::isVirtualRegister(VR))
   3096       return false;
   3097     MachineInstr *Def = MRI->getVRegDef(VR);
   3098     if (!Def)
   3099       return false;
   3100     if (!Flags.isByVal()) {
   3101       if (!TII->isLoadFromStackSlot(Def, FI))
   3102         return false;
   3103     } else {
   3104       unsigned Opcode = Def->getOpcode();
   3105       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
   3106           Def->getOperand(1).isFI()) {
   3107         FI = Def->getOperand(1).getIndex();
   3108         Bytes = Flags.getByValSize();
   3109       } else
   3110         return false;
   3111     }
   3112   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   3113     if (Flags.isByVal())
   3114       // ByVal argument is passed in as a pointer but it's now being
   3115       // dereferenced. e.g.
   3116       // define @foo(%struct.X* %A) {
   3117       //   tail call @bar(%struct.X* byval %A)
   3118       // }
   3119       return false;
   3120     SDValue Ptr = Ld->getBasePtr();
   3121     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   3122     if (!FINode)
   3123       return false;
   3124     FI = FINode->getIndex();
   3125   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   3126     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   3127     FI = FINode->getIndex();
   3128     Bytes = Flags.getByValSize();
   3129   } else
   3130     return false;
   3131 
   3132   assert(FI != INT_MAX);
   3133   if (!MFI->isFixedObjectIndex(FI))
   3134     return false;
   3135   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
   3136 }
   3137 
   3138 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
   3139 /// for tail call optimization. Targets which want to do tail call
   3140 /// optimization should implement this function.
   3141 bool
   3142 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   3143                                                      CallingConv::ID CalleeCC,
   3144                                                      bool isVarArg,
   3145                                                      bool isCalleeStructRet,
   3146                                                      bool isCallerStructRet,
   3147                                                      Type *RetTy,
   3148                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
   3149                                     const SmallVectorImpl<SDValue> &OutVals,
   3150                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   3151                                                      SelectionDAG &DAG) const {
   3152   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
   3153     return false;
   3154 
   3155   // If -tailcallopt is specified, make fastcc functions tail-callable.
   3156   const MachineFunction &MF = DAG.getMachineFunction();
   3157   const Function *CallerF = MF.getFunction();
   3158 
   3159   // If the function return type is x86_fp80 and the callee return type is not,
   3160   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   3161   // perform a tailcall optimization here.
   3162   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
   3163     return false;
   3164 
   3165   CallingConv::ID CallerCC = CallerF->getCallingConv();
   3166   bool CCMatch = CallerCC == CalleeCC;
   3167   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
   3168   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
   3169 
   3170   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
   3171     if (IsTailCallConvention(CalleeCC) && CCMatch)
   3172       return true;
   3173     return false;
   3174   }
   3175 
   3176   // Look for obvious safe cases to perform tail call optimization that do not
   3177   // require ABI changes. This is what gcc calls sibcall.
   3178 
   3179   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   3180   // emit a special epilogue.
   3181   const X86RegisterInfo *RegInfo =
   3182     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   3183   if (RegInfo->needsStackRealignment(MF))
   3184     return false;
   3185 
   3186   // Also avoid sibcall optimization if either caller or callee uses struct
   3187   // return semantics.
   3188   if (isCalleeStructRet || isCallerStructRet)
   3189     return false;
   3190 
   3191   // An stdcall/thiscall caller is expected to clean up its arguments; the
   3192   // callee isn't going to do that.
   3193   // FIXME: this is more restrictive than needed. We could produce a tailcall
   3194   // when the stack adjustment matches. For example, with a thiscall that takes
   3195   // only one argument.
   3196   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
   3197                    CallerCC == CallingConv::X86_ThisCall))
   3198     return false;
   3199 
   3200   // Do not sibcall optimize vararg calls unless all arguments are passed via
   3201   // registers.
   3202   if (isVarArg && !Outs.empty()) {
   3203 
   3204     // Optimizing for varargs on Win64 is unlikely to be safe without
   3205     // additional testing.
   3206     if (IsCalleeWin64 || IsCallerWin64)
   3207       return false;
   3208 
   3209     SmallVector<CCValAssign, 16> ArgLocs;
   3210     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   3211                    DAG.getTarget(), ArgLocs, *DAG.getContext());
   3212 
   3213     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3214     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   3215       if (!ArgLocs[i].isRegLoc())
   3216         return false;
   3217   }
   3218 
   3219   // If the call result is in ST0 / ST1, it needs to be popped off the x87
   3220   // stack.  Therefore, if it's not used by the call it is not safe to optimize
   3221   // this into a sibcall.
   3222   bool Unused = false;
   3223   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   3224     if (!Ins[i].Used) {
   3225       Unused = true;
   3226       break;
   3227     }
   3228   }
   3229   if (Unused) {
   3230     SmallVector<CCValAssign, 16> RVLocs;
   3231     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
   3232                    DAG.getTarget(), RVLocs, *DAG.getContext());
   3233     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   3234     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   3235       CCValAssign &VA = RVLocs[i];
   3236       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
   3237         return false;
   3238     }
   3239   }
   3240 
   3241   // If the calling conventions do not match, then we'd better make sure the
   3242   // results are returned in the same way as what the caller expects.
   3243   if (!CCMatch) {
   3244     SmallVector<CCValAssign, 16> RVLocs1;
   3245     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
   3246                     DAG.getTarget(), RVLocs1, *DAG.getContext());
   3247     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
   3248 
   3249     SmallVector<CCValAssign, 16> RVLocs2;
   3250     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
   3251                     DAG.getTarget(), RVLocs2, *DAG.getContext());
   3252     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
   3253 
   3254     if (RVLocs1.size() != RVLocs2.size())
   3255       return false;
   3256     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   3257       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   3258         return false;
   3259       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   3260         return false;
   3261       if (RVLocs1[i].isRegLoc()) {
   3262         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   3263           return false;
   3264       } else {
   3265         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   3266           return false;
   3267       }
   3268     }
   3269   }
   3270 
   3271   // If the callee takes no arguments then go on to check the results of the
   3272   // call.
   3273   if (!Outs.empty()) {
   3274     // Check if stack adjustment is needed. For now, do not do this if any
   3275     // argument is passed on the stack.
   3276     SmallVector<CCValAssign, 16> ArgLocs;
   3277     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   3278                    DAG.getTarget(), ArgLocs, *DAG.getContext());
   3279 
   3280     // Allocate shadow area for Win64
   3281     if (IsCalleeWin64)
   3282       CCInfo.AllocateStack(32, 8);
   3283 
   3284     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3285     if (CCInfo.getNextStackOffset()) {
   3286       MachineFunction &MF = DAG.getMachineFunction();
   3287       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
   3288         return false;
   3289 
   3290       // Check if the arguments are already laid out in the right way as
   3291       // the caller's fixed stack objects.
   3292       MachineFrameInfo *MFI = MF.getFrameInfo();
   3293       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   3294       const X86InstrInfo *TII =
   3295           static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
   3296       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3297         CCValAssign &VA = ArgLocs[i];
   3298         SDValue Arg = OutVals[i];
   3299         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3300         if (VA.getLocInfo() == CCValAssign::Indirect)
   3301           return false;
   3302         if (!VA.isRegLoc()) {
   3303           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   3304                                    MFI, MRI, TII))
   3305             return false;
   3306         }
   3307       }
   3308     }
   3309 
   3310     // If the tailcall address may be in a register, then make sure it's
   3311     // possible to register allocate for it. In 32-bit, the call address can
   3312     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   3313     // callee-saved registers are restored. These happen to be the same
   3314     // registers used to pass 'inreg' arguments so watch out for those.
   3315     if (!Subtarget->is64Bit() &&
   3316         ((!isa<GlobalAddressSDNode>(Callee) &&
   3317           !isa<ExternalSymbolSDNode>(Callee)) ||
   3318          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
   3319       unsigned NumInRegs = 0;
   3320       // In PIC we need an extra register to formulate the address computation
   3321       // for the callee.
   3322       unsigned MaxInRegs =
   3323           (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
   3324 
   3325       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3326         CCValAssign &VA = ArgLocs[i];
   3327         if (!VA.isRegLoc())
   3328           continue;
   3329         unsigned Reg = VA.getLocReg();
   3330         switch (Reg) {
   3331         default: break;
   3332         case X86::EAX: case X86::EDX: case X86::ECX:
   3333           if (++NumInRegs == MaxInRegs)
   3334             return false;
   3335           break;
   3336         }
   3337       }
   3338     }
   3339   }
   3340 
   3341   return true;
   3342 }
   3343 
   3344 FastISel *
   3345 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   3346                                   const TargetLibraryInfo *libInfo) const {
   3347   return X86::createFastISel(funcInfo, libInfo);
   3348 }
   3349 
   3350 //===----------------------------------------------------------------------===//
   3351 //                           Other Lowering Hooks
   3352 //===----------------------------------------------------------------------===//
   3353 
   3354 static bool MayFoldLoad(SDValue Op) {
   3355   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   3356 }
   3357 
   3358 static bool MayFoldIntoStore(SDValue Op) {
   3359   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   3360 }
   3361 
   3362 static bool isTargetShuffle(unsigned Opcode) {
   3363   switch(Opcode) {
   3364   default: return false;
   3365   case X86ISD::PSHUFD:
   3366   case X86ISD::PSHUFHW:
   3367   case X86ISD::PSHUFLW:
   3368   case X86ISD::SHUFP:
   3369   case X86ISD::PALIGNR:
   3370   case X86ISD::MOVLHPS:
   3371   case X86ISD::MOVLHPD:
   3372   case X86ISD::MOVHLPS:
   3373   case X86ISD::MOVLPS:
   3374   case X86ISD::MOVLPD:
   3375   case X86ISD::MOVSHDUP:
   3376   case X86ISD::MOVSLDUP:
   3377   case X86ISD::MOVDDUP:
   3378   case X86ISD::MOVSS:
   3379   case X86ISD::MOVSD:
   3380   case X86ISD::UNPCKL:
   3381   case X86ISD::UNPCKH:
   3382   case X86ISD::VPERMILP:
   3383   case X86ISD::VPERM2X128:
   3384   case X86ISD::VPERMI:
   3385     return true;
   3386   }
   3387 }
   3388 
   3389 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3390                                     SDValue V1, SelectionDAG &DAG) {
   3391   switch(Opc) {
   3392   default: llvm_unreachable("Unknown x86 shuffle node");
   3393   case X86ISD::MOVSHDUP:
   3394   case X86ISD::MOVSLDUP:
   3395   case X86ISD::MOVDDUP:
   3396     return DAG.getNode(Opc, dl, VT, V1);
   3397   }
   3398 }
   3399 
   3400 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3401                                     SDValue V1, unsigned TargetMask,
   3402                                     SelectionDAG &DAG) {
   3403   switch(Opc) {
   3404   default: llvm_unreachable("Unknown x86 shuffle node");
   3405   case X86ISD::PSHUFD:
   3406   case X86ISD::PSHUFHW:
   3407   case X86ISD::PSHUFLW:
   3408   case X86ISD::VPERMILP:
   3409   case X86ISD::VPERMI:
   3410     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   3411   }
   3412 }
   3413 
   3414 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3415                                     SDValue V1, SDValue V2, unsigned TargetMask,
   3416                                     SelectionDAG &DAG) {
   3417   switch(Opc) {
   3418   default: llvm_unreachable("Unknown x86 shuffle node");
   3419   case X86ISD::PALIGNR:
   3420   case X86ISD::SHUFP:
   3421   case X86ISD::VPERM2X128:
   3422     return DAG.getNode(Opc, dl, VT, V1, V2,
   3423                        DAG.getConstant(TargetMask, MVT::i8));
   3424   }
   3425 }
   3426 
   3427 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3428                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   3429   switch(Opc) {
   3430   default: llvm_unreachable("Unknown x86 shuffle node");
   3431   case X86ISD::MOVLHPS:
   3432   case X86ISD::MOVLHPD:
   3433   case X86ISD::MOVHLPS:
   3434   case X86ISD::MOVLPS:
   3435   case X86ISD::MOVLPD:
   3436   case X86ISD::MOVSS:
   3437   case X86ISD::MOVSD:
   3438   case X86ISD::UNPCKL:
   3439   case X86ISD::UNPCKH:
   3440     return DAG.getNode(Opc, dl, VT, V1, V2);
   3441   }
   3442 }
   3443 
   3444 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   3445   MachineFunction &MF = DAG.getMachineFunction();
   3446   const X86RegisterInfo *RegInfo =
   3447     static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
   3448   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   3449   int ReturnAddrIndex = FuncInfo->getRAIndex();
   3450 
   3451   if (ReturnAddrIndex == 0) {
   3452     // Set up a frame object for the return address.
   3453     unsigned SlotSize = RegInfo->getSlotSize();
   3454     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
   3455                                                            -(int64_t)SlotSize,
   3456                                                            false);
   3457     FuncInfo->setRAIndex(ReturnAddrIndex);
   3458   }
   3459 
   3460   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
   3461 }
   3462 
   3463 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   3464                                        bool hasSymbolicDisplacement) {
   3465   // Offset should fit into 32 bit immediate field.
   3466   if (!isInt<32>(Offset))
   3467     return false;
   3468 
   3469   // If we don't have a symbolic displacement - we don't have any extra
   3470   // restrictions.
   3471   if (!hasSymbolicDisplacement)
   3472     return true;
   3473 
   3474   // FIXME: Some tweaks might be needed for medium code model.
   3475   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3476     return false;
   3477 
   3478   // For the small code model we assume the last object ends at least 16MB
   3479   // before the 31-bit boundary. We also accept large negative constants, since
   3480   // all objects lie in the positive half of the address space.
   3481   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3482     return true;
   3483 
   3484   // For the kernel code model we know that all objects reside in the negative
   3485   // half of the 32-bit address space. Negative offsets must be rejected, since
   3486   // they may push an address out of range, but large positive ones are fine.
   3487   if (M == CodeModel::Kernel && Offset > 0)
   3488     return true;
   3489 
   3490   return false;
   3491 }
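        // Illustrative behavior of the checks above: with a symbolic displacement,
        // the small code model accepts an offset of 15MB (0x00F00000) but rejects
        // 16MB (0x01000000), while the kernel code model accepts any positive
        // offset that fits in a 32-bit immediate.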
   3492 
   3493 /// isCalleePop - Determines whether the callee is required to pop its
   3494 /// own arguments. Callee pop is necessary to support tail calls.
   3495 bool X86::isCalleePop(CallingConv::ID CallingConv,
   3496                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
   3497   if (IsVarArg)
   3498     return false;
   3499 
   3500   switch (CallingConv) {
   3501   default:
   3502     return false;
   3503   case CallingConv::X86_StdCall:
   3504     return !is64Bit;
   3505   case CallingConv::X86_FastCall:
   3506     return !is64Bit;
   3507   case CallingConv::X86_ThisCall:
   3508     return !is64Bit;
   3509   case CallingConv::Fast:
   3510     return TailCallOpt;
   3511   case CallingConv::GHC:
   3512     return TailCallOpt;
   3513   case CallingConv::HiPE:
   3514     return TailCallOpt;
   3515   }
   3516 }
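        // For example, a 32-bit stdcall, fastcall or thiscall callee pops its own
        // arguments (it returns with "ret imm16"), vararg callees never do, and
        // fastcc, GHC and HiPE callees pop only when the guaranteed tail call
        // optimization flag (TailCallOpt) is set.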
   3517 
   3518 /// \brief Return true if the condition is an unsigned comparison operation.
   3519 static bool isX86CCUnsigned(unsigned X86CC) {
   3520   switch (X86CC) {
   3521   default: llvm_unreachable("Invalid integer condition!");
   3522   case X86::COND_E:     return true;
   3523   case X86::COND_G:     return false;
   3524   case X86::COND_GE:    return false;
   3525   case X86::COND_L:     return false;
   3526   case X86::COND_LE:    return false;
   3527   case X86::COND_NE:    return true;
   3528   case X86::COND_B:     return true;
   3529   case X86::COND_A:     return true;
   3530   case X86::COND_BE:    return true;
   3531   case X86::COND_AE:    return true;
   3532   }
   3533   llvm_unreachable("covered switch fell through?!");
   3534 }
   3535 
   3536 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
   3537 /// X86-specific condition code, returning the condition code and the LHS/RHS
   3538 /// of the comparison to make.
   3539 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
   3540                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
   3541   if (!isFP) {
   3542     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   3543       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
   3544         // X > -1   -> X == 0, jump !sign.
   3545         RHS = DAG.getConstant(0, RHS.getValueType());
   3546         return X86::COND_NS;
   3547       }
   3548       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
   3549         // X < 0   -> X == 0, jump on sign.
   3550         return X86::COND_S;
   3551       }
   3552       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   3553         // X < 1   -> X <= 0
   3554         RHS = DAG.getConstant(0, RHS.getValueType());
   3555         return X86::COND_LE;
   3556       }
   3557     }
   3558 
   3559     switch (SetCCOpcode) {
   3560     default: llvm_unreachable("Invalid integer condition!");
   3561     case ISD::SETEQ:  return X86::COND_E;
   3562     case ISD::SETGT:  return X86::COND_G;
   3563     case ISD::SETGE:  return X86::COND_GE;
   3564     case ISD::SETLT:  return X86::COND_L;
   3565     case ISD::SETLE:  return X86::COND_LE;
   3566     case ISD::SETNE:  return X86::COND_NE;
   3567     case ISD::SETULT: return X86::COND_B;
   3568     case ISD::SETUGT: return X86::COND_A;
   3569     case ISD::SETULE: return X86::COND_BE;
   3570     case ISD::SETUGE: return X86::COND_AE;
   3571     }
   3572   }
   3573 
   3574   // First determine if it is required or is profitable to flip the operands.
   3575 
   3576   // If LHS is a foldable load, but RHS is not, flip the condition.
   3577   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   3578       !ISD::isNON_EXTLoad(RHS.getNode())) {
   3579     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   3580     std::swap(LHS, RHS);
   3581   }
   3582 
   3583   switch (SetCCOpcode) {
   3584   default: break;
   3585   case ISD::SETOLT:
   3586   case ISD::SETOLE:
   3587   case ISD::SETUGT:
   3588   case ISD::SETUGE:
   3589     std::swap(LHS, RHS);
   3590     break;
   3591   }
   3592 
   3593   // On a floating point condition, the flags are set as follows:
   3594   // ZF  PF  CF   op
   3595   //  0 | 0 | 0 | X > Y
   3596   //  0 | 0 | 1 | X < Y
   3597   //  1 | 0 | 0 | X == Y
   3598   //  1 | 1 | 1 | unordered
   3599   switch (SetCCOpcode) {
   3600   default: llvm_unreachable("Condcode should be pre-legalized away");
   3601   case ISD::SETUEQ:
   3602   case ISD::SETEQ:   return X86::COND_E;
   3603   case ISD::SETOLT:              // flipped
   3604   case ISD::SETOGT:
   3605   case ISD::SETGT:   return X86::COND_A;
   3606   case ISD::SETOLE:              // flipped
   3607   case ISD::SETOGE:
   3608   case ISD::SETGE:   return X86::COND_AE;
   3609   case ISD::SETUGT:              // flipped
   3610   case ISD::SETULT:
   3611   case ISD::SETLT:   return X86::COND_B;
   3612   case ISD::SETUGE:              // flipped
   3613   case ISD::SETULE:
   3614   case ISD::SETLE:   return X86::COND_BE;
   3615   case ISD::SETONE:
   3616   case ISD::SETNE:   return X86::COND_NE;
   3617   case ISD::SETUO:   return X86::COND_P;
   3618   case ISD::SETO:    return X86::COND_NP;
   3619   case ISD::SETOEQ:
   3620   case ISD::SETUNE:  return X86::COND_INVALID;
   3621   }
   3622 }
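        // Worked examples of the translation above: (i32 X setgt -1) becomes a
        // compare of X against 0 with COND_NS, and (f64 X setolt Y) swaps the
        // operands and returns COND_A, so the unordered case (ZF=PF=CF=1 in the
        // table above) does not satisfy the condition.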
   3623 
   3624 /// hasFPCMov - Return true if there is a floating-point cmov for the specified
   3625 /// X86 condition code. The current x86 ISA includes these FP cmov instructions:
   3626 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   3627 static bool hasFPCMov(unsigned X86CC) {
   3628   switch (X86CC) {
   3629   default:
   3630     return false;
   3631   case X86::COND_B:
   3632   case X86::COND_BE:
   3633   case X86::COND_E:
   3634   case X86::COND_P:
   3635   case X86::COND_A:
   3636   case X86::COND_AE:
   3637   case X86::COND_NE:
   3638   case X86::COND_NP:
   3639     return true;
   3640   }
   3641 }
   3642 
   3643 /// isFPImmLegal - Returns true if the target can instruction select the
   3644 /// specified FP immediate natively. If false, the legalizer will
   3645 /// materialize the FP immediate as a load from a constant pool.
   3646 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   3647   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   3648     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   3649       return true;
   3650   }
   3651   return false;
   3652 }
   3653 
   3654 /// \brief Returns true if it is beneficial to convert a load of a constant
   3655 /// to just the constant itself.
   3656 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   3657                                                           Type *Ty) const {
   3658   assert(Ty->isIntegerTy());
   3659 
   3660   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   3661   if (BitSize == 0 || BitSize > 64)
   3662     return false;
   3663   return true;
   3664 }
   3665 
   3666 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
   3667 /// the specified range (L, H].
   3668 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   3669   return (Val < 0) || (Val >= Low && Val < Hi);
   3670 }
   3671 
   3672 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
   3673 /// specified value.
   3674 static bool isUndefOrEqual(int Val, int CmpVal) {
   3675   return (Val < 0 || Val == CmpVal);
   3676 }
   3677 
   3678 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
   3679 /// from position Pos and ending at Pos+Size, matches the sequential values
   3680 /// [Low, Low+Size) or is undef.
   3681 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   3682                                        unsigned Pos, unsigned Size, int Low) {
   3683   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   3684     if (!isUndefOrEqual(Mask[i], Low))
   3685       return false;
   3686   return true;
   3687 }
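        // For example, with Mask = <4, -1, 6, 7, 0, 1, 2, 3>,
        // isSequentialOrUndefInRange(Mask, 0, 4, 4) is true (positions 0..3 match
        // 4, 5, 6, 7 with an undef at position 1), while
        // isSequentialOrUndefInRange(Mask, 4, 4, 4) is false (position 4 is 0).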
   3688 
   3689 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
   3690 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
   3691 /// the second operand.
   3692 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
   3693   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
   3694     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   3695   if (VT == MVT::v2f64 || VT == MVT::v2i64)
   3696     return (Mask[0] < 2 && Mask[1] < 2);
   3697   return false;
   3698 }
   3699 
   3700 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
   3701 /// is suitable for input to PSHUFHW.
   3702 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   3703   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
   3704     return false;
   3705 
   3706   // Lower quadword copied in order or undef.
   3707   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
   3708     return false;
   3709 
   3710   // Upper quadword shuffled.
   3711   for (unsigned i = 4; i != 8; ++i)
   3712     if (!isUndefOrInRange(Mask[i], 4, 8))
   3713       return false;
   3714 
   3715   if (VT == MVT::v16i16) {
   3716     // Lower quadword copied in order or undef.
   3717     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
   3718       return false;
   3719 
   3720     // Upper quadword shuffled.
   3721     for (unsigned i = 12; i != 16; ++i)
   3722       if (!isUndefOrInRange(Mask[i], 12, 16))
   3723         return false;
   3724   }
   3725 
   3726   return true;
   3727 }
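        // For example, the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> is a valid PSHUFHW
        // mask: the low quadword is copied in order and every high-quadword index
        // stays within [4, 8). The v16i16 form additionally requires the same
        // property in the upper 128-bit lane.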
   3728 
   3729 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
   3730 /// is suitable for input to PSHUFLW.
   3731 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   3732   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
   3733     return false;
   3734 
   3735   // Upper quadword copied in order.
   3736   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
   3737     return false;
   3738 
   3739   // Lower quadword shuffled.
   3740   for (unsigned i = 0; i != 4; ++i)
   3741     if (!isUndefOrInRange(Mask[i], 0, 4))
   3742       return false;
   3743 
   3744   if (VT == MVT::v16i16) {
   3745     // Upper quadword copied in order.
   3746     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
   3747       return false;
   3748 
   3749     // Lower quadword shuffled.
   3750     for (unsigned i = 8; i != 12; ++i)
   3751       if (!isUndefOrInRange(Mask[i], 8, 12))
   3752         return false;
   3753   }
   3754 
   3755   return true;
   3756 }
   3757 
   3758 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
   3759 /// is suitable for input to PALIGNR.
   3760 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
   3761                           const X86Subtarget *Subtarget) {
   3762   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
   3763       (VT.is256BitVector() && !Subtarget->hasInt256()))
   3764     return false;
   3765 
   3766   unsigned NumElts = VT.getVectorNumElements();
   3767   unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
   3768   unsigned NumLaneElts = NumElts/NumLanes;
   3769 
   3770   // Do not handle 64-bit element shuffles with palignr.
   3771   if (NumLaneElts == 2)
   3772     return false;
   3773 
   3774   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
   3775     unsigned i;
   3776     for (i = 0; i != NumLaneElts; ++i) {
   3777       if (Mask[i+l] >= 0)
   3778         break;
   3779     }
   3780 
   3781     // Lane is all undef, go to next lane
   3782     if (i == NumLaneElts)
   3783       continue;
   3784 
   3785     int Start = Mask[i+l];
   3786 
   3787     // Make sure it's in this lane in one of the sources
   3788     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
   3789         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
   3790       return false;
   3791 
   3792     // If not lane 0, then we must match lane 0
   3793     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
   3794       return false;
   3795 
   3796     // Correct second source to be contiguous with first source
   3797     if (Start >= (int)NumElts)
   3798       Start -= NumElts - NumLaneElts;
   3799 
   3800     // Make sure we're shifting in the right direction.
   3801     if (Start <= (int)(i+l))
   3802       return false;
   3803 
   3804     Start -= i;
   3805 
   3806     // Check the rest of the elements to see if they are consecutive.
   3807     for (++i; i != NumLaneElts; ++i) {
   3808       int Idx = Mask[i+l];
   3809 
   3810       // Make sure it's in this lane
   3811       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
   3812           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
   3813         return false;
   3814 
   3815       // If not lane 0, then we must match lane 0
   3816       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
   3817         return false;
   3818 
   3819       if (Idx >= (int)NumElts)
   3820         Idx -= NumElts - NumLaneElts;
   3821 
   3822       if (!isUndefOrEqual(Idx, Start+i))
   3823         return false;
   3824 
   3825     }
   3826   }
   3827 
   3828   return true;
   3829 }
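        // For example, with SSSE3 available, the v16i8 mask
        // <5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20>
        // is accepted: it is a contiguous window starting at byte 5 that spans the
        // boundary between the two 16-byte sources.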
   3830 
   3831 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
   3832 /// the two vector operands have swapped position.
   3833 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
   3834                                      unsigned NumElems) {
   3835   for (unsigned i = 0; i != NumElems; ++i) {
   3836     int idx = Mask[i];
   3837     if (idx < 0)
   3838       continue;
   3839     else if (idx < (int)NumElems)
   3840       Mask[i] = idx + NumElems;
   3841     else
   3842       Mask[i] = idx - NumElems;
   3843   }
   3844 }
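        // For example, with NumElems == 4 the mask <0, 5, -1, 3> becomes
        // <4, 1, -1, 7>: indices into the first source now refer to the second
        // source and vice versa, while undef (-1) entries are left alone.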
   3845 
   3846 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
   3847 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
   3848 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for the sources to
   3849 /// be in the reverse order of what x86 shuffles want.
   3850 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
   3851 
   3852   unsigned NumElems = VT.getVectorNumElements();
   3853   unsigned NumLanes = VT.getSizeInBits()/128;
   3854   unsigned NumLaneElems = NumElems/NumLanes;
   3855 
   3856   if (NumLaneElems != 2 && NumLaneElems != 4)
   3857     return false;
   3858 
   3859   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   3860   bool symetricMaskRequired =
   3861     (VT.getSizeInBits() >= 256) && (EltSize == 32);
   3862 
   3863   // VSHUFPSY divides the resulting vector into 4 chunks.
   3864   // The sources are also split into 4 chunks, and each destination
   3865   // chunk must come from a different source chunk.
   3866   //
   3867   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
   3868   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
   3869   //
   3870   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
   3871   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
   3872   //
   3873   // VSHUFPDY divides the resulting vector into 4 chunks.
   3874   // The sources are also split into 4 chunks, and each destination
   3875   // chunk must come from a different source chunk.
   3876   //
   3877   //  SRC1 =>      X3       X2       X1       X0
   3878   //  SRC2 =>      Y3       Y2       Y1       Y0
   3879   //
   3880   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   3881   //
   3882   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
   3883   unsigned HalfLaneElems = NumLaneElems/2;
   3884   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
   3885     for (unsigned i = 0; i != NumLaneElems; ++i) {
   3886       int Idx = Mask[i+l];
   3887       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
   3888       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
   3889         return false;
   3890       // For VSHUFPSY, the mask of the second half must be the same as the
   3891       // first but with the appropriate offsets. This works in the same way as
   3892       // VPERMILPS works with masks.
   3893       if (!symetricMaskRequired || Idx < 0)
   3894         continue;
   3895       if (MaskVal[i] < 0) {
   3896         MaskVal[i] = Idx - l;
   3897         continue;
   3898       }
   3899       if ((signed)(Idx - l) != MaskVal[i])
   3900         return false;
   3901     }
   3902   }
   3903 
   3904   return true;
   3905 }
   3906 
   3907 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3908 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
   3909 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
   3910   if (!VT.is128BitVector())
   3911     return false;
   3912 
   3913   unsigned NumElems = VT.getVectorNumElements();
   3914 
   3915   if (NumElems != 4)
   3916     return false;
   3917 
   3918   // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3
   3919   return isUndefOrEqual(Mask[0], 6) &&
   3920          isUndefOrEqual(Mask[1], 7) &&
   3921          isUndefOrEqual(Mask[2], 2) &&
   3922          isUndefOrEqual(Mask[3], 3);
   3923 }
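        // For example, the v4f32 mask <6, 7, 2, 3> matches: the low half of the
        // result is the high half of V2 and the high half is the high half of V1,
        // which is exactly what MOVHLPS produces.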
   3924 
   3925 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
   3926 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
   3927 /// <2, 3, 2, 3>
   3928 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
   3929   if (!VT.is128BitVector())
   3930     return false;
   3931 
   3932   unsigned NumElems = VT.getVectorNumElements();
   3933 
   3934   if (NumElems != 4)
   3935     return false;
   3936 
   3937   return isUndefOrEqual(Mask[0], 2) &&
   3938          isUndefOrEqual(Mask[1], 3) &&
   3939          isUndefOrEqual(Mask[2], 2) &&
   3940          isUndefOrEqual(Mask[3], 3);
   3941 }
   3942 
   3943 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
   3944 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
   3945 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
   3946   if (!VT.is128BitVector())
   3947     return false;
   3948 
   3949   unsigned NumElems = VT.getVectorNumElements();
   3950 
   3951   if (NumElems != 2 && NumElems != 4)
   3952     return false;
   3953 
   3954   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3955     if (!isUndefOrEqual(Mask[i], i + NumElems))
   3956       return false;
   3957 
   3958   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
   3959     if (!isUndefOrEqual(Mask[i], i))
   3960       return false;
   3961 
   3962   return true;
   3963 }
   3964 
   3965 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3966 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
   3967 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
   3968   if (!VT.is128BitVector())
   3969     return false;
   3970 
   3971   unsigned NumElems = VT.getVectorNumElements();
   3972 
   3973   if (NumElems != 2 && NumElems != 4)
   3974     return false;
   3975 
   3976   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3977     if (!isUndefOrEqual(Mask[i], i))
   3978       return false;
   3979 
   3980   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   3981     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
   3982       return false;
   3983 
   3984   return true;
   3985 }
   3986 
   3987 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
   3988 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
   3989 /// i.e. if all but one element come from the same vector.
   3990 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
   3991   // TODO: Deal with AVX's VINSERTPS
   3992   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
   3993     return false;
   3994 
   3995   unsigned CorrectPosV1 = 0;
   3996   unsigned CorrectPosV2 = 0;
   3997   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
   3998     if (Mask[i] == -1) {
   3999       ++CorrectPosV1;
   4000       ++CorrectPosV2;
   4001       continue;
   4002     }
   4003 
   4004     if (Mask[i] == i)
   4005       ++CorrectPosV1;
   4006     else if (Mask[i] == i + 4)
   4007       ++CorrectPosV2;
   4008   }
   4009 
   4010   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
   4011     // We have 3 elements (undefs count as elements from any vector) from one
   4012     // vector, and one from another.
   4013     return true;
   4014 
   4015   return false;
   4016 }
   4017 
   4018 //
   4019 // Some special combinations that can be optimized.
   4020 //
   4021 static
   4022 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
   4023                                SelectionDAG &DAG) {
   4024   MVT VT = SVOp->getSimpleValueType(0);
   4025   SDLoc dl(SVOp);
   4026 
   4027   if (VT != MVT::v8i32 && VT != MVT::v8f32)
   4028     return SDValue();
   4029 
   4030   ArrayRef<int> Mask = SVOp->getMask();
   4031 
   4032   // These are the special masks that may be optimized.
   4033   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
   4034   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
   4035   bool MatchEvenMask = true;
   4036   bool MatchOddMask  = true;
   4037   for (int i=0; i<8; ++i) {
   4038     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
   4039       MatchEvenMask = false;
   4040     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
   4041       MatchOddMask = false;
   4042   }
   4043 
   4044   if (!MatchEvenMask && !MatchOddMask)
   4045     return SDValue();
   4046 
   4047   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
   4048 
   4049   SDValue Op0 = SVOp->getOperand(0);
   4050   SDValue Op1 = SVOp->getOperand(1);
   4051 
   4052   if (MatchEvenMask) {
   4053     // Shift the second operand right by 32 bits.
   4054     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
   4055     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
   4056   } else {
   4057     // Shift the first operand left by 32 bits.
   4058     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
   4059     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
   4060   }
   4061   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
   4062   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
   4063 }
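        // For example, for the even mask <0, 8, 2, 10, 4, 12, 6, 14> the code above
        // shuffles Op1 with <-1, 0, -1, 2, -1, 4, -1, 6> and then blends with
        // <0, 9, 2, 11, 4, 13, 6, 15>, which yields
        // Op0[0], Op1[0], Op0[2], Op1[2], Op0[4], Op1[4], Op0[6], Op1[6].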
   4064 
   4065 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
   4066 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
   4067 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
   4068                          bool HasInt256, bool V2IsSplat = false) {
   4069 
   4070   assert(VT.getSizeInBits() >= 128 &&
   4071          "Unsupported vector type for unpckl");
   4072 
   4073   // AVX defines UNPCK* to operate independently on 128-bit lanes.
   4074   unsigned NumLanes;
   4075   unsigned NumOf256BitLanes;
   4076   unsigned NumElts = VT.getVectorNumElements();
   4077   if (VT.is256BitVector()) {
   4078     if (NumElts != 4 && NumElts != 8 &&
   4079         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4080     return false;
   4081     NumLanes = 2;
   4082     NumOf256BitLanes = 1;
   4083   } else if (VT.is512BitVector()) {
   4084     assert(VT.getScalarType().getSizeInBits() >= 32 &&
   4085            "Unsupported vector type for unpckl");
   4086     NumLanes = 2;
   4087     NumOf256BitLanes = 2;
   4088   } else {
   4089     NumLanes = 1;
   4090     NumOf256BitLanes = 1;
   4091   }
   4092 
   4093   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
   4094   unsigned NumLaneElts = NumEltsInStride/NumLanes;
   4095 
   4096   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
   4097     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
   4098       for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
   4099         int BitI  = Mask[l256*NumEltsInStride+l+i];
   4100         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
   4101         if (!isUndefOrEqual(BitI, j+l256*NumElts))
   4102           return false;
   4103         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
   4104           return false;
   4105         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
   4106           return false;
   4107       }
   4108     }
   4109   }
   4110   return true;
   4111 }
   4112 
   4113 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
   4114 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
   4115 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
   4116                          bool HasInt256, bool V2IsSplat = false) {
   4117   assert(VT.getSizeInBits() >= 128 &&
   4118          "Unsupported vector type for unpckh");
   4119 
   4120   // AVX defines UNPCK* to operate independently on 128-bit lanes.
   4121   unsigned NumLanes;
   4122   unsigned NumOf256BitLanes;
   4123   unsigned NumElts = VT.getVectorNumElements();
   4124   if (VT.is256BitVector()) {
   4125     if (NumElts != 4 && NumElts != 8 &&
   4126         (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4127     return false;
   4128     NumLanes = 2;
   4129     NumOf256BitLanes = 1;
   4130   } else if (VT.is512BitVector()) {
   4131     assert(VT.getScalarType().getSizeInBits() >= 32 &&
   4132            "Unsupported vector type for unpckh");
   4133     NumLanes = 2;
   4134     NumOf256BitLanes = 2;
   4135   } else {
   4136     NumLanes = 1;
   4137     NumOf256BitLanes = 1;
   4138   }
   4139 
   4140   unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
   4141   unsigned NumLaneElts = NumEltsInStride/NumLanes;
   4142 
   4143   for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
   4144     for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
   4145       for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
   4146         int BitI  = Mask[l256*NumEltsInStride+l+i];
   4147         int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
   4148         if (!isUndefOrEqual(BitI, j+l256*NumElts))
   4149           return false;
   4150         if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
   4151           return false;
   4152         if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
   4153           return false;
   4154       }
   4155     }
   4156   }
   4157   return true;
   4158 }
   4159 
   4160 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
   4161 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
   4162 /// <0, 0, 1, 1>
   4163 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   4164   unsigned NumElts = VT.getVectorNumElements();
   4165   bool Is256BitVec = VT.is256BitVector();
   4166 
   4167   if (VT.is512BitVector())
   4168     return false;
   4169   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   4170          "Unsupported vector type for unpckl");
   4171 
   4172   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
   4173       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4174     return false;
   4175 
   4176   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern.
   4177   // FIXME: Need a better way to get rid of this; there's no latency difference
   4178   // between UNPCKLPD and MOVDDUP. The latter should always be checked first and
   4179   // the former later. We should also remove the "_undef" special mask.
   4180   if (NumElts == 4 && Is256BitVec)
   4181     return false;
   4182 
   4183   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   4184   // independently on 128-bit lanes.
   4185   unsigned NumLanes = VT.getSizeInBits()/128;
   4186   unsigned NumLaneElts = NumElts/NumLanes;
   4187 
   4188   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   4189     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
   4190       int BitI  = Mask[l+i];
   4191       int BitI1 = Mask[l+i+1];
   4192 
   4193       if (!isUndefOrEqual(BitI, j))
   4194         return false;
   4195       if (!isUndefOrEqual(BitI1, j))
   4196         return false;
   4197     }
   4198   }
   4199 
   4200   return true;
   4201 }
   4202 
   4203 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
   4204 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
   4205 /// <2, 2, 3, 3>
   4206 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   4207   unsigned NumElts = VT.getVectorNumElements();
   4208 
   4209   if (VT.is512BitVector())
   4210     return false;
   4211 
   4212   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   4213          "Unsupported vector type for unpckh");
   4214 
   4215   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
   4216       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
   4217     return false;
   4218 
   4219   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   4220   // independently on 128-bit lanes.
   4221   unsigned NumLanes = VT.getSizeInBits()/128;
   4222   unsigned NumLaneElts = NumElts/NumLanes;
   4223 
   4224   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   4225     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
   4226       int BitI  = Mask[l+i];
   4227       int BitI1 = Mask[l+i+1];
   4228       if (!isUndefOrEqual(BitI, j))
   4229         return false;
   4230       if (!isUndefOrEqual(BitI1, j))
   4231         return false;
   4232     }
   4233   }
   4234   return true;
   4235 }
   4236 
   4237 // Match for INSERTI64x4/INSERTF64x4 instructions: (src0[0], src1[0]) or
   4238 // (src1[0], src0[1]); these manipulate 256-bit sub-vectors.
   4239 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
   4240   if (!VT.is512BitVector())
   4241     return false;
   4242 
   4243   unsigned NumElts = VT.getVectorNumElements();
   4244   unsigned HalfSize = NumElts/2;
   4245   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
   4246     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
   4247       *Imm = 1;
   4248       return true;
   4249     }
   4250   }
   4251   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
   4252     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
   4253       *Imm = 0;
   4254       return true;
   4255     }
   4256   }
   4257   return false;
   4258 }
   4259 
   4260 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
   4261 /// specifies a shuffle of elements that is suitable for input to MOVSS,
   4262 /// MOVSD, and MOVD, i.e. setting the lowest element.
   4263 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
   4264   if (VT.getVectorElementType().getSizeInBits() < 32)
   4265     return false;
   4266   if (!VT.is128BitVector())
   4267     return false;
   4268 
   4269   unsigned NumElts = VT.getVectorNumElements();
   4270 
   4271   if (!isUndefOrEqual(Mask[0], NumElts))
   4272     return false;
   4273 
   4274   for (unsigned i = 1; i != NumElts; ++i)
   4275     if (!isUndefOrEqual(Mask[i], i))
   4276       return false;
   4277 
   4278   return true;
   4279 }
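        // For example, the v4i32 mask <4, 1, 2, 3> matches: element 0 of the result
        // is element 0 of V2 and the remaining elements come from V1 in order,
        // which is the MOVSS/MOVSD/MOVD "set the lowest element" pattern.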
   4280 
   4281 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
   4282 /// as permutations between 128-bit chunks or halves. As an example: this
   4283 /// shuffle below:
   4284 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
   4285 /// The first half comes from the second half of V1 and the second half comes
   4286 /// from the second half of V2.
   4287 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   4288   if (!HasFp256 || !VT.is256BitVector())
   4289     return false;
   4290 
   4291   // The shuffle result is divided into half A and half B. In total the two
   4292   // sources have 4 halves, namely: C, D, E, F. The final values of A and
   4293   // B must come from C, D, E or F.
   4294   unsigned HalfSize = VT.getVectorNumElements()/2;
   4295   bool MatchA = false, MatchB = false;
   4296 
   4297   // Check if A comes from one of C, D, E, F.
   4298   for (unsigned Half = 0; Half != 4; ++Half) {
   4299     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
   4300       MatchA = true;
   4301       break;
   4302     }
   4303   }
   4304 
   4305   // Check if B comes from one of C, D, E, F.
   4306   for (unsigned Half = 0; Half != 4; ++Half) {
   4307     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
   4308       MatchB = true;
   4309       break;
   4310     }
   4311   }
   4312 
   4313   return MatchA && MatchB;
   4314 }
   4315 
   4316 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
   4317 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
   4318 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   4319   MVT VT = SVOp->getSimpleValueType(0);
   4320 
   4321   unsigned HalfSize = VT.getVectorNumElements()/2;
   4322 
   4323   unsigned FstHalf = 0, SndHalf = 0;
   4324   for (unsigned i = 0; i < HalfSize; ++i) {
   4325     if (SVOp->getMaskElt(i) > 0) {
   4326       FstHalf = SVOp->getMaskElt(i)/HalfSize;
   4327       break;
   4328     }
   4329   }
   4330   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
   4331     if (SVOp->getMaskElt(i) > 0) {
   4332       SndHalf = SVOp->getMaskElt(i)/HalfSize;
   4333       break;
   4334     }
   4335   }
   4336 
   4337   return (FstHalf | (SndHalf << 4));
   4338 }
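        // For example, for the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> HalfSize is
        // 4, so FstHalf = 4/4 = 1 and SndHalf = 12/4 = 3, giving the immediate
        // 1 | (3 << 4) = 0x31 (the result's low half is V1's upper 128 bits and
        // its high half is V2's upper 128 bits).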
   4339 
   4340 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
   4341 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
   4342   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   4343   if (EltSize < 32)
   4344     return false;
   4345 
   4346   unsigned NumElts = VT.getVectorNumElements();
   4347   Imm8 = 0;
   4348   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
   4349     for (unsigned i = 0; i != NumElts; ++i) {
   4350       if (Mask[i] < 0)
   4351         continue;
   4352       Imm8 |= Mask[i] << (i*2);
   4353     }
   4354     return true;
   4355   }
   4356 
   4357   unsigned LaneSize = 4;
   4358   SmallVector<int, 4> MaskVal(LaneSize, -1);
   4359 
   4360   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   4361     for (unsigned i = 0; i != LaneSize; ++i) {
   4362       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   4363         return false;
   4364       if (Mask[i+l] < 0)
   4365         continue;
   4366       if (MaskVal[i] < 0) {
   4367         MaskVal[i] = Mask[i+l] - l;
   4368         Imm8 |= MaskVal[i] << (i*2);
   4369         continue;
   4370       }
   4371       if (Mask[i+l] != (signed)(MaskVal[i]+l))
   4372         return false;
   4373     }
   4374   }
   4375   return true;
   4376 }
   4377 
   4378 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
   4379 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
   4380 /// Note that VPERMIL mask matching is different depending whether theunderlying
   4381 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
   4382 /// to the same elements of the low, but to the higher half of the source.
   4383 /// In VPERMILPD the two lanes could be shuffled independently of each other
   4384 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
   4385 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
   4386   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   4387   if (VT.getSizeInBits() < 256 || EltSize < 32)
   4388     return false;
   4389   bool symetricMaskRequired = (EltSize == 32);
   4390   unsigned NumElts = VT.getVectorNumElements();
   4391 
   4392   unsigned NumLanes = VT.getSizeInBits()/128;
   4393   unsigned LaneSize = NumElts/NumLanes;
   4394   // 2 or 4 elements in one lane
   4395 
   4396   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
   4397   for (unsigned l = 0; l != NumElts; l += LaneSize) {
   4398     for (unsigned i = 0; i != LaneSize; ++i) {
   4399       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
   4400         return false;
   4401       if (symetricMaskRequired) {
   4402         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
   4403           ExpectedMaskVal[i] = Mask[i+l] - l;
   4404           continue;
   4405         }
   4406         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
   4407           return false;
   4408       }
   4409     }
   4410   }
   4411   return true;
   4412 }
   4413 
   4414 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
   4415 /// x86 movss wants: the lowest element of the result must be the lowest element
   4416 /// of vector 2 and the other elements must come from vector 1 in order.
   4417 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
   4418                                bool V2IsSplat = false, bool V2IsUndef = false) {
   4419   if (!VT.is128BitVector())
   4420     return false;
   4421 
   4422   unsigned NumOps = VT.getVectorNumElements();
   4423   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
   4424     return false;
   4425 
   4426   if (!isUndefOrEqual(Mask[0], 0))
   4427     return false;
   4428 
   4429   for (unsigned i = 1; i != NumOps; ++i)
   4430     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
   4431           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
   4432           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
   4433       return false;
   4434 
   4435   return true;
   4436 }
   4437 
   4438 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4439 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
   4440 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
   4441 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
   4442                            const X86Subtarget *Subtarget) {
   4443   if (!Subtarget->hasSSE3())
   4444     return false;
   4445 
   4446   unsigned NumElems = VT.getVectorNumElements();
   4447 
   4448   if ((VT.is128BitVector() && NumElems != 4) ||
   4449       (VT.is256BitVector() && NumElems != 8) ||
   4450       (VT.is512BitVector() && NumElems != 16))
   4451     return false;
   4452 
   4453   // "i+1" is the value the indexed mask element must have
   4454   for (unsigned i = 0; i != NumElems; i += 2)
   4455     if (!isUndefOrEqual(Mask[i], i+1) ||
   4456         !isUndefOrEqual(Mask[i+1], i+1))
   4457       return false;
   4458 
   4459   return true;
   4460 }
   4461 
   4462 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4463 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
   4464 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
   4465 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
   4466                            const X86Subtarget *Subtarget) {
   4467   if (!Subtarget->hasSSE3())
   4468     return false;
   4469 
   4470   unsigned NumElems = VT.getVectorNumElements();
   4471 
   4472   if ((VT.is128BitVector() && NumElems != 4) ||
   4473       (VT.is256BitVector() && NumElems != 8) ||
   4474       (VT.is512BitVector() && NumElems != 16))
   4475     return false;
   4476 
   4477   // "i" is the value the indexed mask element must have
   4478   for (unsigned i = 0; i != NumElems; i += 2)
   4479     if (!isUndefOrEqual(Mask[i], i) ||
   4480         !isUndefOrEqual(Mask[i+1], i))
   4481       return false;
   4482 
   4483   return true;
   4484 }
   4485 
   4486 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
   4487 /// specifies a shuffle of elements that is suitable for input to 256-bit
   4488 /// version of MOVDDUP.
   4489 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   4490   if (!HasFp256 || !VT.is256BitVector())
   4491     return false;
   4492 
   4493   unsigned NumElts = VT.getVectorNumElements();
   4494   if (NumElts != 4)
   4495     return false;
   4496 
   4497   for (unsigned i = 0; i != NumElts/2; ++i)
   4498     if (!isUndefOrEqual(Mask[i], 0))
   4499       return false;
   4500   for (unsigned i = NumElts/2; i != NumElts; ++i)
   4501     if (!isUndefOrEqual(Mask[i], NumElts/2))
   4502       return false;
   4503   return true;
   4504 }
   4505 
   4506 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
   4507 /// specifies a shuffle of elements that is suitable for input to 128-bit
   4508 /// version of MOVDDUP.
   4509 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
   4510   if (!VT.is128BitVector())
   4511     return false;
   4512 
   4513   unsigned e = VT.getVectorNumElements() / 2;
   4514   for (unsigned i = 0; i != e; ++i)
   4515     if (!isUndefOrEqual(Mask[i], i))
   4516       return false;
   4517   for (unsigned i = 0; i != e; ++i)
   4518     if (!isUndefOrEqual(Mask[e+i], i))
   4519       return false;
   4520   return true;
   4521 }
   4522 
   4523 /// isVEXTRACTIndex - Return true if the specified
   4524 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
   4525 /// suitable for instructions that extract 128- or 256-bit vectors.
   4526 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
   4527   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4528   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4529     return false;
   4530 
   4531   // The index should be aligned on a vecWidth-bit boundary.
   4532   uint64_t Index =
   4533     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4534 
   4535   MVT VT = N->getSimpleValueType(0);
   4536   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4537   bool Result = (Index * ElSize) % vecWidth == 0;
   4538 
   4539   return Result;
   4540 }
   4541 
   4542 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
   4543 /// operand specifies a subvector insert that is suitable for 128- or 256-bit
   4544 /// subvector insertion.
   4545 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
   4546   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4547   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4548     return false;
   4549   // The index should be aligned on a vecWidth-bit boundary.
   4550   uint64_t Index =
   4551     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4552 
   4553   MVT VT = N->getSimpleValueType(0);
   4554   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4555   bool Result = (Index * ElSize) % vecWidth == 0;
   4556 
   4557   return Result;
   4558 }
   4559 
   4560 bool X86::isVINSERT128Index(SDNode *N) {
   4561   return isVINSERTIndex(N, 128);
   4562 }
   4563 
   4564 bool X86::isVINSERT256Index(SDNode *N) {
   4565   return isVINSERTIndex(N, 256);
   4566 }
   4567 
   4568 bool X86::isVEXTRACT128Index(SDNode *N) {
   4569   return isVEXTRACTIndex(N, 128);
   4570 }
   4571 
   4572 bool X86::isVEXTRACT256Index(SDNode *N) {
   4573   return isVEXTRACTIndex(N, 256);
   4574 }
   4575 
   4576 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
   4577 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
   4578 /// Handles 128-bit and 256-bit.
   4579 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   4580   MVT VT = N->getSimpleValueType(0);
   4581 
   4582   assert((VT.getSizeInBits() >= 128) &&
   4583          "Unsupported vector type for PSHUF/SHUFP");
   4584 
   4585   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
   4586   // independently on 128-bit lanes.
   4587   unsigned NumElts = VT.getVectorNumElements();
   4588   unsigned NumLanes = VT.getSizeInBits()/128;
   4589   unsigned NumLaneElts = NumElts/NumLanes;
   4590 
   4591   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
   4592          "Only supports 2, 4 or 8 elements per lane");
   4593 
   4594   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
   4595   unsigned Mask = 0;
   4596   for (unsigned i = 0; i != NumElts; ++i) {
   4597     int Elt = N->getMaskElt(i);
   4598     if (Elt < 0) continue;
   4599     Elt &= NumLaneElts - 1;
   4600     unsigned ShAmt = (i << Shift) % 8;
   4601     Mask |= Elt << ShAmt;
   4602   }
   4603 
   4604   return Mask;
   4605 }
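        // Worked example: for a v4i32 PSHUFD with mask <3, 1, 2, 0>, Shift is 1 and
        // the immediate is 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27. For 256-bit
        // types the second 128-bit lane ORs in the same bit positions, so a
        // lane-symmetric mask produces the same 8-bit value.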
   4606 
   4607 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
   4608 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
   4609 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
   4610   MVT VT = N->getSimpleValueType(0);
   4611 
   4612   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
   4613          "Unsupported vector type for PSHUFHW");
   4614 
   4615   unsigned NumElts = VT.getVectorNumElements();
   4616 
   4617   unsigned Mask = 0;
   4618   for (unsigned l = 0; l != NumElts; l += 8) {
   4619     // 8 elements per lane, but we only care about the last 4.
   4620     for (unsigned i = 0; i < 4; ++i) {
   4621       int Elt = N->getMaskElt(l+i+4);
   4622       if (Elt < 0) continue;
   4623       Elt &= 0x3; // only 2-bits.
   4624       Mask |= Elt << (i * 2);
   4625     }
   4626   }
   4627 
   4628   return Mask;
   4629 }
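        // Worked example: for a v8i16 PSHUFHW with mask <0, 1, 2, 3, 7, 6, 5, 4>,
        // only elements 4..7 contribute, giving 3 | (2 << 2) | (1 << 4) | (0 << 6)
        // = 0x1B.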
   4630 
   4631 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
   4632 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
   4633 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
   4634   MVT VT = N->getSimpleValueType(0);
   4635 
   4636   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
   4637          "Unsupported vector type for PSHUFLW");
   4638 
   4639   unsigned NumElts = VT.getVectorNumElements();
   4640 
   4641   unsigned Mask = 0;
   4642   for (unsigned l = 0; l != NumElts; l += 8) {
   4643     // 8 elements per lane, but we only care about the first 4.
   4644     for (unsigned i = 0; i < 4; ++i) {
   4645       int Elt = N->getMaskElt(l+i);
   4646       if (Elt < 0) continue;
   4647       Elt &= 0x3; // only 2-bits
   4648       Mask |= Elt << (i * 2);
   4649     }
   4650   }
   4651 
   4652   return Mask;
   4653 }
   4654 
   4655 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
   4656 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
   4657 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   4658   MVT VT = SVOp->getSimpleValueType(0);
   4659   unsigned EltSize = VT.is512BitVector() ? 1 :
   4660     VT.getVectorElementType().getSizeInBits() >> 3;
   4661 
   4662   unsigned NumElts = VT.getVectorNumElements();
   4663   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   4664   unsigned NumLaneElts = NumElts/NumLanes;
   4665 
   4666   int Val = 0;
   4667   unsigned i;
   4668   for (i = 0; i != NumElts; ++i) {
   4669     Val = SVOp->getMaskElt(i);
   4670     if (Val >= 0)
   4671       break;
   4672   }
   4673   if (Val >= (int)NumElts)
   4674     Val -= NumElts - NumLaneElts;
   4675 
   4676   assert(Val - i > 0 && "PALIGNR imm should be positive");
   4677   return (Val - i) * EltSize;
   4678 }
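        // For example, for a v16i8 shuffle whose first defined mask element is 5 at
        // position 0 (a byte rotation of the two sources by 5), EltSize is 1 and
        // the returned immediate is (5 - 0) * 1 = 5.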
   4679 
   4680 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   4681   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4682   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4683     llvm_unreachable("Illegal extract subvector for VEXTRACT");
   4684 
   4685   uint64_t Index =
   4686     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4687 
   4688   MVT VecVT = N->getOperand(0).getSimpleValueType();
   4689   MVT ElVT = VecVT.getVectorElementType();
   4690 
   4691   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4692   return Index / NumElemsPerChunk;
   4693 }
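        // For example, extracting a 128-bit subvector that starts at element 4 of a
        // v8f32 source gives NumElemsPerChunk = 128 / 32 = 4 and an immediate of
        // 4 / 4 = 1, i.e. the upper 128-bit half.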
   4694 
   4695 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
   4696   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4697   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4698     llvm_unreachable("Illegal insert subvector for VINSERT");
   4699 
   4700   uint64_t Index =
   4701     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4702 
   4703   MVT VecVT = N->getSimpleValueType(0);
   4704   MVT ElVT = VecVT.getVectorElementType();
   4705 
   4706   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4707   return Index / NumElemsPerChunk;
   4708 }
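// Worked example (editor's illustration; the type and index are hypothetical):
// inserting a 128-bit subvector at element index 4 of a v8i32 result gives
// NumElemsPerChunk = 128 / 32 = 4 and an immediate of 4 / 4 = 1, i.e. the
// upper 128-bit position.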
   4709 
   4710 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
   4711 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
   4712 /// and VEXTRACTI128 instructions.
   4713 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
   4714   return getExtractVEXTRACTImmediate(N, 128);
   4715 }
   4716 
   4717 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
   4718 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
   4719 /// and VEXTRACTI64x4 instructions.
   4720 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
   4721   return getExtractVEXTRACTImmediate(N, 256);
   4722 }
   4723 
   4724 /// getInsertVINSERT128Immediate - Return the appropriate immediate
   4725 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
   4726 /// and VINSERTI128 instructions.
   4727 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
   4728   return getInsertVINSERTImmediate(N, 128);
   4729 }
   4730 
   4731 /// getInsertVINSERT256Immediate - Return the appropriate immediate
   4732 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
   4733 /// and VINSERTI64x4 instructions.
   4734 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   4735   return getInsertVINSERTImmediate(N, 256);
   4736 }
   4737 
   4738 /// isZero - Returns true if V is a constant integer zero.
   4739 static bool isZero(SDValue V) {
   4740   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   4741   return C && C->isNullValue();
   4742 }
   4743 
   4744 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
   4745 /// constant +0.0.
   4746 bool X86::isZeroNode(SDValue Elt) {
   4747   if (isZero(Elt))
   4748     return true;
   4749   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
   4750     return CFP->getValueAPF().isPosZero();
   4751   return false;
   4752 }
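// Editor's note with illustrative cases (hypothetical constants): an i32
// constant 0 and an f32 constant +0.0 both report true here, while an f32
// constant -0.0 reports false, since its bit pattern is not all zeros.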
   4753 
   4754 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
   4755 /// their permute mask.
   4756 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
   4757                                     SelectionDAG &DAG) {
   4758   MVT VT = SVOp->getSimpleValueType(0);
   4759   unsigned NumElems = VT.getVectorNumElements();
   4760   SmallVector<int, 8> MaskVec;
   4761 
   4762   for (unsigned i = 0; i != NumElems; ++i) {
   4763     int Idx = SVOp->getMaskElt(i);
   4764     if (Idx >= 0) {
   4765       if (Idx < (int)NumElems)
   4766         Idx += NumElems;
   4767       else
   4768         Idx -= NumElems;
   4769     }
   4770     MaskVec.push_back(Idx);
   4771   }
   4772   return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
   4773                               SVOp->getOperand(0), &MaskVec[0]);
   4774 }
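// Worked example (editor's illustration; operands and mask are hypothetical):
// a v4i32 shuffle of (A, B) with mask <0,5,2,7> becomes a shuffle of (B, A)
// with mask <4,1,6,3>; undef entries (negative mask values) are left as-is.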
   4775 
   4776 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
   4777 /// match movhlps. The lower half elements should come from the upper half of
   4778 /// V1 (and in order), and the upper half elements should come from the upper
   4779 /// half of V2 (and in order).
   4780 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
   4781   if (!VT.is128BitVector())
   4782     return false;
   4783   if (VT.getVectorNumElements() != 4)
   4784     return false;
   4785   for (unsigned i = 0, e = 2; i != e; ++i)
   4786     if (!isUndefOrEqual(Mask[i], i+2))
   4787       return false;
   4788   for (unsigned i = 2; i != 4; ++i)
   4789     if (!isUndefOrEqual(Mask[i], i+4))
   4790       return false;
   4791   return true;
   4792 }
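// Worked example (editor's illustration; the mask is hypothetical): for
// v4f32, the mask <2,3,6,7> (or <2,u,6,7> with an undef entry) satisfies the
// checks above: elements 2,3 of V1 feed the low half of the result and
// elements 2,3 of V2 feed the high half, which is what MOVHLPS produces.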
   4793 
   4794 /// isScalarLoadToVector - Returns true if the node is a scalar load that
   4795 /// is promoted to a vector. It also returns the LoadSDNode by reference if
   4796 /// required.
   4797 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
   4798   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
   4799     return false;
   4800   N = N->getOperand(0).getNode();
   4801   if (!ISD::isNON_EXTLoad(N))
   4802     return false;
   4803   if (LD)
   4804     *LD = cast<LoadSDNode>(N);
   4805   return true;
   4806 }
   4807 
   4808 // Test whether the given value is a vector value which will be legalized
   4809 // into a load.
   4810 static bool WillBeConstantPoolLoad(SDNode *N) {
   4811   if (N->getOpcode() != ISD::BUILD_VECTOR)
   4812     return false;
   4813 
   4814   // Check for any non-constant elements.
   4815   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
   4816     switch (N->getOperand(i).getNode()->getOpcode()) {
   4817     case ISD::UNDEF:
   4818     case ISD::ConstantFP:
   4819     case ISD::Constant:
   4820       break;
   4821     default:
   4822       return false;
   4823     }
   4824 
   4825   // Vectors of all-zeros and all-ones are materialized with special
   4826   // instructions rather than being loaded.
   4827   return !ISD::isBuildVectorAllZeros(N) &&
   4828          !ISD::isBuildVectorAllOnes(N);
   4829 }
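// Editor's note with illustrative cases (hypothetical vectors): a
// BUILD_VECTOR such as <1.0, 2.0, 3.0, 4.0> returns true here because it is
// expected to be lowered to a constant-pool load, while all-zeros and
// all-ones vectors return false since they are synthesized with register
// idioms (e.g. pxor / pcmpeqd) rather than loaded.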
   4830 
   4831 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
   4832 /// match movlp{s|d}. The lower half elements should come from the lower half of
   4833 /// V1 (and in order), and the upper half elements should come from the upper
   4834 /// half of V2 (and in order). And since V1 will become the source of the
   4835 /// MOVLP, it must be either a vector load or a scalar load to vector.
   4836 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
   4837                                ArrayRef<int> Mask, MVT VT) {
   4838   if (!VT.is128BitVector())
   4839     return false;
   4840 
   4841   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
   4842     return false;
   4843   // If V2 is a vector load, don't do this transformation. We will try to use
   4844   // a shufps with the load folded into its memory operand instead.
   4845   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
   4846     return false;
   4847 
   4848   unsigned NumElems = VT.getVectorNumElements();
   4849 
   4850   if (NumElems != 2 && NumElems != 4)
   4851     return false;
   4852   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
   4853     if (!isUndefOrEqual(Mask[i], i))
   4854       return false;
   4855   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
   4856     if (!isUndefOrEqual(Mask[i], i+NumElems))
   4857       return false;
   4858   return true;
   4859 }
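// Worked example (editor's illustration; operands and mask are hypothetical):
// for v4f32 with V1 a plain (non-extending) vector load and V2 not a load,
// the mask <0,1,6,7> passes the checks above: the low half comes from the low
// half of V1 and the high half from the high half of V2, matching MOVLPS with
// the load folded into its memory operand.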
   4860 
   4861 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
   4862 /// to a zero vector.
   4863 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
   4864 static bool isZeroShuffle(