// (Code-browser navigation header removed: "Home | History | Annotate | Download | only in CellSPU")
      1 //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
      2 //                     The LLVM Compiler Infrastructure
      3 //
      4 // This file is distributed under the University of Illinois Open Source
      5 // License. See LICENSE.TXT for details.
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the SPUTargetLowering class.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "SPUISelLowering.h"
     14 #include "SPUTargetMachine.h"
     15 #include "SPUFrameLowering.h"
     16 #include "SPUMachineFunction.h"
     17 #include "llvm/Constants.h"
     18 #include "llvm/Function.h"
     19 #include "llvm/Intrinsics.h"
     20 #include "llvm/CallingConv.h"
     21 #include "llvm/Type.h"
     22 #include "llvm/CodeGen/CallingConvLower.h"
     23 #include "llvm/CodeGen/MachineFrameInfo.h"
     24 #include "llvm/CodeGen/MachineFunction.h"
     25 #include "llvm/CodeGen/MachineInstrBuilder.h"
     26 #include "llvm/CodeGen/MachineRegisterInfo.h"
     27 #include "llvm/CodeGen/SelectionDAG.h"
     28 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
     29 #include "llvm/Target/TargetOptions.h"
     30 #include "llvm/Support/Debug.h"
     31 #include "llvm/Support/ErrorHandling.h"
     32 #include "llvm/Support/MathExtras.h"
     33 #include "llvm/Support/raw_ostream.h"
     34 
     35 using namespace llvm;
     36 
     37 namespace {
     38   // Byte offset of the preferred slot (counted from the MSB)
     39   int prefslotOffset(EVT VT) {
     40     int retval=0;
     41     if (VT==MVT::i1) retval=3;
     42     if (VT==MVT::i8) retval=3;
     43     if (VT==MVT::i16) retval=2;
     44 
     45     return retval;
     46   }
     47 
  //! Expand a library call into an actual call DAG node
  /*!
   \note
   This code is taken from SelectionDAGLegalize, since it is not exposed as
   part of the LLVM SelectionDAG API.

   \param LC       the runtime-library routine to call
   \param Op       node being replaced: its operands become the call
                   arguments, and its first value type the call's return type
   \param DAG      the SelectionDAG being constructed
   \param isSigned if true, arguments/result are marked sign-extended,
                   otherwise zero-extended
   \param Hi       NOTE(review): never read or written in this function;
                   apparently kept only for signature compatibility
   \param TLI      supplies libcall name, calling convention and pointer type
   \return the call's result value (CallInfo.first); the output chain is
           discarded
   */

  SDValue
  ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
                bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
    // The input chain to this libcall is the entry node of the function.
    // Legalizing the call will automatically add the previous call to the
    // dependence.
    SDValue InChain = DAG.getEntryNode();

    // Forward every operand of Op as a call argument, tagging each with the
    // requested signedness so the callee sees properly extended values.
    TargetLowering::ArgListTy Args;
    TargetLowering::ArgListEntry Entry;
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      EVT ArgVT = Op.getOperand(i).getValueType();
      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
      Entry.Node = Op.getOperand(i);
      Entry.Ty = ArgTy;
      Entry.isSExt = isSigned;
      Entry.isZExt = !isSigned;
      Args.push_back(Entry);
    }
    // The callee is the external symbol registered for this libcall.
    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                           TLI.getPointerTy());

    // Splice the libcall in wherever FindInputOutputChains tells us to.
    Type *RetTy =
                Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
    std::pair<SDValue, SDValue> CallInfo =
            TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
                            0, TLI.getLibcallCallingConv(LC),
                            /*isTailCall=*/false,
                            /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
                            Callee, Args, DAG, Op.getDebugLoc());

    // Return only the call's value; the chain (CallInfo.second) is dropped.
    return CallInfo.first;
  }
     89 }
     90 
/// SPUTargetLowering constructor.
///
/// Configures IR-level legalization for the Cell SPU: registers the legal
/// value types with their register classes, then tells the SelectionDAG
/// legalizer how each relevant (opcode, type) pair is handled
/// (Legal / Promote / Expand / Custom), and finally sets libcall names,
/// target DAG-combine hooks and scheduling preferences.
///
/// NOTE(review): a few (opcode, type) pairs are registered more than once
/// below; the LAST setOperationAction call for a pair is the one that
/// takes effect. The later overrides are flagged inline.
SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  : TargetLowering(TM, new TargetLoweringObjectFileELF()),
    SPUTM(TM) {

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // Set RTLIB libcall names as used by SPU:
  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");

  // Set up the SPU's register classes:
  addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
  addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
  addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
  addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
  addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
  addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
  addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);

  // SPU has no sign or zero extended loads for i1, i8, i16:
  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);

  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);

  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
  setTruncStoreAction(MVT::i128, MVT::i32, Expand);
  setTruncStoreAction(MVT::i128, MVT::i16, Expand);
  setTruncStoreAction(MVT::i128, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SPU constant load actions are custom lowered:
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  // SPU's loads and stores have to be custom lowered:
  // (iterates i8..i64 inclusive — the bound is exclusive of i128, whose
  // trunc-stores were expanded explicitly above; relies on the MVT enum
  // ordering i8 < i16 < i32 < i64 < i128)
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);
    setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, Custom);

    // Expand truncating stores from VT to every smaller integer type.
    // (count-down loop on an unsigned; presumably MVT::i8 > 0, or the
    // comparison would never fail — TODO confirm against MachineValueType.h)
    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // NOTE(review): the bound is exclusive, so this loop only visits f32 —
  // f64 LOAD/STORE are not marked Custom here, and the inner trunc-store
  // loop never executes (f32 - 1 < f32). Confirm this is intentional.
  for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);

    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // Expand the jumptable branches
  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);

  // Custom lower SELECT_CC for most cases, but expand by default
  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);

  // SPU has no intrinsics for these particular operations:
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);

  // SPU has no division/remainder instructions
  setOperationAction(ISD::SREM,    MVT::i8,   Expand);
  setOperationAction(ISD::UREM,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::SREM,    MVT::i16,  Expand);
  setOperationAction(ISD::UREM,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::SREM,    MVT::i32,  Expand);
  setOperationAction(ISD::UREM,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::SREM,    MVT::i64,  Expand);
  setOperationAction(ISD::UREM,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::SREM,    MVT::i128, Expand);
  setOperationAction(ISD::UREM,    MVT::i128, Expand);
  setOperationAction(ISD::SDIV,    MVT::i128, Expand);
  setOperationAction(ISD::UDIV,    MVT::i128, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i128, Expand);

  // We don't support sin/cos/sqrt/fmod
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);

  // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
  // for f32!)
  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

  // SPU can do rotate right and left, so legalize it... but customize for i8
  // because instructions don't exist.

  // FIXME: Change from "expand" to appropriate type once ROTR is supported in
  //        .td files.
  setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);

  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
  setOperationAction(ISD::ROTL, MVT::i8,     Custom);

  // SPU has no native version of shift left/right for i8
  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRA,  MVT::i8,     Custom);

  // Make these operations legal and handle them during instruction selection:
  setOperationAction(ISD::SHL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRA,  MVT::i64,    Legal);

  // Custom lower i8 multiplication; i32 and i64 are handled in selection.
  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
  setOperationAction(ISD::MUL,  MVT::i32,    Legal);
  setOperationAction(ISD::MUL,  MVT::i64,    Legal);

  // Expand double-width multiplication
  // FIXME: It would probably be reasonable to support some of these operations
  setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
  setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::MULHU,     MVT::i16, Expand);
  setOperationAction(ISD::MULHS,     MVT::i16, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::MULHU,     MVT::i64, Expand);
  setOperationAction(ISD::MULHS,     MVT::i64, Expand);

  // Need to custom handle (some) common i8, i64 math ops
  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
  setOperationAction(ISD::ADD,  MVT::i64,    Legal);
  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
  setOperationAction(ISD::SUB,  MVT::i64,    Legal);

  // SPU does not have BSWAP. It does have i32 support CTLZ.
  // CTPOP has to be custom lowered.
  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);

  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i128,  Expand);

  setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
  setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8,    Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i128,  Expand);

  setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
  setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
  setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8,    Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i128,  Expand);

  // SPU has a version of select that implements (a&~c)|(b&c), just like
  // select ought to work:
  setOperationAction(ISD::SELECT, MVT::i8,   Legal);
  setOperationAction(ISD::SELECT, MVT::i16,  Legal);
  setOperationAction(ISD::SELECT, MVT::i32,  Legal);
  setOperationAction(ISD::SELECT, MVT::i64,  Legal);

  setOperationAction(ISD::SETCC, MVT::i8,    Legal);
  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
  setOperationAction(ISD::SETCC, MVT::i64,   Legal);
  setOperationAction(ISD::SETCC, MVT::f64,   Custom);

  // Custom lower i128 -> i64 truncates
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

  // Custom lower i32/i64 -> i128 sign extend
  setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);

  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
  // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
  // to expand to a libcall, hence the custom lowering:
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  // NOTE(review): the four Expand settings below for i64/i128 are
  // overridden later in this function for (FP_TO_SINT, i64) — see the
  // "Cell SPU has instructions for converting" block near the end.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);

  // FDIV on SPU requires custom lowering
  setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall

  // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  // i32/f32 and i64/f64 reinterpret-casts are free on SPU.
  setOperationAction(ISD::BITCAST, MVT::i32, Legal);
  setOperationAction(ISD::BITCAST, MVT::f32, Legal);
  setOperationAction(ISD::BITCAST, MVT::i64, Legal);
  setOperationAction(ISD::BITCAST, MVT::f64, Legal);

  // We cannot sextinreg(i1).  Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  // (bound is exclusive: covers i8 up to, but not including, f128)
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::GlobalAddress,  VT, Custom);
    setOperationAction(ISD::ConstantPool,   VT, Custom);
    setOperationAction(ISD::JumpTable,      VT, Custom);
  }

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);

  // Use the default implementation.
  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);

  // Cell SPU has instructions for converting between i64 and fp.
  // NOTE(review): these OVERRIDE the earlier settings for the same pairs
  // ((FP_TO_SINT, i64) was Expand above; (SINT_TO_FP, i64) was already
  // Custom, so that call is redundant). Last call wins.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
  // NOTE(review): this also overrides the earlier Custom setting for
  // (FP_TO_UINT, i32).
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);

  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // First set operation action for all vector types to expand. Then we
  // will selectively turn on ones that can be effectively codegen'd.
  addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);

  for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)i;

    // Set operation actions to legal types only.
    if (!isTypeLegal(VT)) continue;

    // add/sub are legal for all supported vector VT's.
    setOperationAction(ISD::ADD,     VT, Legal);
    setOperationAction(ISD::SUB,     VT, Legal);
    // NOTE(review): an older note here said mul "has to be custom
    // lowered", but the action registered is Legal — confirm which is
    // intended.
    setOperationAction(ISD::MUL,     VT, Legal);

    setOperationAction(ISD::AND,     VT, Legal);
    setOperationAction(ISD::OR,      VT, Legal);
    setOperationAction(ISD::XOR,     VT, Legal);
    setOperationAction(ISD::LOAD,    VT, Custom);
    setOperationAction(ISD::SELECT,  VT, Legal);
    setOperationAction(ISD::STORE,   VT, Custom);

    // These operations need to be expanded:
    setOperationAction(ISD::SDIV,    VT, Expand);
    setOperationAction(ISD::SREM,    VT, Expand);
    setOperationAction(ISD::UDIV,    VT, Expand);
    setOperationAction(ISD::UREM,    VT, Expand);

    // Expand all trunc stores
    for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
      MVT::SimpleValueType TargetVT = (MVT::SimpleValueType)j;
      setTruncStoreAction(VT, TargetVT, Expand);
    }

    // Custom lower build_vector, constant pool spills, insert and
    // extract vector elements:
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }

  setOperationAction(ISD::SHL, MVT::v2i64, Expand);

  setOperationAction(ISD::AND, MVT::v16i8, Custom);
  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);

  setOperationAction(ISD::FDIV, MVT::v4f32, Legal);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct?

  setStackPointerRegisterToSaveRestore(SPU::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  // Functions are aligned to 2^3 = 8 bytes.
  setMinFunctionAlignment(3);

  computeRegisterProperties();

  // Set pre-RA register scheduler default to BURR, which produces slightly
  // better code than the default (could also be TDRR, but TargetLowering.h
  // needs a mod to support that model):
  setSchedulingPreference(Sched::RegPressure);
}
    479 
    480 const char *SPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    481   switch (Opcode) {
    482   default: return 0;
    483   case SPUISD::RET_FLAG: return "SPUISD::RET_FLAG";
    484   case SPUISD::Hi: return "SPUISD::Hi";
    485   case SPUISD::Lo: return "SPUISD::Lo";
    486   case SPUISD::PCRelAddr: return "SPUISD::PCRelAddr";
    487   case SPUISD::AFormAddr: return "SPUISD::AFormAddr";
    488   case SPUISD::IndirectAddr: return "SPUISD::IndirectAddr";
    489   case SPUISD::LDRESULT: return "SPUISD::LDRESULT";
    490   case SPUISD::CALL: return "SPUISD::CALL";
    491   case SPUISD::SHUFB: return "SPUISD::SHUFB";
    492   case SPUISD::SHUFFLE_MASK: return "SPUISD::SHUFFLE_MASK";
    493   case SPUISD::CNTB: return "SPUISD::CNTB";
    494   case SPUISD::PREFSLOT2VEC: return "SPUISD::PREFSLOT2VEC";
    495   case SPUISD::VEC2PREFSLOT: return "SPUISD::VEC2PREFSLOT";
    496   case SPUISD::SHL_BITS: return "SPUISD::SHL_BITS";
    497   case SPUISD::SHL_BYTES: return "SPUISD::SHL_BYTES";
    498   case SPUISD::VEC_ROTL: return "SPUISD::VEC_ROTL";
    499   case SPUISD::VEC_ROTR: return "SPUISD::VEC_ROTR";
    500   case SPUISD::ROTBYTES_LEFT: return "SPUISD::ROTBYTES_LEFT";
    501   case SPUISD::ROTBYTES_LEFT_BITS: return "SPUISD::ROTBYTES_LEFT_BITS";
    502   case SPUISD::SELECT_MASK: return "SPUISD::SELECT_MASK";
    503   case SPUISD::SELB: return "SPUISD::SELB";
    504   case SPUISD::ADD64_MARKER: return "SPUISD::ADD64_MARKER";
    505   case SPUISD::SUB64_MARKER: return "SPUISD::SUB64_MARKER";
    506   case SPUISD::MUL64_MARKER: return "SPUISD::MUL64_MARKER";
    507   }
    508 }
    509 
    510 //===----------------------------------------------------------------------===//
    511 // Return the Cell SPU's SETCC result type
    512 //===----------------------------------------------------------------------===//
    513 
    514 EVT SPUTargetLowering::getSetCCResultType(EVT VT) const {
    515   // i8, i16 and i32 are valid SETCC result types
    516   MVT::SimpleValueType retval;
    517 
    518   switch(VT.getSimpleVT().SimpleTy){
    519     case MVT::i1:
    520     case MVT::i8:
    521       retval = MVT::i8; break;
    522     case MVT::i16:
    523       retval = MVT::i16; break;
    524     case MVT::i32:
    525     default:
    526       retval = MVT::i32;
    527   }
    528   return retval;
    529 }
    530 
    531 //===----------------------------------------------------------------------===//
    532 // Calling convention code:
    533 //===----------------------------------------------------------------------===//
    534 
    535 #include "SPUGenCallingConv.inc"
    536 
    537 //===----------------------------------------------------------------------===//
    538 //  LowerOperation implementation
    539 //===----------------------------------------------------------------------===//
    540 
    541 /// Custom lower loads for CellSPU
    542 /*!
    543  All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
    544  within a 16-byte block, we have to rotate to extract the requested element.
    545 
    546  For extending loads, we also want to ensure that the following sequence is
    547  emitted, e.g. for MVT::f32 extending load to MVT::f64:
    548 
    549 \verbatim
    550 %1  v16i8,ch = load
    551 %2  v16i8,ch = rotate %1
    552 %3  v4f8, ch = bitconvert %2
    553 %4  f32      = vec2perfslot %3
    554 %5  f64      = fp_extend %4
    555 \endverbatim
    556 */
    557 static SDValue
    558 LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    559   LoadSDNode *LN = cast<LoadSDNode>(Op);
    560   SDValue the_chain = LN->getChain();
    561   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    562   EVT InVT = LN->getMemoryVT();
    563   EVT OutVT = Op.getValueType();
    564   ISD::LoadExtType ExtType = LN->getExtensionType();
    565   unsigned alignment = LN->getAlignment();
    566   int pso = prefslotOffset(InVT);
    567   DebugLoc dl = Op.getDebugLoc();
    568   EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
    569                                                   (128 / InVT.getSizeInBits()));
    570 
    571   // two sanity checks
    572   assert( LN->getAddressingMode() == ISD::UNINDEXED
    573           && "we should get only UNINDEXED adresses");
    574   // clean aligned loads can be selected as-is
    575   if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
    576     return SDValue();
    577 
    578   // Get pointerinfos to the memory chunk(s) that contain the data to load
    579   uint64_t mpi_offset = LN->getPointerInfo().Offset;
    580   mpi_offset -= mpi_offset%16;
    581   MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
    582   MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
    583 
    584   SDValue result;
    585   SDValue basePtr = LN->getBasePtr();
    586   SDValue rotate;
    587 
    588   if ((alignment%16) == 0) {
    589     ConstantSDNode *CN;
    590 
    591     // Special cases for a known aligned load to simplify the base pointer
    592     // and the rotation amount:
    593     if (basePtr.getOpcode() == ISD::ADD
    594         && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
    595       // Known offset into basePtr
    596       int64_t offset = CN->getSExtValue();
    597       int64_t rotamt = int64_t((offset & 0xf) - pso);
    598 
    599       if (rotamt < 0)
    600         rotamt += 16;
    601 
    602       rotate = DAG.getConstant(rotamt, MVT::i16);
    603 
    604       // Simplify the base pointer for this case:
    605       basePtr = basePtr.getOperand(0);
    606       if ((offset & ~0xf) > 0) {
    607         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    608                               basePtr,
    609                               DAG.getConstant((offset & ~0xf), PtrVT));
    610       }
    611     } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
    612                || (basePtr.getOpcode() == SPUISD::IndirectAddr
    613                    && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
    614                    && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
    615       // Plain aligned a-form address: rotate into preferred slot
    616       // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
    617       int64_t rotamt = -pso;
    618       if (rotamt < 0)
    619         rotamt += 16;
    620       rotate = DAG.getConstant(rotamt, MVT::i16);
    621     } else {
    622       // Offset the rotate amount by the basePtr and the preferred slot
    623       // byte offset
    624       int64_t rotamt = -pso;
    625       if (rotamt < 0)
    626         rotamt += 16;
    627       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
    628                            basePtr,
    629                            DAG.getConstant(rotamt, PtrVT));
    630     }
    631   } else {
    632     // Unaligned load: must be more pessimistic about addressing modes:
    633     if (basePtr.getOpcode() == ISD::ADD) {
    634       MachineFunction &MF = DAG.getMachineFunction();
    635       MachineRegisterInfo &RegInfo = MF.getRegInfo();
    636       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    637       SDValue Flag;
    638 
    639       SDValue Op0 = basePtr.getOperand(0);
    640       SDValue Op1 = basePtr.getOperand(1);
    641 
    642       if (isa<ConstantSDNode>(Op1)) {
    643         // Convert the (add <ptr>, <const>) to an indirect address contained
    644         // in a register. Note that this is done because we need to avoid
    645         // creating a 0(reg) d-form address due to the SPU's block loads.
    646         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    647         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
    648         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
    649       } else {
    650         // Convert the (add <arg1>, <arg2>) to an indirect address, which
    651         // will likely be lowered as a reg(reg) x-form address.
    652         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    653       }
    654     } else {
    655       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    656                             basePtr,
    657                             DAG.getConstant(0, PtrVT));
    658    }
    659 
    660     // Offset the rotate amount by the basePtr and the preferred slot
    661     // byte offset
    662     rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
    663                          basePtr,
    664                          DAG.getConstant(-pso, PtrVT));
    665   }
    666 
    667   // Do the load as a i128 to allow possible shifting
    668   SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
    669                        lowMemPtr,
    670                        LN->isVolatile(), LN->isNonTemporal(), false, 16);
    671 
    672   // When the size is not greater than alignment we get all data with just
    673   // one load
    674   if (alignment >= InVT.getSizeInBits()/8) {
    675     // Update the chain
    676     the_chain = low.getValue(1);
    677 
    678     // Rotate into the preferred slot:
    679     result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
    680                          low.getValue(0), rotate);
    681 
    682     // Convert the loaded v16i8 vector to the appropriate vector type
    683     // specified by the operand:
    684     EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
    685                                  InVT, (128 / InVT.getSizeInBits()));
    686     result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
    687                          DAG.getNode(ISD::BITCAST, dl, vecVT, result));
    688   }
    689   // When alignment is less than the size, we might need (known only at
    690   // run-time) two loads
    691   // TODO: if the memory address is composed only from constants, we have
    692   // extra kowledge, and might avoid the second load
    693   else {
    694     // storage position offset from lower 16 byte aligned memory chunk
    695     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
    696                                   basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
    697     // get a registerfull of ones. (this implementation is a workaround: LLVM
    698     // cannot handle 128 bit signed int constants)
    699     SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
    700     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
    701 
    702     SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
    703                                DAG.getNode(ISD::ADD, dl, PtrVT,
    704                                            basePtr,
    705                                            DAG.getConstant(16, PtrVT)),
    706                                highMemPtr,
    707                                LN->isVolatile(), LN->isNonTemporal(), false,
    708                                16);
    709 
    710     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
    711                                                               high.getValue(1));
    712 
    713     // Shift the (possible) high part right to compensate the misalignemnt.
    714     // if there is no highpart (i.e. value is i64 and offset is 4), this
    715     // will zero out the high value.
    716     high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
    717                                      DAG.getNode(ISD::SUB, dl, MVT::i32,
    718                                                  DAG.getConstant( 16, MVT::i32),
    719                                                  offset
    720                                                 ));
    721 
    722     // Shift the low similarly
    723     // TODO: add SPUISD::SHL_BYTES
    724     low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
    725 
    726     // Merge the two parts
    727     result = DAG.getNode(ISD::BITCAST, dl, vecVT,
    728                           DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
    729 
    730     if (!InVT.isVector()) {
    731       result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
    732      }
    733 
    734   }
    735     // Handle extending loads by extending the scalar result:
    736     if (ExtType == ISD::SEXTLOAD) {
    737       result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
    738     } else if (ExtType == ISD::ZEXTLOAD) {
    739       result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
    740     } else if (ExtType == ISD::EXTLOAD) {
    741       unsigned NewOpc = ISD::ANY_EXTEND;
    742 
    743       if (OutVT.isFloatingPoint())
    744         NewOpc = ISD::FP_EXTEND;
    745 
    746       result = DAG.getNode(NewOpc, dl, OutVT, result);
    747     }
    748 
    749     SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
    750     SDValue retops[2] = {
    751       result,
    752       the_chain
    753     };
    754 
    755     result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
    756                          retops, sizeof(retops) / sizeof(retops[0]));
    757     return result;
    758 }
    759 
    760 /// Custom lower stores for CellSPU
    761 /*!
    762  All CellSPU stores are aligned to 16-byte boundaries, so for elements
    763  within a 16-byte block, we have to generate a shuffle to insert the
    764  requested element into its place, then store the resulting block.
    765  */
    766 static SDValue
    767 LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    768   StoreSDNode *SN = cast<StoreSDNode>(Op);
    769   SDValue Value = SN->getValue();
    770   EVT VT = Value.getValueType();
    771   EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
    772   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    773   DebugLoc dl = Op.getDebugLoc();
    774   unsigned alignment = SN->getAlignment();
    775   SDValue result;
    776   EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
    777                                                  (128 / StVT.getSizeInBits()));
    778   // Get pointerinfos to the memory chunk(s) that contain the data to load
    779   uint64_t mpi_offset = SN->getPointerInfo().Offset;
    780   mpi_offset -= mpi_offset%16;
    781   MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
    782   MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);
    783 
    784 
    785   // two sanity checks
    786   assert( SN->getAddressingMode() == ISD::UNINDEXED
    787           && "we should get only UNINDEXED adresses");
    788   // clean aligned loads can be selected as-is
    789   if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
    790     return SDValue();
    791 
    792   SDValue alignLoadVec;
    793   SDValue basePtr = SN->getBasePtr();
    794   SDValue the_chain = SN->getChain();
    795   SDValue insertEltOffs;
    796 
    797   if ((alignment%16) == 0) {
    798     ConstantSDNode *CN;
    799     // Special cases for a known aligned load to simplify the base pointer
    800     // and insertion byte:
    801     if (basePtr.getOpcode() == ISD::ADD
    802         && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
    803       // Known offset into basePtr
    804       int64_t offset = CN->getSExtValue();
    805 
    806       // Simplify the base pointer for this case:
    807       basePtr = basePtr.getOperand(0);
    808       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    809                                   basePtr,
    810                                   DAG.getConstant((offset & 0xf), PtrVT));
    811 
    812       if ((offset & ~0xf) > 0) {
    813         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    814                               basePtr,
    815                               DAG.getConstant((offset & ~0xf), PtrVT));
    816       }
    817     } else {
    818       // Otherwise, assume it's at byte 0 of basePtr
    819       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    820                                   basePtr,
    821                                   DAG.getConstant(0, PtrVT));
    822       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    823                                   basePtr,
    824                                   DAG.getConstant(0, PtrVT));
    825     }
    826   } else {
    827     // Unaligned load: must be more pessimistic about addressing modes:
    828     if (basePtr.getOpcode() == ISD::ADD) {
    829       MachineFunction &MF = DAG.getMachineFunction();
    830       MachineRegisterInfo &RegInfo = MF.getRegInfo();
    831       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    832       SDValue Flag;
    833 
    834       SDValue Op0 = basePtr.getOperand(0);
    835       SDValue Op1 = basePtr.getOperand(1);
    836 
    837       if (isa<ConstantSDNode>(Op1)) {
    838         // Convert the (add <ptr>, <const>) to an indirect address contained
    839         // in a register. Note that this is done because we need to avoid
    840         // creating a 0(reg) d-form address due to the SPU's block loads.
    841         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    842         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
    843         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
    844       } else {
    845         // Convert the (add <arg1>, <arg2>) to an indirect address, which
    846         // will likely be lowered as a reg(reg) x-form address.
    847         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    848       }
    849     } else {
    850       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    851                             basePtr,
    852                             DAG.getConstant(0, PtrVT));
    853     }
    854 
    855     // Insertion point is solely determined by basePtr's contents
    856     insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
    857                                 basePtr,
    858                                 DAG.getConstant(0, PtrVT));
    859   }
    860 
    861   // Load the lower part of the memory to which to store.
    862   SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
    863                           lowMemPtr, SN->isVolatile(), SN->isNonTemporal(),
    864                             false, 16);
    865 
    866   // if we don't need to store over the 16 byte boundary, one store suffices
    867   if (alignment >= StVT.getSizeInBits()/8) {
    868     // Update the chain
    869     the_chain = low.getValue(1);
    870 
    871     LoadSDNode *LN = cast<LoadSDNode>(low);
    872     SDValue theValue = SN->getValue();
    873 
    874     if (StVT != VT
    875         && (theValue.getOpcode() == ISD::AssertZext
    876             || theValue.getOpcode() == ISD::AssertSext)) {
    877       // Drill down and get the value for zero- and sign-extended
    878       // quantities
    879       theValue = theValue.getOperand(0);
    880     }
    881 
    882     // If the base pointer is already a D-form address, then just create
    883     // a new D-form address with a slot offset and the orignal base pointer.
    884     // Otherwise generate a D-form address with the slot offset relative
    885     // to the stack pointer, which is always aligned.
    886 #if !defined(NDEBUG)
    887       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
    888         errs() << "CellSPU LowerSTORE: basePtr = ";
    889         basePtr.getNode()->dump(&DAG);
    890         errs() << "\n";
    891       }
    892 #endif
    893 
    894     SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
    895                                       insertEltOffs);
    896     SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
    897                                       theValue);
    898 
    899     result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
    900                          vectorizeOp, low,
    901                          DAG.getNode(ISD::BITCAST, dl,
    902                                      MVT::v4i32, insertEltOp));
    903 
    904     result = DAG.getStore(the_chain, dl, result, basePtr,
    905                           lowMemPtr,
    906                           LN->isVolatile(), LN->isNonTemporal(),
    907                           16);
    908 
    909   }
    910   // do the store when it might cross the 16 byte memory access boundary.
    911   else {
    912     // TODO issue a warning if SN->isVolatile()== true? This is likely not
    913     // what the user wanted.
    914 
    915     // address offset from nearest lower 16byte alinged address
    916     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
    917                                     SN->getBasePtr(),
    918                                     DAG.getConstant(0xf, MVT::i32));
    919     // 16 - offset
    920     SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
    921                                            DAG.getConstant( 16, MVT::i32),
    922                                            offset);
    923     // 16 - sizeof(Value)
    924     SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
    925                                      DAG.getConstant( 16, MVT::i32),
    926                                      DAG.getConstant( VT.getSizeInBits()/8,
    927                                                       MVT::i32));
    928     // get a registerfull of ones
    929     SDValue ones = DAG.getConstant(-1, MVT::v4i32);
    930     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
    931 
    932     // Create the 128 bit masks that have ones where the data to store is
    933     // located.
    934     SDValue lowmask, himask;
    935     // if the value to store don't fill up the an entire 128 bits, zero
    936     // out the last bits of the mask so that only the value we want to store
    937     // is masked.
    938     // this is e.g. in the case of store i32, align 2
    939     if (!VT.isVector()){
    940       Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
    941       lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
    942       lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
    943                                                                surplus);
    944       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
    945       Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
    946 
    947     }
    948     else {
    949       lowmask = ones;
    950       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
    951     }
    952     // this will zero, if there are no data that goes to the high quad
    953     himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
    954                                                             offset_compl);
    955     lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
    956                                                              offset);
    957 
    958     // Load in the old data and zero out the parts that will be overwritten with
    959     // the new data to store.
    960     SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
    961                                DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
    962                                            DAG.getConstant( 16, PtrVT)),
    963                                highMemPtr,
    964                                SN->isVolatile(), SN->isNonTemporal(),
    965                                false, 16);
    966     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
    967                                                               hi.getValue(1));
    968 
    969     low = DAG.getNode(ISD::AND, dl, MVT::i128,
    970                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
    971                         DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
    972     hi = DAG.getNode(ISD::AND, dl, MVT::i128,
    973                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
    974                         DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));
    975 
    976     // Shift the Value to store into place. rlow contains the parts that go to
    977     // the lower memory chunk, rhi has the parts that go to the upper one.
    978     SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
    979     rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
    980     SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
    981                                                             offset_compl);
    982 
    983     // Merge the old data and the new data and store the results
    984     // Need to convert vectors here to integer as 'OR'ing floats assert
    985     rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
    986                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
    987                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
    988     rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
    989                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
    990                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));
    991 
    992     low = DAG.getStore(the_chain, dl, rlow, basePtr,
    993                           lowMemPtr,
    994                           SN->isVolatile(), SN->isNonTemporal(), 16);
    995     hi  = DAG.getStore(the_chain, dl, rhi,
    996                             DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
    997                                         DAG.getConstant( 16, PtrVT)),
    998                             highMemPtr,
    999                             SN->isVolatile(), SN->isNonTemporal(), 16);
   1000     result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
   1001                                                            hi.getValue(0));
   1002   }
   1003 
   1004   return result;
   1005 }
   1006 
   1007 //! Generate the address of a constant pool entry.
   1008 static SDValue
   1009 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1010   EVT PtrVT = Op.getValueType();
   1011   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   1012   const Constant *C = CP->getConstVal();
   1013   SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
   1014   SDValue Zero = DAG.getConstant(0, PtrVT);
   1015   const TargetMachine &TM = DAG.getTarget();
   1016   // FIXME there is no actual debug info here
   1017   DebugLoc dl = Op.getDebugLoc();
   1018 
   1019   if (TM.getRelocationModel() == Reloc::Static) {
   1020     if (!ST->usingLargeMem()) {
   1021       // Just return the SDValue with the constant pool address in it.
   1022       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
   1023     } else {
   1024       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
   1025       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
   1026       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1027     }
   1028   }
   1029 
   1030   llvm_unreachable("LowerConstantPool: Relocation model other than static"
   1031                    " not supported.");
   1032 }
   1033 
   1034 //! Alternate entry point for generating the address of a constant pool entry
   1035 SDValue
   1036 SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
   1037   return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
   1038 }
   1039 
   1040 static SDValue
   1041 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1042   EVT PtrVT = Op.getValueType();
   1043   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   1044   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
   1045   SDValue Zero = DAG.getConstant(0, PtrVT);
   1046   const TargetMachine &TM = DAG.getTarget();
   1047   // FIXME there is no actual debug info here
   1048   DebugLoc dl = Op.getDebugLoc();
   1049 
   1050   if (TM.getRelocationModel() == Reloc::Static) {
   1051     if (!ST->usingLargeMem()) {
   1052       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
   1053     } else {
   1054       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
   1055       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
   1056       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1057     }
   1058   }
   1059 
   1060   llvm_unreachable("LowerJumpTable: Relocation model other than static"
   1061                    " not supported.");
   1062 }
   1063 
   1064 static SDValue
   1065 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1066   EVT PtrVT = Op.getValueType();
   1067   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
   1068   const GlobalValue *GV = GSDN->getGlobal();
   1069   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
   1070                                           PtrVT, GSDN->getOffset());
   1071   const TargetMachine &TM = DAG.getTarget();
   1072   SDValue Zero = DAG.getConstant(0, PtrVT);
   1073   // FIXME there is no actual debug info here
   1074   DebugLoc dl = Op.getDebugLoc();
   1075 
   1076   if (TM.getRelocationModel() == Reloc::Static) {
   1077     if (!ST->usingLargeMem()) {
   1078       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
   1079     } else {
   1080       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
   1081       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
   1082       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1083     }
   1084   } else {
   1085     report_fatal_error("LowerGlobalAddress: Relocation model other than static"
   1086                       "not supported.");
   1087     /*NOTREACHED*/
   1088   }
   1089 }
   1090 
   1091 //! Custom lower double precision floating point constants
   1092 static SDValue
   1093 LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
   1094   EVT VT = Op.getValueType();
   1095   // FIXME there is no actual debug info here
   1096   DebugLoc dl = Op.getDebugLoc();
   1097 
   1098   if (VT == MVT::f64) {
   1099     ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
   1100 
   1101     assert((FP != 0) &&
   1102            "LowerConstantFP: Node is not ConstantFPSDNode");
   1103 
   1104     uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
   1105     SDValue T = DAG.getConstant(dbits, MVT::i64);
   1106     SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
   1107     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   1108                        DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
   1109   }
   1110 
   1111   return SDValue();
   1112 }
   1113 
// Lower incoming formal arguments.  Values assigned to registers by CCC_SPU
// are copied out of their physical registers into fresh virtual registers;
// the remainder are loaded from fixed stack slots just below the minimum
// stack frame.  For varargs functions, every unused argument register is
// spilled to a consecutive stack slot so va_arg can walk them.
SDValue
SPUTargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv, bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg>
                                          &Ins,
                                        DebugLoc dl, SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MachineRegisterInfo &RegInfo = MF.getRegInfo();
  SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();

  // First stack-passed argument lives just past the minimum frame area.
  unsigned ArgOffset = SPUFrameLowering::minStackSize();
  unsigned ArgRegIdx = 0;
  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Run the calling-convention analysis to assign each argument a register
  // or a stack location.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
		 getTargetMachine(), ArgLocs, *DAG.getContext());
  // FIXME: allow for other calling conventions
  CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);

  // Add DAG nodes to load the arguments or copy them out of registers.
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    EVT ObjectVT = Ins[ArgNo].VT;
    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
    SDValue ArgVal;
    CCValAssign &VA = ArgLocs[ArgNo];

    if (VA.isRegLoc()) {
      // Register-assigned argument: pick the register class matching the
      // value type and copy the physical register into a virtual one.
      const TargetRegisterClass *ArgRegClass;

      switch (ObjectVT.getSimpleVT().SimpleTy) {
      default:
        report_fatal_error("LowerFormalArguments Unhandled argument type: " +
                           Twine(ObjectVT.getEVTString()));
      case MVT::i8:
        ArgRegClass = &SPU::R8CRegClass;
        break;
      case MVT::i16:
        ArgRegClass = &SPU::R16CRegClass;
        break;
      case MVT::i32:
        ArgRegClass = &SPU::R32CRegClass;
        break;
      case MVT::i64:
        ArgRegClass = &SPU::R64CRegClass;
        break;
      case MVT::i128:
        ArgRegClass = &SPU::GPRCRegClass;
        break;
      case MVT::f32:
        ArgRegClass = &SPU::R32FPRegClass;
        break;
      case MVT::f64:
        ArgRegClass = &SPU::R64FPRegClass;
        break;
      case MVT::v2f64:
      case MVT::v4f32:
      case MVT::v2i64:
      case MVT::v4i32:
      case MVT::v8i16:
      case MVT::v16i8:
        ArgRegClass = &SPU::VECREGRegClass;
        break;
      }

      unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
      RegInfo.addLiveIn(VA.getLocReg(), VReg);
      ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
      ++ArgRegIdx;
    } else {
      // We need to load the argument to a virtual register if we determined
      // above that we ran out of physical registers of the appropriate type
      // or we're forced to do vararg
      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
                           false, false, false, 0);
      ArgOffset += StackSlotSize;
    }

    InVals.push_back(ArgVal);
    // Update the chain (operand 0 of both CopyFromReg and Load is the chain)
    Chain = ArgVal.getOperand(0);
  }

  // vararg handling:
  if (isVarArg) {
    // FIXME: we should be able to query the argument registers from
    //        tablegen generated code.
    static const uint16_t ArgRegs[] = {
      SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
      SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
      SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
      SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
      SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
      SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
      SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
      SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
      SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
      SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
      SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
    };
    // size of ArgRegs array (R3..R79 inclusive)
    const unsigned NumArgRegs = 77;

    // We will spill (79-3)+1 registers to the stack
    SmallVector<SDValue, 79-3+1> MemOps;

    // Create the frame slot
    // NOTE(review): setVarArgsFrameIndex is overwritten on each iteration,
    // so after the loop it refers to the LAST spill slot — confirm this is
    // what the va_arg lowering expects.
    for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
      FuncInfo->setVarArgsFrameIndex(
        MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
      SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
      SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
      SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
                                   false, false, 0);
      Chain = Store.getOperand(0);
      MemOps.push_back(Store);

      // Increment address by stack slot size for the next stored argument
      ArgOffset += StackSlotSize;
    }
    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOps[0], MemOps.size());
  }

  return Chain;
}
   1250 
   1251 /// isLSAAddress - Return the immediate to use if the specified
   1252 /// value is representable as a LSA address.
   1253 static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
   1254   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
   1255   if (!C) return 0;
   1256 
   1257   int Addr = C->getZExtValue();
   1258   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
   1259       (Addr << 14 >> 14) != Addr)
   1260     return 0;  // Top 14 bits have to be sext of immediate.
   1261 
   1262   return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
   1263 }
   1264 
/// LowerCall - Lower an outgoing call for CellSPU.  Outgoing arguments are
/// assigned to registers (or, once the registers run out, to consecutive
/// stack slots) by CCC_SPU; the callee address is materialized according to
/// the memory model (PC-relative/A-form in the default model, X-form
/// indirect in large-memory mode); an SPUISD::CALL node is emitted; and any
/// return values are copied out of the convention's result registers into
/// InVals.  Tail calls are not supported and isTailCall is forced false.
SDValue
SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool doesNotRet, bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<SDValue> &OutVals,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const {
  // CellSPU target does not yet support tail call optimization.
  isTailCall = false;

  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  unsigned NumOps     = Outs.size();
  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();

  // Run the calling convention to decide which arguments go in registers.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  // FIXME: allow for other calling conventions
  CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);

  // Arguments with index < NumArgRegs travel in registers; the rest on the
  // stack (see the loop below).
  const unsigned NumArgRegs = ArgLocs.size();


  // Handy pointer type
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory.
  unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
  unsigned ArgRegIdx = 0;

  // Keep track of registers passing arguments
  std::vector<std::pair<unsigned, SDValue> > RegsToPass;
  // And the arguments passed on the stack
  SmallVector<SDValue, 8> MemOpChains;

  for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
    SDValue Arg = OutVals[ArgRegIdx];
    CCValAssign &VA = ArgLocs[ArgRegIdx];

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i8:
    case MVT::i16:
    case MVT::i32:
    case MVT::i64:
    case MVT::i128:
    case MVT::f32:
    case MVT::f64:
    case MVT::v2i64:
    case MVT::v2f64:
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      // Register if one is still available, otherwise spill to the next
      // stack slot below the caller's linkage area.
      if (ArgRegIdx != NumArgRegs) {
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      } else {
        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
                                           MachinePointerInfo(),
                                           false, false, 0));
        ArgOffset += StackSlotSize;
      }
      break;
    }
  }

  // Accumulate how many bytes are to be pushed on the stack, including the
  // linkage area, and parameter passing area.  According to the SPU ABI,
  // we minimally need space for [LR] and [SP].
  unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();

  // Insert a call sequence start
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
                                                            true));

  if (!MemOpChains.empty()) {
    // Adjust the stack pointer for the stack arguments.
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  SmallVector<SDValue, 8> Ops;
  unsigned CallOpc = SPUISD::CALL;

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);

    if (!ST->usingLargeMem()) {
      // Turn calls to targets that are defined (i.e., have bodies) into BRSL
      // style calls, otherwise, external symbols are BRASL calls. This assumes
      // that declared/defined symbols are in the same compilation unit and can
      // be reached through PC-relative jumps.
      //
      // NOTE:
      // This may be an unsafe assumption for JIT and really large compilation
      // units.
      if (GV->isDeclaration()) {
        Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
      } else {
        Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
      }
    } else {
      // "Large memory" mode: Turn all calls into indirect calls with a X-form
      // address pairs:
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
        Callee.getValueType());

    // External symbols: A-form absolute address in the default model,
    // X-form indirect address in large-memory mode.
    if (!ST->usingLargeMem()) {
      Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
    } else {
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
    }
  } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
    // If this is an absolute destination address that appears to be a legal
    // local store address, use the munged value.
    Callee = SDValue(Dest, 0);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.getNode())
    Ops.push_back(InFlag);
  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
                      &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
                             DAG.getIntPtrConstant(0, true), InFlag);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // If the function returns void, just return the chain.
  if (Ins.empty())
    return Chain;

  // Now handle the return value(s)
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                    getTargetMachine(), RVLocs, *DAG.getContext());
  CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);


  // If the call has results, copy the values out of the ret val registers,
  // glued to the call so nothing can clobber them in between.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                                     InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);
    InVals.push_back(Val);
   }

  return Chain;
}
   1461 
   1462 SDValue
   1463 SPUTargetLowering::LowerReturn(SDValue Chain,
   1464                                CallingConv::ID CallConv, bool isVarArg,
   1465                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1466                                const SmallVectorImpl<SDValue> &OutVals,
   1467                                DebugLoc dl, SelectionDAG &DAG) const {
   1468 
   1469   SmallVector<CCValAssign, 16> RVLocs;
   1470   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1471 		 getTargetMachine(), RVLocs, *DAG.getContext());
   1472   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
   1473 
   1474   // If this is the first return lowered for this function, add the regs to the
   1475   // liveout set for the function.
   1476   if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
   1477     for (unsigned i = 0; i != RVLocs.size(); ++i)
   1478       DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
   1479   }
   1480 
   1481   SDValue Flag;
   1482 
   1483   // Copy the result values into the output registers.
   1484   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1485     CCValAssign &VA = RVLocs[i];
   1486     assert(VA.isRegLoc() && "Can only return in registers!");
   1487     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
   1488                              OutVals[i], Flag);
   1489     Flag = Chain.getValue(1);
   1490   }
   1491 
   1492   if (Flag.getNode())
   1493     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
   1494   else
   1495     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
   1496 }
   1497 
   1498 
   1499 //===----------------------------------------------------------------------===//
   1500 // Vector related lowering:
   1501 //===----------------------------------------------------------------------===//
   1502 
   1503 static ConstantSDNode *
   1504 getVecImm(SDNode *N) {
   1505   SDValue OpVal(0, 0);
   1506 
   1507   // Check to see if this buildvec has a single non-undef value in its elements.
   1508   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
   1509     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
   1510     if (OpVal.getNode() == 0)
   1511       OpVal = N->getOperand(i);
   1512     else if (OpVal != N->getOperand(i))
   1513       return 0;
   1514   }
   1515 
   1516   if (OpVal.getNode() != 0) {
   1517     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
   1518       return CN;
   1519     }
   1520   }
   1521 
   1522   return 0;
   1523 }
   1524 
   1525 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
   1526 /// and the value fits into an unsigned 18-bit constant, and if so, return the
   1527 /// constant
   1528 SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
   1529                               EVT ValueType) {
   1530   if (ConstantSDNode *CN = getVecImm(N)) {
   1531     uint64_t Value = CN->getZExtValue();
   1532     if (ValueType == MVT::i64) {
   1533       uint64_t UValue = CN->getZExtValue();
   1534       uint32_t upper = uint32_t(UValue >> 32);
   1535       uint32_t lower = uint32_t(UValue);
   1536       if (upper != lower)
   1537         return SDValue();
   1538       Value = Value >> 32;
   1539     }
   1540     if (Value <= 0x3ffff)
   1541       return DAG.getTargetConstant(Value, ValueType);
   1542   }
   1543 
   1544   return SDValue();
   1545 }
   1546 
   1547 /// get_vec_i16imm - Test if this vector is a vector filled with the same value
   1548 /// and the value fits into a signed 16-bit constant, and if so, return the
   1549 /// constant
   1550 SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
   1551                               EVT ValueType) {
   1552   if (ConstantSDNode *CN = getVecImm(N)) {
   1553     int64_t Value = CN->getSExtValue();
   1554     if (ValueType == MVT::i64) {
   1555       uint64_t UValue = CN->getZExtValue();
   1556       uint32_t upper = uint32_t(UValue >> 32);
   1557       uint32_t lower = uint32_t(UValue);
   1558       if (upper != lower)
   1559         return SDValue();
   1560       Value = Value >> 32;
   1561     }
   1562     if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
   1563       return DAG.getTargetConstant(Value, ValueType);
   1564     }
   1565   }
   1566 
   1567   return SDValue();
   1568 }
   1569 
   1570 /// get_vec_i10imm - Test if this vector is a vector filled with the same value
   1571 /// and the value fits into a signed 10-bit constant, and if so, return the
   1572 /// constant
   1573 SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
   1574                               EVT ValueType) {
   1575   if (ConstantSDNode *CN = getVecImm(N)) {
   1576     int64_t Value = CN->getSExtValue();
   1577     if (ValueType == MVT::i64) {
   1578       uint64_t UValue = CN->getZExtValue();
   1579       uint32_t upper = uint32_t(UValue >> 32);
   1580       uint32_t lower = uint32_t(UValue);
   1581       if (upper != lower)
   1582         return SDValue();
   1583       Value = Value >> 32;
   1584     }
   1585     if (isInt<10>(Value))
   1586       return DAG.getTargetConstant(Value, ValueType);
   1587   }
   1588 
   1589   return SDValue();
   1590 }
   1591 
/// get_vec_i8imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 8-bit constant, and if so, return the
/// constant.
///
/// @note: The incoming vector is v16i8 because that's the only way we can load
/// constant vectors. Thus, we test to see if the upper and lower bytes are the
/// same value.
SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
                             EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    int Value = (int) CN->getZExtValue();
    // i16: accept only a halfword whose high byte matches its low byte and
    // return that single byte.  NOTE(review): the high byte is extracted with
    // an arithmetic shift of a (short), so bytes with the sign bit set
    // (>= 0x80) are sign-extended and fail the comparison — this looks
    // deliberately conservative, but confirm before relying on it.
    if (ValueType == MVT::i16
        && Value <= 0xffff                 /* truncated from uint64_t */
        && ((short) Value >> 8) == ((short) Value & 0xff))
      return DAG.getTargetConstant(Value & 0xff, ValueType);
    // i8: the splat value must already fit in a single byte.
    else if (ValueType == MVT::i8
             && (Value & 0xff) == Value)
      return DAG.getTargetConstant(Value, ValueType);
  }

  return SDValue();
}
   1614 
   1615 /// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
   1616 /// and the value fits into a signed 16-bit constant, and if so, return the
   1617 /// constant
   1618 SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
   1619                                EVT ValueType) {
   1620   if (ConstantSDNode *CN = getVecImm(N)) {
   1621     uint64_t Value = CN->getZExtValue();
   1622     if ((ValueType == MVT::i32
   1623           && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
   1624         || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
   1625       return DAG.getTargetConstant(Value >> 16, ValueType);
   1626   }
   1627 
   1628   return SDValue();
   1629 }
   1630 
   1631 /// get_v4i32_imm - Catch-all for general 32-bit constant vectors
   1632 SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
   1633   if (ConstantSDNode *CN = getVecImm(N)) {
   1634     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
   1635   }
   1636 
   1637   return SDValue();
   1638 }
   1639 
   1640 /// get_v4i32_imm - Catch-all for general 64-bit constant vectors
   1641 SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
   1642   if (ConstantSDNode *CN = getVecImm(N)) {
   1643     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
   1644   }
   1645 
   1646   return SDValue();
   1647 }
   1648 
//! Lower a BUILD_VECTOR instruction creatively:
//
// If the vector is a constant splat (all defined elements equal, to at
// least 16-bit granularity), emit the cheapest constant-vector idiom for
// the element type; otherwise return an empty SDValue so the caller falls
// back to the default expansion.
static SDValue
LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();
  BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
  unsigned minSplatBits = EltVT.getSizeInBits();

  // Never ask isConstantSplat for a splat narrower than 16 bits.
  if (minSplatBits < 16)
    minSplatBits = 16;

  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;

  if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                            HasAnyUndefs, minSplatBits)
      || minSplatBits < SplatBitSize)
    return SDValue();   // Wasn't a constant vector or splat exceeded min

  uint64_t SplatBits = APSplatBits.getZExtValue();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
                       Twine(VT.getEVTString()));
    /*NOTREACHED*/
  case MVT::v4f32: {
    uint32_t Value32 = uint32_t(SplatBits);
    assert(SplatBitSize == 32
           && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(Value32, MVT::i32);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
  }
  case MVT::v2f64: {
    uint64_t f64val = uint64_t(SplatBits);
    assert(SplatBitSize == 64
           && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(f64val, MVT::i64);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
  }
  case MVT::v16i8: {
   // 8-bit constants have to be expanded to 16-bits
   unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
   SmallVector<SDValue, 8> Ops;

   // Build a v8i16 splat and bitcast it back to v16i8.
   Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
  }
  case MVT::v8i16: {
    unsigned short Value16 = SplatBits;
    SDValue T = DAG.getConstant(Value16, EltVT);
    SmallVector<SDValue, 8> Ops;

    Ops.assign(8, T);
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
  }
  case MVT::v4i32: {
    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
  }
  case MVT::v2i64: {
    // 64-bit splats need special handling (v4i32 splat, constant pool, or
    // SHUFB merge); see SPU::LowerV2I64Splat.
    return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
  }
  }
}
   1722 
   1723 /*!
   1724  */
   1725 SDValue
   1726 SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
   1727                      DebugLoc dl) {
   1728   uint32_t upper = uint32_t(SplatVal >> 32);
   1729   uint32_t lower = uint32_t(SplatVal);
   1730 
   1731   if (upper == lower) {
   1732     // Magic constant that can be matched by IL, ILA, et. al.
   1733     SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
   1734     return DAG.getNode(ISD::BITCAST, dl, OpVT,
   1735                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1736                                    Val, Val, Val, Val));
   1737   } else {
   1738     bool upper_special, lower_special;
   1739 
   1740     // NOTE: This code creates common-case shuffle masks that can be easily
   1741     // detected as common expressions. It is not attempting to create highly
   1742     // specialized masks to replace any and all 0's, 0xff's and 0x80's.
   1743 
   1744     // Detect if the upper or lower half is a special shuffle mask pattern:
   1745     upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
   1746     lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
   1747 
   1748     // Both upper and lower are special, lower to a constant pool load:
   1749     if (lower_special && upper_special) {
   1750       SDValue UpperVal = DAG.getConstant(upper, MVT::i32);
   1751       SDValue LowerVal = DAG.getConstant(lower, MVT::i32);
   1752       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1753                          UpperVal, LowerVal, UpperVal, LowerVal);
   1754       return DAG.getNode(ISD::BITCAST, dl, OpVT, BV);
   1755     }
   1756 
   1757     SDValue LO32;
   1758     SDValue HI32;
   1759     SmallVector<SDValue, 16> ShufBytes;
   1760     SDValue Result;
   1761 
   1762     // Create lower vector if not a special pattern
   1763     if (!lower_special) {
   1764       SDValue LO32C = DAG.getConstant(lower, MVT::i32);
   1765       LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
   1766                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1767                                      LO32C, LO32C, LO32C, LO32C));
   1768     }
   1769 
   1770     // Create upper vector if not a special pattern
   1771     if (!upper_special) {
   1772       SDValue HI32C = DAG.getConstant(upper, MVT::i32);
   1773       HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
   1774                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1775                                      HI32C, HI32C, HI32C, HI32C));
   1776     }
   1777 
   1778     // If either upper or lower are special, then the two input operands are
   1779     // the same (basically, one of them is a "don't care")
   1780     if (lower_special)
   1781       LO32 = HI32;
   1782     if (upper_special)
   1783       HI32 = LO32;
   1784 
   1785     for (int i = 0; i < 4; ++i) {
   1786       uint64_t val = 0;
   1787       for (int j = 0; j < 4; ++j) {
   1788         SDValue V;
   1789         bool process_upper, process_lower;
   1790         val <<= 8;
   1791         process_upper = (upper_special && (i & 1) == 0);
   1792         process_lower = (lower_special && (i & 1) == 1);
   1793 
   1794         if (process_upper || process_lower) {
   1795           if ((process_upper && upper == 0)
   1796                   || (process_lower && lower == 0))
   1797             val |= 0x80;
   1798           else if ((process_upper && upper == 0xffffffff)
   1799                   || (process_lower && lower == 0xffffffff))
   1800             val |= 0xc0;
   1801           else if ((process_upper && upper == 0x80000000)
   1802                   || (process_lower && lower == 0x80000000))
   1803             val |= (j == 0 ? 0xe0 : 0x80);
   1804         } else
   1805           val |= i * 4 + j + ((i & 1) * 16);
   1806       }
   1807 
   1808       ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
   1809     }
   1810 
   1811     return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
   1812                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1813                                    &ShufBytes[0], ShufBytes.size()));
   1814   }
   1815 }
   1816 
/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
/// which the Cell can operate. The code inspects V3 to ascertain whether the
/// permutation vector, V3, is monotonically increasing with one "exception"
/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
/// In either case, the net result is going to eventually invoke SHUFB to
/// permute/shuffle the bytes from V1 and V2.
/// \note
/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
/// control word for byte/halfword/word insertion. This takes care of a single
/// element move from V2 into V1.
/// \note
/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();

  // An undef second operand behaves like a duplicate of the first.
  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;

  // If we have a single element being moved from V1 to V2, this can be handled
  // using the C*[DX] compute mask instructions, but the vector elements have
  // to be monotonically increasing with one exception element, and the source
  // slot of the element to move must be the same as the destination.
  EVT VecVT = V1.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned EltsFromV2 = 0;        // how many mask entries index into V2
  unsigned V2EltOffset = 0;       // byte offset of the V2 element to insert
  unsigned V2EltIdx0 = 0;         // first mask index that refers to V2
  unsigned CurrElt = 0;           // next expected index for the monotonic scan
  unsigned MaxElts = VecVT.getVectorNumElements();
  unsigned PrevElt = 0;
  bool monotonic = true;          // mask is 0,1,2,... with one V2 exception
  bool rotate = true;             // mask is a rotation of V1's elements
  int rotamt=0;
  EVT maskVT;             // which of the c?d instructions to use

  // Element size determines where V2's indices start and which mask VT the
  // SHUFFLE_MASK (c?d) node should produce.
  if (EltVT == MVT::i8) {
    V2EltIdx0 = 16;
    maskVT = MVT::v16i8;
  } else if (EltVT == MVT::i16) {
    V2EltIdx0 = 8;
    maskVT = MVT::v8i16;
  } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
    V2EltIdx0 = 4;
    maskVT = MVT::v4i32;
  } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
    V2EltIdx0 = 2;
    maskVT = MVT::v2i64;
  } else
    llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");

  // One pass over the mask classifies it as monotonic-with-one-insert,
  // a rotation, or neither.  Undef (-1) entries are skipped.
  for (unsigned i = 0; i != MaxElts; ++i) {
    if (SVN->getMaskElt(i) < 0)
      continue;

    unsigned SrcElt = SVN->getMaskElt(i);

    if (monotonic) {
      if (SrcElt >= V2EltIdx0) {
        // TODO: optimize for the monotonic case when several consecutive
        // elements are taken form V2. Do we ever get such a case?
        if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
          V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
        else
          monotonic = false;
        ++EltsFromV2;
      } else if (CurrElt != SrcElt) {
        monotonic = false;
      }

      ++CurrElt;
    }

    if (rotate) {
      if (PrevElt > 0 && SrcElt < MaxElts) {
        // Consecutive indices (with wrap from MaxElts-1 to 0) keep the
        // rotation hypothesis alive.
        if ((PrevElt == SrcElt - 1)
            || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
          PrevElt = SrcElt;
        } else {
          rotate = false;
        }
      } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
        // First time or after a "wrap around"
        rotamt = SrcElt-i;
        PrevElt = SrcElt;
      } else {
        // This isn't a rotation, takes elements from vector 2
        rotate = false;
      }
    }
  }

  if (EltsFromV2 == 1 && monotonic) {
    // Compute mask and shuffle
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

    // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
    // R1 ($sp) is used here only as it is guaranteed to have last bits zero
    SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                DAG.getRegister(SPU::R1, PtrVT),
                                DAG.getConstant(V2EltOffset, MVT::i32));
    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
                                     maskVT, Pointer);

    // Use shuffle mask in SHUFB synthetic instruction:
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
                       ShufMaskOp);
  } else if (rotate) {
    // Normalize a negative rotation amount, then convert elements -> bytes
    // for ROTBYTES_LEFT.
    if (rotamt < 0)
      rotamt +=MaxElts;
    rotamt *= EltVT.getSizeInBits()/8;
    return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
                       V1, DAG.getConstant(rotamt, MVT::i16));
  } else {
   // Convert the SHUFFLE_VECTOR mask's input element units to the
   // actual bytes.
    unsigned BytesPerElement = EltVT.getSizeInBits()/8;

    // General case: materialize the byte-level permutation mask as a
    // constant v16i8 and emit a plain SHUFB.  Undef entries select byte 0.
    SmallVector<SDValue, 16> ResultMask;
    for (unsigned i = 0, e = MaxElts; i != e; ++i) {
      unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);

      for (unsigned j = 0; j < BytesPerElement; ++j)
        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
    }
    SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
                                    &ResultMask[0], ResultMask.size());
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
  }
}
   1949 
   1950 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   1951   SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
   1952   DebugLoc dl = Op.getDebugLoc();
   1953 
   1954   if (Op0.getNode()->getOpcode() == ISD::Constant) {
   1955     // For a constant, build the appropriate constant vector, which will
   1956     // eventually simplify to a vector register load.
   1957 
   1958     ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
   1959     SmallVector<SDValue, 16> ConstVecValues;
   1960     EVT VT;
   1961     size_t n_copies;
   1962 
   1963     // Create a constant vector:
   1964     switch (Op.getValueType().getSimpleVT().SimpleTy) {
   1965     default: llvm_unreachable("Unexpected constant value type in "
   1966                               "LowerSCALAR_TO_VECTOR");
   1967     case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
   1968     case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
   1969     case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
   1970     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
   1971     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
   1972     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
   1973     }
   1974 
   1975     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
   1976     for (size_t j = 0; j < n_copies; ++j)
   1977       ConstVecValues.push_back(CValue);
   1978 
   1979     return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
   1980                        &ConstVecValues[0], ConstVecValues.size());
   1981   } else {
   1982     // Otherwise, copy the value from one register to another:
   1983     switch (Op0.getValueType().getSimpleVT().SimpleTy) {
   1984     default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
   1985     case MVT::i8:
   1986     case MVT::i16:
   1987     case MVT::i32:
   1988     case MVT::i64:
   1989     case MVT::f32:
   1990     case MVT::f64:
   1991       return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
   1992     }
   1993   }
   1994 }
   1995 
   1996 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   1997   EVT VT = Op.getValueType();
   1998   SDValue N = Op.getOperand(0);
   1999   SDValue Elt = Op.getOperand(1);
   2000   DebugLoc dl = Op.getDebugLoc();
   2001   SDValue retval;
   2002 
   2003   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
   2004     // Constant argument:
   2005     int EltNo = (int) C->getZExtValue();
   2006 
   2007     // sanity checks:
   2008     if (VT == MVT::i8 && EltNo >= 16)
   2009       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
   2010     else if (VT == MVT::i16 && EltNo >= 8)
   2011       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
   2012     else if (VT == MVT::i32 && EltNo >= 4)
   2013       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
   2014     else if (VT == MVT::i64 && EltNo >= 2)
   2015       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
   2016 
   2017     if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
   2018       // i32 and i64: Element 0 is the preferred slot
   2019       return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
   2020     }
   2021 
   2022     // Need to generate shuffle mask and extract:
   2023     int prefslot_begin = -1, prefslot_end = -1;
   2024     int elt_byte = EltNo * VT.getSizeInBits() / 8;
   2025 
   2026     switch (VT.getSimpleVT().SimpleTy) {
   2027     default: llvm_unreachable("Invalid value type!");
   2028     case MVT::i8: {
   2029       prefslot_begin = prefslot_end = 3;
   2030       break;
   2031     }
   2032     case MVT::i16: {
   2033       prefslot_begin = 2; prefslot_end = 3;
   2034       break;
   2035     }
   2036     case MVT::i32:
   2037     case MVT::f32: {
   2038       prefslot_begin = 0; prefslot_end = 3;
   2039       break;
   2040     }
   2041     case MVT::i64:
   2042     case MVT::f64: {
   2043       prefslot_begin = 0; prefslot_end = 7;
   2044       break;
   2045     }
   2046     }
   2047 
   2048     assert(prefslot_begin != -1 && prefslot_end != -1 &&
   2049            "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
   2050 
   2051     unsigned int ShufBytes[16] = {
   2052       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
   2053     };
   2054     for (int i = 0; i < 16; ++i) {
   2055       // zero fill uppper part of preferred slot, don't care about the
   2056       // other slots:
   2057       unsigned int mask_val;
   2058       if (i <= prefslot_end) {
   2059         mask_val =
   2060           ((i < prefslot_begin)
   2061            ? 0x80
   2062            : elt_byte + (i - prefslot_begin));
   2063 
   2064         ShufBytes[i] = mask_val;
   2065       } else
   2066         ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
   2067     }
   2068 
   2069     SDValue ShufMask[4];
   2070     for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
   2071       unsigned bidx = i * 4;
   2072       unsigned int bits = ((ShufBytes[bidx] << 24) |
   2073                            (ShufBytes[bidx+1] << 16) |
   2074                            (ShufBytes[bidx+2] << 8) |
   2075                            ShufBytes[bidx+3]);
   2076       ShufMask[i] = DAG.getConstant(bits, MVT::i32);
   2077     }
   2078 
   2079     SDValue ShufMaskVec =
   2080       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2081                   &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
   2082 
   2083     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   2084                          DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
   2085                                      N, N, ShufMaskVec));
   2086   } else {
   2087     // Variable index: Rotate the requested element into slot 0, then replicate
   2088     // slot 0 across the vector
   2089     EVT VecVT = N.getValueType();
   2090     if (!VecVT.isSimple() || !VecVT.isVector()) {
   2091       report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
   2092                         "vector type!");
   2093     }
   2094 
   2095     // Make life easier by making sure the index is zero-extended to i32
   2096     if (Elt.getValueType() != MVT::i32)
   2097       Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
   2098 
   2099     // Scale the index to a bit/byte shift quantity
   2100     APInt scaleFactor =
   2101             APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
   2102     unsigned scaleShift = scaleFactor.logBase2();
   2103     SDValue vecShift;
   2104 
   2105     if (scaleShift > 0) {
   2106       // Scale the shift factor:
   2107       Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
   2108                         DAG.getConstant(scaleShift, MVT::i32));
   2109     }
   2110 
   2111     vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
   2112 
   2113     // Replicate the bytes starting at byte 0 across the entire vector (for
   2114     // consistency with the notion of a unified register set)
   2115     SDValue replicate;
   2116 
   2117     switch (VT.getSimpleVT().SimpleTy) {
   2118     default:
   2119       report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
   2120                         "type");
   2121       /*NOTREACHED*/
   2122     case MVT::i8: {
   2123       SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
   2124       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2125                               factor, factor, factor, factor);
   2126       break;
   2127     }
   2128     case MVT::i16: {
   2129       SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
   2130       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2131                               factor, factor, factor, factor);
   2132       break;
   2133     }
   2134     case MVT::i32:
   2135     case MVT::f32: {
   2136       SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
   2137       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2138                               factor, factor, factor, factor);
   2139       break;
   2140     }
   2141     case MVT::i64:
   2142     case MVT::f64: {
   2143       SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
   2144       SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
   2145       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2146                               loFactor, hiFactor, loFactor, hiFactor);
   2147       break;
   2148     }
   2149     }
   2150 
   2151     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   2152                          DAG.getNode(SPUISD::SHUFB, dl, VecVT,
   2153                                      vecShift, vecShift, replicate));
   2154   }
   2155 
   2156   return retval;
   2157 }
   2158 
   2159 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   2160   SDValue VecOp = Op.getOperand(0);
   2161   SDValue ValOp = Op.getOperand(1);
   2162   SDValue IdxOp = Op.getOperand(2);
   2163   DebugLoc dl = Op.getDebugLoc();
   2164   EVT VT = Op.getValueType();
   2165   EVT eltVT = ValOp.getValueType();
   2166 
   2167   // use 0 when the lane to insert to is 'undef'
   2168   int64_t Offset=0;
   2169   if (IdxOp.getOpcode() != ISD::UNDEF) {
   2170     ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
   2171     assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
   2172     Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
   2173   }
   2174 
   2175   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   2176   // Use $sp ($1) because it's always 16-byte aligned and it's available:
   2177   SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
   2178                                 DAG.getRegister(SPU::R1, PtrVT),
   2179                                 DAG.getConstant(Offset, PtrVT));
   2180   // widen the mask when dealing with half vectors
   2181   EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
   2182                                 128/ VT.getVectorElementType().getSizeInBits());
   2183   SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
   2184 
   2185   SDValue result =
   2186     DAG.getNode(SPUISD::SHUFB, dl, VT,
   2187                 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
   2188                 VecOp,
   2189                 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));
   2190 
   2191   return result;
   2192 }
   2193 
   2194 static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
   2195                            const TargetLowering &TLI)
   2196 {
   2197   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
   2198   DebugLoc dl = Op.getDebugLoc();
   2199   EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
   2200 
   2201   assert(Op.getValueType() == MVT::i8);
   2202   switch (Opc) {
   2203   default:
   2204     llvm_unreachable("Unhandled i8 math operator");
   2205   case ISD::ADD: {
   2206     // 8-bit addition: Promote the arguments up to 16-bits and truncate
   2207     // the result:
   2208     SDValue N1 = Op.getOperand(1);
   2209     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2210     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2211     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2212                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2213 
   2214   }
   2215 
   2216   case ISD::SUB: {
   2217     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
   2218     // the result:
   2219     SDValue N1 = Op.getOperand(1);
   2220     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2221     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2222     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2223                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2224   }
   2225   case ISD::ROTR:
   2226   case ISD::ROTL: {
   2227     SDValue N1 = Op.getOperand(1);
   2228     EVT N1VT = N1.getValueType();
   2229 
   2230     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
   2231     if (!N1VT.bitsEq(ShiftVT)) {
   2232       unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
   2233                        ? ISD::ZERO_EXTEND
   2234                        : ISD::TRUNCATE;
   2235       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2236     }
   2237 
   2238     // Replicate lower 8-bits into upper 8:
   2239     SDValue ExpandArg =
   2240       DAG.getNode(ISD::OR, dl, MVT::i16, N0,
   2241                   DAG.getNode(ISD::SHL, dl, MVT::i16,
   2242                               N0, DAG.getConstant(8, MVT::i32)));
   2243 
   2244     // Truncate back down to i8
   2245     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2246                        DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
   2247   }
   2248   case ISD::SRL:
   2249   case ISD::SHL: {
   2250     SDValue N1 = Op.getOperand(1);
   2251     EVT N1VT = N1.getValueType();
   2252 
   2253     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
   2254     if (!N1VT.bitsEq(ShiftVT)) {
   2255       unsigned N1Opc = ISD::ZERO_EXTEND;
   2256 
   2257       if (N1.getValueType().bitsGT(ShiftVT))
   2258         N1Opc = ISD::TRUNCATE;
   2259 
   2260       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2261     }
   2262 
   2263     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2264                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2265   }
   2266   case ISD::SRA: {
   2267     SDValue N1 = Op.getOperand(1);
   2268     EVT N1VT = N1.getValueType();
   2269 
   2270     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2271     if (!N1VT.bitsEq(ShiftVT)) {
   2272       unsigned N1Opc = ISD::SIGN_EXTEND;
   2273 
   2274       if (N1VT.bitsGT(ShiftVT))
   2275         N1Opc = ISD::TRUNCATE;
   2276       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2277     }
   2278 
   2279     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2280                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2281   }
   2282   case ISD::MUL: {
   2283     SDValue N1 = Op.getOperand(1);
   2284 
   2285     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2286     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2287     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2288                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2289   }
   2290   }
   2291 }
   2292 
//! Lower byte immediate operations for v16i8 vectors:
/*!
  Handles the case where one operand of a v16i8 AND/OR/XOR is a constant
  splat.  If a BUILD_VECTOR splat constant is found (possibly behind a
  BITCAST, on either operand), the constant is re-materialized as sixteen
  target-constant copies of its low byte so the byte-immediate instruction
  forms can be selected.  Otherwise the operation is returned unchanged:
  it is legal, just not profitably custom-lowerable.
 */
static SDValue
LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
  SDValue ConstVec;
  SDValue Arg;
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  // Locate the constant operand: try operand 0 first, peeling one BITCAST
  // if present; otherwise fall back to operand 1 (and peel its BITCAST).
  ConstVec = Op.getOperand(0);
  Arg = Op.getOperand(1);
  if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
    if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
      ConstVec = ConstVec.getOperand(0);
    } else {
      // Operand 0 is not the constant; swap roles and look at operand 1.
      ConstVec = Op.getOperand(1);
      Arg = Op.getOperand(0);
      if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
        ConstVec = ConstVec.getOperand(0);
      }
    }
  }

  if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
    BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
    assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");

    APInt APSplatBits, APSplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();

    // Only rewrite when the vector is a constant splat at least as wide as
    // the element type:
    if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                              HasAnyUndefs, minSplatBits)
        && minSplatBits <= SplatBitSize) {
      uint64_t SplatBits = APSplatBits.getZExtValue();
      // Rebuild the constant as 16 copies of the splat's low byte:
      SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);

      SmallVector<SDValue, 16> tcVec;
      tcVec.assign(16, tc);
      return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
                         DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
    }
  }

  // These operations (AND, OR, XOR) are legal, they just couldn't be custom
  // lowered.  Return the operation, rather than a null SDValue.
  return Op;
}
   2341 
//! Custom lowering for CTPOP (count population)
/*!
  Custom lowering code that counts the number ones in the input
  operand. SPU has such an instruction, but it counts the number of
  ones per byte, which then have to be accumulated.

  i8 needs no accumulation; i16 folds two bytes together; i32 folds four
  bytes in two shift/add rounds; i64 falls through (expanded elsewhere).
*/
static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  // Full-quadword vector type with VT elements; CNTB operates on vectors.
  EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
                               VT, (128 / VT.getSizeInBits()));
  DebugLoc dl = Op.getDebugLoc();

  switch (VT.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8: {
    // i8: a single CNTB of the promoted value already holds the per-byte
    // count; just pull element 0 back out.
    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
  }

  case MVT::i16: {
    // i16: add the two per-byte counts ((x >> 8) + x) and mask to the low
    // nibble (max possible count is 16, which fits in 0x0f + carry-free add).
    MachineFunction &MF = DAG.getMachineFunction();
    MachineRegisterInfo &RegInfo = MF.getRegInfo();

    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);

    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i16);
    SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
    SDValue Shift1 = DAG.getConstant(8, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    // CNTB_result becomes the chain to which all of the virtual registers
    // CNTB_reg, SUM1_reg become associated:
    SDValue CNTB_result =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);

    SDValue CNTB_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);

    SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);

    return DAG.getNode(ISD::AND, dl, MVT::i16,
                       DAG.getNode(ISD::ADD, dl, MVT::i16,
                                   DAG.getNode(ISD::SRL, dl, MVT::i16,
                                               Tmp1, Shift1),
                                   Tmp1),
                       Mask0);
  }

  case MVT::i32: {
    // i32: two accumulation rounds — fold halfwords (>>16 + x), then bytes
    // (>>8 + x) — and mask to the low byte (max count is 32).
    MachineFunction &MF = DAG.getMachineFunction();
    MachineRegisterInfo &RegInfo = MF.getRegInfo();

    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);

    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i32);
    SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
    SDValue Shift1 = DAG.getConstant(16, MVT::i32);
    SDValue Shift2 = DAG.getConstant(8, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    // CNTB_result becomes the chain to which all of the virtual registers
    // CNTB_reg, SUM1_reg become associated:
    SDValue CNTB_result =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);

    SDValue CNTB_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);

    // Round 1: fold the upper halfword's counts into the lower halfword.
    SDValue Comp1 =
      DAG.getNode(ISD::SRL, dl, MVT::i32,
                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
                  Shift1);

    SDValue Sum1 =
      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));

    SDValue Sum1_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);

    // Round 2: fold the two remaining byte counts together.
    SDValue Comp2 =
      DAG.getNode(ISD::SRL, dl, MVT::i32,
                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
                  Shift2);
    SDValue Sum2 =
      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));

    return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
  }

  case MVT::i64:
    // i64 is not custom-lowered here; fall through and return the empty
    // SDValue so the default expansion handles it.
    break;
  }

  return SDValue();
}
   2451 
   2452 //! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
   2453 /*!
   2454  f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
   2455  All conversions to i64 are expanded to a libcall.
   2456  */
   2457 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   2458                               const SPUTargetLowering &TLI) {
   2459   EVT OpVT = Op.getValueType();
   2460   SDValue Op0 = Op.getOperand(0);
   2461   EVT Op0VT = Op0.getValueType();
   2462 
   2463   if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
   2464       || OpVT == MVT::i64) {
   2465     // Convert f32 / f64 to i32 / i64 via libcall.
   2466     RTLIB::Libcall LC =
   2467             (Op.getOpcode() == ISD::FP_TO_SINT)
   2468              ? RTLIB::getFPTOSINT(Op0VT, OpVT)
   2469              : RTLIB::getFPTOUINT(Op0VT, OpVT);
   2470     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
   2471     SDValue Dummy;
   2472     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
   2473   }
   2474 
   2475   return Op;
   2476 }
   2477 
   2478 //! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
   2479 /*!
   2480  i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
   2481  All conversions from i64 are expanded to a libcall.
   2482  */
   2483 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
   2484                               const SPUTargetLowering &TLI) {
   2485   EVT OpVT = Op.getValueType();
   2486   SDValue Op0 = Op.getOperand(0);
   2487   EVT Op0VT = Op0.getValueType();
   2488 
   2489   if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
   2490       || Op0VT == MVT::i64) {
   2491     // Convert i32, i64 to f64 via libcall:
   2492     RTLIB::Libcall LC =
   2493             (Op.getOpcode() == ISD::SINT_TO_FP)
   2494              ? RTLIB::getSINTTOFP(Op0VT, OpVT)
   2495              : RTLIB::getUINTTOFP(Op0VT, OpVT);
   2496     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
   2497     SDValue Dummy;
   2498     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
   2499   }
   2500 
   2501   return Op;
   2502 }
   2503 
   2504 //! Lower ISD::SETCC
   2505 /*!
   2506  This handles MVT::f64 (double floating point) condition lowering
   2507  */
   2508 static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
   2509                           const TargetLowering &TLI) {
   2510   CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
   2511   DebugLoc dl = Op.getDebugLoc();
   2512   assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
   2513 
   2514   SDValue lhs = Op.getOperand(0);
   2515   SDValue rhs = Op.getOperand(1);
   2516   EVT lhsVT = lhs.getValueType();
   2517   assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
   2518 
   2519   EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
   2520   APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
   2521   EVT IntVT(MVT::i64);
   2522 
   2523   // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
   2524   // selected to a NOP:
   2525   SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
   2526   SDValue lhsHi32 =
   2527           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   2528                       DAG.getNode(ISD::SRL, dl, IntVT,
   2529                                   i64lhs, DAG.getConstant(32, MVT::i32)));
   2530   SDValue lhsHi32abs =
   2531           DAG.getNode(ISD::AND, dl, MVT::i32,
   2532                       lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
   2533   SDValue lhsLo32 =
   2534           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
   2535 
   2536   // SETO and SETUO only use the lhs operand:
   2537   if (CC->get() == ISD::SETO) {
   2538     // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
   2539     // SETUO
   2540     APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
   2541     return DAG.getNode(ISD::XOR, dl, ccResultVT,
   2542                        DAG.getSetCC(dl, ccResultVT,
   2543                                     lhs, DAG.getConstantFP(0.0, lhsVT),
   2544                                     ISD::SETUO),
   2545                        DAG.getConstant(ccResultAllOnes, ccResultVT));
   2546   } else if (CC->get() == ISD::SETUO) {
   2547     // Evaluates to true if Op0 is [SQ]NaN
   2548     return DAG.getNode(ISD::AND, dl, ccResultVT,
   2549                        DAG.getSetCC(dl, ccResultVT,
   2550                                     lhsHi32abs,
   2551                                     DAG.getConstant(0x7ff00000, MVT::i32),
   2552                                     ISD::SETGE),
   2553                        DAG.getSetCC(dl, ccResultVT,
   2554                                     lhsLo32,
   2555                                     DAG.getConstant(0, MVT::i32),
   2556                                     ISD::SETGT));
   2557   }
   2558 
   2559   SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
   2560   SDValue rhsHi32 =
   2561           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   2562                       DAG.getNode(ISD::SRL, dl, IntVT,
   2563                                   i64rhs, DAG.getConstant(32, MVT::i32)));
   2564 
   2565   // If a value is negative, subtract from the sign magnitude constant:
   2566   SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
   2567 
   2568   // Convert the sign-magnitude representation into 2's complement:
   2569   SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
   2570                                       lhsHi32, DAG.getConstant(31, MVT::i32));
   2571   SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
   2572   SDValue lhsSelect =
   2573           DAG.getNode(ISD::SELECT, dl, IntVT,
   2574                       lhsSelectMask, lhsSignMag2TC, i64lhs);
   2575 
   2576   SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
   2577                                       rhsHi32, DAG.getConstant(31, MVT::i32));
   2578   SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
   2579   SDValue rhsSelect =
   2580           DAG.getNode(ISD::SELECT, dl, IntVT,
   2581                       rhsSelectMask, rhsSignMag2TC, i64rhs);
   2582 
   2583   unsigned compareOp;
   2584 
   2585   switch (CC->get()) {
   2586   case ISD::SETOEQ:
   2587   case ISD::SETUEQ:
   2588     compareOp = ISD::SETEQ; break;
   2589   case ISD::SETOGT:
   2590   case ISD::SETUGT:
   2591     compareOp = ISD::SETGT; break;
   2592   case ISD::SETOGE:
   2593   case ISD::SETUGE:
   2594     compareOp = ISD::SETGE; break;
   2595   case ISD::SETOLT:
   2596   case ISD::SETULT:
   2597     compareOp = ISD::SETLT; break;
   2598   case ISD::SETOLE:
   2599   case ISD::SETULE:
   2600     compareOp = ISD::SETLE; break;
   2601   case ISD::SETUNE:
   2602   case ISD::SETONE:
   2603     compareOp = ISD::SETNE; break;
   2604   default:
   2605     report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
   2606   }
   2607 
   2608   SDValue result =
   2609           DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
   2610                        (ISD::CondCode) compareOp);
   2611 
   2612   if ((CC->get() & 0x8) == 0) {
   2613     // Ordered comparison:
   2614     SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
   2615                                   lhs, DAG.getConstantFP(0.0, MVT::f64),
   2616                                   ISD::SETO);
   2617     SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
   2618                                   rhs, DAG.getConstantFP(0.0, MVT::f64),
   2619                                   ISD::SETO);
   2620     SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
   2621 
   2622     result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
   2623   }
   2624 
   2625   return result;
   2626 }
   2627 
   2628 //! Lower ISD::SELECT_CC
   2629 /*!
   2630   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
   2631   SELB instruction.
   2632 
   2633   \note Need to revisit this in the future: if the code path through the true
   2634   and false value computations is longer than the latency of a branch (6
   2635   cycles), then it would be more advantageous to branch and insert a new basic
   2636   block and branch on the condition. However, this code does not make that
   2637   assumption, given the simplisitc uses so far.
   2638  */
   2639 
   2640 static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
   2641                               const TargetLowering &TLI) {
   2642   EVT VT = Op.getValueType();
   2643   SDValue lhs = Op.getOperand(0);
   2644   SDValue rhs = Op.getOperand(1);
   2645   SDValue trueval = Op.getOperand(2);
   2646   SDValue falseval = Op.getOperand(3);
   2647   SDValue condition = Op.getOperand(4);
   2648   DebugLoc dl = Op.getDebugLoc();
   2649 
   2650   // NOTE: SELB's arguments: $rA, $rB, $mask
   2651   //
   2652   // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
   2653   // where bits in $mask are 1. CCond will be inverted, having 1s where the
   2654   // condition was true and 0s where the condition was false. Hence, the
   2655   // arguments to SELB get reversed.
   2656 
   2657   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
   2658   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
   2659   // with another "cannot select select_cc" assert:
   2660 
   2661   SDValue compare = DAG.getNode(ISD::SETCC, dl,
   2662                                 TLI.getSetCCResultType(Op.getValueType()),
   2663                                 lhs, rhs, condition);
   2664   return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
   2665 }
   2666 
   2667 //! Custom lower ISD::TRUNCATE
   2668 static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
   2669 {
   2670   // Type to truncate to
   2671   EVT VT = Op.getValueType();
   2672   MVT simpleVT = VT.getSimpleVT();
   2673   EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
   2674                                VT, (128 / VT.getSizeInBits()));
   2675   DebugLoc dl = Op.getDebugLoc();
   2676 
   2677   // Type to truncate from
   2678   SDValue Op0 = Op.getOperand(0);
   2679   EVT Op0VT = Op0.getValueType();
   2680 
   2681   if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
   2682     // Create shuffle mask, least significant doubleword of quadword
   2683     unsigned maskHigh = 0x08090a0b;
   2684     unsigned maskLow = 0x0c0d0e0f;
   2685     // Use a shuffle to perform the truncation
   2686     SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2687                                    DAG.getConstant(maskHigh, MVT::i32),
   2688                                    DAG.getConstant(maskLow, MVT::i32),
   2689                                    DAG.getConstant(maskHigh, MVT::i32),
   2690                                    DAG.getConstant(maskLow, MVT::i32));
   2691 
   2692     SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
   2693                                        Op0, Op0, shufMask);
   2694 
   2695     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
   2696   }
   2697 
   2698   return SDValue();             // Leave the truncate unmolested
   2699 }
   2700 
/*!
 * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
 * algorithm is to duplicate the sign bit using rotmai to generate at
 * least one byte full of sign bits. Then propagate the "sign-byte" into
 * the leftmost words and the i64/i32 into the rightmost words using shufb.
 *
 * @param Op The sext operand
 * @param DAG The current DAG
 * @return The SDValue with the entire instruction sequence
 */
static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
{
  DebugLoc dl = Op.getDebugLoc();

  // Type to extend to
  MVT OpVT = Op.getValueType().getSimpleVT();

  // Type to extend from
  SDValue Op0 = Op.getOperand(0);
  MVT Op0VT = Op0.getValueType().getSimpleVT();

  // extend i8 & i16 via i32, so the main path below only sees i32/i64
  if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
    Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
    Op0VT = MVT::i32;
  }

  // The type to extend to needs to be a i128 and
  // the type to extend from needs to be i64 or i32.
  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
          "LowerSIGN_EXTEND: input and/or output operand have wrong size");
  (void)OpVT;

  // Create shuffle mask. Mask bytes of the form 0x1x select from the second
  // SHUFB operand (the sign-bit vector); 0x0x select from the first (the
  // original value) — presumably per SHUFB control-word semantics, matching
  // the propagation described in the header comment above.
  unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask2, MVT::i32),
                                 DAG.getConstant(mask3, MVT::i32));

  // Word wise arithmetic right shift to generate at least one byte
  // that contains sign bits.
  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
  SDValue sraVal = DAG.getNode(ISD::SRA,
                 dl,
                 mvt,
                 DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
                 DAG.getConstant(31, MVT::i32));

  // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                        dl, Op0VT, Op0,
                                        DAG.getTargetConstant(
                                                  SPU::GPRCRegClass.getID(),
                                                  MVT::i32)), 0);
  // Shuffle bytes - Copy the sign bits into the upper 64 bits
  // and the input value into the lower 64 bits.
  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
        extended, sraVal, shufMask);
  return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
}
   2765 
   2766 //! Custom (target-specific) lowering entry point
   2767 /*!
   2768   This is where LLVM's DAG selection process calls to do target-specific
   2769   lowering of nodes.
   2770  */
   2771 SDValue
   2772 SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
   2773 {
   2774   unsigned Opc = (unsigned) Op.getOpcode();
   2775   EVT VT = Op.getValueType();
   2776 
   2777   switch (Opc) {
   2778   default: {
   2779 #ifndef NDEBUG
   2780     errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
   2781     errs() << "Op.getOpcode() = " << Opc << "\n";
   2782     errs() << "*Op.getNode():\n";
   2783     Op.getNode()->dump();
   2784 #endif
   2785     llvm_unreachable(0);
   2786   }
   2787   case ISD::LOAD:
   2788   case ISD::EXTLOAD:
   2789   case ISD::SEXTLOAD:
   2790   case ISD::ZEXTLOAD:
   2791     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
   2792   case ISD::STORE:
   2793     return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
   2794   case ISD::ConstantPool:
   2795     return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
   2796   case ISD::GlobalAddress:
   2797     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
   2798   case ISD::JumpTable:
   2799     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
   2800   case ISD::ConstantFP:
   2801     return LowerConstantFP(Op, DAG);
   2802 
   2803   // i8, i64 math ops:
   2804   case ISD::ADD:
   2805   case ISD::SUB:
   2806   case ISD::ROTR:
   2807   case ISD::ROTL:
   2808   case ISD::SRL:
   2809   case ISD::SHL:
   2810   case ISD::SRA: {
   2811     if (VT == MVT::i8)
   2812       return LowerI8Math(Op, DAG, Opc, *this);
   2813     break;
   2814   }
   2815 
   2816   case ISD::FP_TO_SINT:
   2817   case ISD::FP_TO_UINT:
   2818     return LowerFP_TO_INT(Op, DAG, *this);
   2819 
   2820   case ISD::SINT_TO_FP:
   2821   case ISD::UINT_TO_FP:
   2822     return LowerINT_TO_FP(Op, DAG, *this);
   2823 
   2824   // Vector-related lowering.
   2825   case ISD::BUILD_VECTOR:
   2826     return LowerBUILD_VECTOR(Op, DAG);
   2827   case ISD::SCALAR_TO_VECTOR:
   2828     return LowerSCALAR_TO_VECTOR(Op, DAG);
   2829   case ISD::VECTOR_SHUFFLE:
   2830     return LowerVECTOR_SHUFFLE(Op, DAG);
   2831   case ISD::EXTRACT_VECTOR_ELT:
   2832     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   2833   case ISD::INSERT_VECTOR_ELT:
   2834     return LowerINSERT_VECTOR_ELT(Op, DAG);
   2835 
   2836   // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
   2837   case ISD::AND:
   2838   case ISD::OR:
   2839   case ISD::XOR:
   2840     return LowerByteImmed(Op, DAG);
   2841 
   2842   // Vector and i8 multiply:
   2843   case ISD::MUL:
   2844     if (VT == MVT::i8)
   2845       return LowerI8Math(Op, DAG, Opc, *this);
   2846 
   2847   case ISD::CTPOP:
   2848     return LowerCTPOP(Op, DAG);
   2849 
   2850   case ISD::SELECT_CC:
   2851     return LowerSELECT_CC(Op, DAG, *this);
   2852 
   2853   case ISD::SETCC:
   2854     return LowerSETCC(Op, DAG, *this);
   2855 
   2856   case ISD::TRUNCATE:
   2857     return LowerTRUNCATE(Op, DAG);
   2858 
   2859   case ISD::SIGN_EXTEND:
   2860     return LowerSIGN_EXTEND(Op, DAG);
   2861   }
   2862 
   2863   return SDValue();
   2864 }
   2865 
   2866 void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
   2867                                            SmallVectorImpl<SDValue>&Results,
   2868                                            SelectionDAG &DAG) const
   2869 {
   2870 #if 0
   2871   unsigned Opc = (unsigned) N->getOpcode();
   2872   EVT OpVT = N->getValueType(0);
   2873 
   2874   switch (Opc) {
   2875   default: {
   2876     errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
   2877     errs() << "Op.getOpcode() = " << Opc << "\n";
   2878     errs() << "*Op.getNode():\n";
   2879     N->dump();
   2880     abort();
   2881     /*NOTREACHED*/
   2882   }
   2883   }
   2884 #endif
   2885 
   2886   /* Otherwise, return unchanged */
   2887 }
   2888 
   2889 //===----------------------------------------------------------------------===//
   2890 // Target Optimization Hooks
   2891 //===----------------------------------------------------------------------===//
   2892 
   2893 SDValue
   2894 SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
   2895 {
   2896 #if 0
   2897   TargetMachine &TM = getTargetMachine();
   2898 #endif
   2899   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
   2900   SelectionDAG &DAG = DCI.DAG;
   2901   SDValue Op0 = N->getOperand(0);       // everything has at least one operand
   2902   EVT NodeVT = N->getValueType(0);      // The node's value type
   2903   EVT Op0VT = Op0.getValueType();       // The first operand's result
   2904   SDValue Result;                       // Initially, empty result
   2905   DebugLoc dl = N->getDebugLoc();
   2906 
   2907   switch (N->getOpcode()) {
   2908   default: break;
   2909   case ISD::ADD: {
   2910     SDValue Op1 = N->getOperand(1);
   2911 
   2912     if (Op0.getOpcode() == SPUISD::IndirectAddr
   2913         || Op1.getOpcode() == SPUISD::IndirectAddr) {
   2914       // Normalize the operands to reduce repeated code
   2915       SDValue IndirectArg = Op0, AddArg = Op1;
   2916 
   2917       if (Op1.getOpcode() == SPUISD::IndirectAddr) {
   2918         IndirectArg = Op1;
   2919         AddArg = Op0;
   2920       }
   2921 
   2922       if (isa<ConstantSDNode>(AddArg)) {
   2923         ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
   2924         SDValue IndOp1 = IndirectArg.getOperand(1);
   2925 
   2926         if (CN0->isNullValue()) {
   2927           // (add (SPUindirect <arg>, <arg>), 0) ->
   2928           // (SPUindirect <arg>, <arg>)
   2929 
   2930 #if !defined(NDEBUG)
   2931           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
   2932             errs() << "\n"
   2933                  << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
   2934                  << "With:    (SPUindirect <arg>, <arg>)\n";
   2935           }
   2936 #endif
   2937 
   2938           return IndirectArg;
   2939         } else if (isa<ConstantSDNode>(IndOp1)) {
   2940           // (add (SPUindirect <arg>, <const>), <const>) ->
   2941           // (SPUindirect <arg>, <const + const>)
   2942           ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
   2943           int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
   2944           SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
   2945 
   2946 #if !defined(NDEBUG)
   2947           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
   2948             errs() << "\n"
   2949                  << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
   2950                  << "), " << CN0->getSExtValue() << ")\n"
   2951                  << "With:    (SPUindirect <arg>, "
   2952                  << combinedConst << ")\n";
   2953           }
   2954 #endif
   2955 
   2956           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
   2957                              IndirectArg, combinedValue);
   2958         }
   2959       }
   2960     }
   2961     break;
   2962   }
   2963   case ISD::SIGN_EXTEND:
   2964   case ISD::ZERO_EXTEND:
   2965   case ISD::ANY_EXTEND: {
   2966     if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
   2967       // (any_extend (SPUextract_elt0 <arg>)) ->
   2968       // (SPUextract_elt0 <arg>)
   2969       // Types must match, however...
   2970 #if !defined(NDEBUG)
   2971       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
   2972         errs() << "\nReplace: ";
   2973         N->dump(&DAG);
   2974         errs() << "\nWith:    ";
   2975         Op0.getNode()->dump(&DAG);
   2976         errs() << "\n";
   2977       }
   2978 #endif
   2979 
   2980       return Op0;
   2981     }
   2982     break;
   2983   }
   2984   case SPUISD::IndirectAddr: {
   2985     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
   2986       ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
   2987       if (CN != 0 && CN->isNullValue()) {
   2988         // (SPUindirect (SPUaform <addr>, 0), 0) ->
   2989         // (SPUaform <addr>, 0)
   2990 
   2991         DEBUG(errs() << "Replace: ");
   2992         DEBUG(N->dump(&DAG));
   2993         DEBUG(errs() << "\nWith:    ");
   2994         DEBUG(Op0.getNode()->dump(&DAG));
   2995         DEBUG(errs() << "\n");
   2996 
   2997         return Op0;
   2998       }
   2999     } else if (Op0.getOpcode() == ISD::ADD) {
   3000       SDValue Op1 = N->getOperand(1);
   3001       if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
   3002         // (SPUindirect (add <arg>, <arg>), 0) ->
   3003         // (SPUindirect <arg>, <arg>)
   3004         if (CN1->isNullValue()) {
   3005 
   3006 #if !defined(NDEBUG)
   3007           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
   3008             errs() << "\n"
   3009                  << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
   3010                  << "With:    (SPUindirect <arg>, <arg>)\n";
   3011           }
   3012 #endif
   3013 
   3014           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
   3015                              Op0.getOperand(0), Op0.getOperand(1));
   3016         }
   3017       }
   3018     }
   3019     break;
   3020   }
   3021   case SPUISD::SHL_BITS:
   3022   case SPUISD::SHL_BYTES:
   3023   case SPUISD::ROTBYTES_LEFT: {
   3024     SDValue Op1 = N->getOperand(1);
   3025 
   3026     // Kill degenerate vector shifts:
   3027     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
   3028       if (CN->isNullValue()) {
   3029         Result = Op0;
   3030       }
   3031     }
   3032     break;
   3033   }
   3034   case SPUISD::PREFSLOT2VEC: {
   3035     switch (Op0.getOpcode()) {
   3036     default:
   3037       break;
   3038     case ISD::ANY_EXTEND:
   3039     case ISD::ZERO_EXTEND:
   3040     case ISD::SIGN_EXTEND: {
   3041       // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
   3042       // <arg>
   3043       // but only if the SPUprefslot2vec and <arg> types match.
   3044       SDValue Op00 = Op0.getOperand(0);
   3045       if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
   3046         SDValue Op000 = Op00.getOperand(0);
   3047         if (Op000.getValueType() == NodeVT) {
   3048           Result = Op000;
   3049         }
   3050       }
   3051       break;
   3052     }
   3053     case SPUISD::VEC2PREFSLOT: {
   3054       // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
   3055       // <arg>
   3056       Result = Op0.getOperand(0);
   3057       break;
   3058     }
   3059     }
   3060     break;
   3061   }
   3062   }
   3063 
   3064   // Otherwise, return unchanged.
   3065 #ifndef NDEBUG
   3066   if (Result.getNode()) {
   3067     DEBUG(errs() << "\nReplace.SPU: ");
   3068     DEBUG(N->dump(&DAG));
   3069     DEBUG(errs() << "\nWith:        ");
   3070     DEBUG(Result.getNode()->dump(&DAG));
   3071     DEBUG(errs() << "\n");
   3072   }
   3073 #endif
   3074 
   3075   return Result;
   3076 }
   3077 
   3078 //===----------------------------------------------------------------------===//
   3079 // Inline Assembly Support
   3080 //===----------------------------------------------------------------------===//
   3081 
   3082 /// getConstraintType - Given a constraint letter, return the type of
   3083 /// constraint it is for this target.
   3084 SPUTargetLowering::ConstraintType
   3085 SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
   3086   if (ConstraintLetter.size() == 1) {
   3087     switch (ConstraintLetter[0]) {
   3088     default: break;
   3089     case 'b':
   3090     case 'r':
   3091     case 'f':
   3092     case 'v':
   3093     case 'y':
   3094       return C_RegisterClass;
   3095     }
   3096   }
   3097   return TargetLowering::getConstraintType(ConstraintLetter);
   3098 }
   3099 
   3100 /// Examine constraint type and operand type and determine a weight value.
   3101 /// This object must already have been set up with the operand type
   3102 /// and the current alternative constraint selected.
   3103 TargetLowering::ConstraintWeight
   3104 SPUTargetLowering::getSingleConstraintMatchWeight(
   3105     AsmOperandInfo &info, const char *constraint) const {
   3106   ConstraintWeight weight = CW_Invalid;
   3107   Value *CallOperandVal = info.CallOperandVal;
   3108     // If we don't have a value, we can't do a match,
   3109     // but allow it at the lowest weight.
   3110   if (CallOperandVal == NULL)
   3111     return CW_Default;
   3112   // Look at the constraint type.
   3113   switch (*constraint) {
   3114   default:
   3115     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   3116     break;
   3117     //FIXME: Seems like the supported constraint letters were just copied
   3118     // from PPC, as the following doesn't correspond to the GCC docs.
   3119     // I'm leaving it so until someone adds the corresponding lowering support.
   3120   case 'b':
   3121   case 'r':
   3122   case 'f':
   3123   case 'd':
   3124   case 'v':
   3125   case 'y':
   3126     weight = CW_Register;
   3127     break;
   3128   }
   3129   return weight;
   3130 }
   3131 
   3132 std::pair<unsigned, const TargetRegisterClass*>
   3133 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   3134                                                 EVT VT) const
   3135 {
   3136   if (Constraint.size() == 1) {
   3137     // GCC RS6000 Constraint Letters
   3138     switch (Constraint[0]) {
   3139     case 'b':   // R1-R31
   3140     case 'r':   // R0-R31
   3141       if (VT == MVT::i64)
   3142         return std::make_pair(0U, SPU::R64CRegisterClass);
   3143       return std::make_pair(0U, SPU::R32CRegisterClass);
   3144     case 'f':
   3145       if (VT == MVT::f32)
   3146         return std::make_pair(0U, SPU::R32FPRegisterClass);
   3147       else if (VT == MVT::f64)
   3148         return std::make_pair(0U, SPU::R64FPRegisterClass);
   3149       break;
   3150     case 'v':
   3151       return std::make_pair(0U, SPU::GPRCRegisterClass);
   3152     }
   3153   }
   3154 
   3155   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
   3156 }
   3157 
   3158 //! Compute used/known bits for a SPU operand
   3159 void
   3160 SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   3161                                                   APInt &KnownZero,
   3162                                                   APInt &KnownOne,
   3163                                                   const SelectionDAG &DAG,
   3164                                                   unsigned Depth ) const {
   3165 #if 0
   3166   const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
   3167 
   3168   switch (Op.getOpcode()) {
   3169   default:
   3170     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
   3171     break;
   3172   case CALL:
   3173   case SHUFB:
   3174   case SHUFFLE_MASK:
   3175   case CNTB:
   3176   case SPUISD::PREFSLOT2VEC:
   3177   case SPUISD::LDRESULT:
   3178   case SPUISD::VEC2PREFSLOT:
   3179   case SPUISD::SHLQUAD_L_BITS:
   3180   case SPUISD::SHLQUAD_L_BYTES:
   3181   case SPUISD::VEC_ROTL:
   3182   case SPUISD::VEC_ROTR:
   3183   case SPUISD::ROTBYTES_LEFT:
   3184   case SPUISD::SELECT_MASK:
   3185   case SPUISD::SELB:
   3186   }
   3187 #endif
   3188 }
   3189 
   3190 unsigned
   3191 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
   3192                                                    unsigned Depth) const {
   3193   switch (Op.getOpcode()) {
   3194   default:
   3195     return 1;
   3196 
   3197   case ISD::SETCC: {
   3198     EVT VT = Op.getValueType();
   3199 
   3200     if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
   3201       VT = MVT::i32;
   3202     }
   3203     return VT.getSizeInBits();
   3204   }
   3205   }
   3206 }
   3207 
   3208 // LowerAsmOperandForConstraint
// Lower an inline-asm operand for a given constraint letter.  SPU defines
// no target-specific immediate constraints of its own, so this simply
// forwards to the generic TargetLowering implementation.
void
SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                std::string &Constraint,
                                                std::vector<SDValue> &Ops,
                                                SelectionDAG &DAG) const {
  // Default, for the time being, to the base class handler
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
   3217 
   3218 /// isLegalAddressImmediate - Return true if the integer value can be used
   3219 /// as the offset of the target addressing mode.
   3220 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
   3221                                                 Type *Ty) const {
   3222   // SPU's addresses are 256K:
   3223   return (V > -(1 << 18) && V < (1 << 18) - 1);
   3224 }
   3225 
// A global value is never usable directly as an addressing-mode
// immediate on SPU.
bool SPUTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const {
  return false;
}
   3229 
// Return false to prevent folding a constant offset into a global
// address reference.
bool
SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The SPU target isn't yet aware of offsets.
  return false;
}
   3235 
// Return true if Imm can appear directly as the immediate operand of an
// integer compare, i.e. without first being materialized in a register.
bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  //ceqi, cgti, etc. all take s10 operand -- a signed 10-bit immediate.
  return isInt<10>(Imm);
}
   3241 
   3242 bool
   3243 SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
   3244                                          Type * ) const{
   3245 
   3246   // A-form: 18bit absolute address.
   3247   if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
   3248     return true;
   3249 
   3250   // D-form: reg + 14bit offset
   3251   if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
   3252     return true;
   3253 
   3254   // X-form: reg+reg
   3255   if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
   3256     return true;
   3257 
   3258   return false;
   3259 }
   3260