// Archived web-listing header (non-code): Home | History | Annotate | Download | only in CellSPU
      1 //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
      2 //                     The LLVM Compiler Infrastructure
      3 //
      4 // This file is distributed under the University of Illinois Open Source
      5 // License. See LICENSE.TXT for details.
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the SPUTargetLowering class.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "SPUISelLowering.h"
     14 #include "SPUTargetMachine.h"
     15 #include "SPUFrameLowering.h"
     16 #include "SPUMachineFunction.h"
     17 #include "llvm/Constants.h"
     18 #include "llvm/Function.h"
     19 #include "llvm/Intrinsics.h"
     20 #include "llvm/CallingConv.h"
     21 #include "llvm/Type.h"
     22 #include "llvm/CodeGen/CallingConvLower.h"
     23 #include "llvm/CodeGen/MachineFrameInfo.h"
     24 #include "llvm/CodeGen/MachineFunction.h"
     25 #include "llvm/CodeGen/MachineInstrBuilder.h"
     26 #include "llvm/CodeGen/MachineRegisterInfo.h"
     27 #include "llvm/CodeGen/SelectionDAG.h"
     28 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
     29 #include "llvm/Target/TargetOptions.h"
     30 #include "llvm/Support/Debug.h"
     31 #include "llvm/Support/ErrorHandling.h"
     32 #include "llvm/Support/MathExtras.h"
     33 #include "llvm/Support/raw_ostream.h"
     34 
     35 using namespace llvm;
     36 
     37 namespace {
     38   // Byte offset of the preferred slot (counted from the MSB)
     39   int prefslotOffset(EVT VT) {
     40     int retval=0;
     41     if (VT==MVT::i1) retval=3;
     42     if (VT==MVT::i8) retval=3;
     43     if (VT==MVT::i16) retval=2;
     44 
     45     return retval;
     46   }
     47 
     48   //! Expand a library call into an actual call DAG node
     49   /*!
     50    \note
     51    This code is taken from SelectionDAGLegalize, since it is not exposed as
     52    part of the LLVM SelectionDAG API.
     53    */
     54 
     55   SDValue
     56   ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
     57                 bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
     58     // The input chain to this libcall is the entry node of the function.
     59     // Legalizing the call will automatically add the previous call to the
     60     // dependence.
     61     SDValue InChain = DAG.getEntryNode();
     62 
     63     TargetLowering::ArgListTy Args;
     64     TargetLowering::ArgListEntry Entry;
     65     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
     66       EVT ArgVT = Op.getOperand(i).getValueType();
     67       Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     68       Entry.Node = Op.getOperand(i);
     69       Entry.Ty = ArgTy;
     70       Entry.isSExt = isSigned;
     71       Entry.isZExt = !isSigned;
     72       Args.push_back(Entry);
     73     }
     74     SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
     75                                            TLI.getPointerTy());
     76 
     77     // Splice the libcall in wherever FindInputOutputChains tells us to.
     78     Type *RetTy =
     79                 Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
     80     TargetLowering::CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned,
     81                                          false, false,
     82                             0, TLI.getLibcallCallingConv(LC),
     83                             /*isTailCall=*/false,
     84                                          /*doesNotRet=*/false,
     85                                          /*isReturnValueUsed=*/true,
     86                             Callee, Args, DAG, Op.getDebugLoc());
     87     std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
     88 
     89     return CallInfo.first;
     90   }
     91 }
     92 
/// Construct the Cell SPU target lowering: registers the scalar/vector
/// register classes, declares per-type legality/lowering actions for each
/// DAG operation, sets SPU-specific libcall names, and opts in to the
/// target DAG combines handled by this file.
///
/// NOTE(review): setOperationAction keeps only the LAST action registered
/// for an (opcode, type) pair, and several pairs below are set twice
/// (flagged inline) — the later call is the effective one.
SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  : TargetLowering(TM, new TargetLoweringObjectFileELF()),
    SPUTM(TM) {

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // Set RTLIB libcall names as used by SPU:
  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");

  // Set up the SPU's register classes:
  addRegisterClass(MVT::i8,   &SPU::R8CRegClass);
  addRegisterClass(MVT::i16,  &SPU::R16CRegClass);
  addRegisterClass(MVT::i32,  &SPU::R32CRegClass);
  addRegisterClass(MVT::i64,  &SPU::R64CRegClass);
  addRegisterClass(MVT::f32,  &SPU::R32FPRegClass);
  addRegisterClass(MVT::f64,  &SPU::R64FPRegClass);
  addRegisterClass(MVT::i128, &SPU::GPRCRegClass);

  // SPU has no sign or zero extended loads for i1, i8, i16:
  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);

  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);

  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
  setTruncStoreAction(MVT::i128, MVT::i32, Expand);
  setTruncStoreAction(MVT::i128, MVT::i16, Expand);
  setTruncStoreAction(MVT::i128, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SPU constant load actions are custom lowered:
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  // SPU's loads and stores have to be custom lowered:
  // NOTE(review): the loop bound is exclusive, so MVT::i128 itself keeps the
  // default LOAD/STORE actions — confirm this is intended.
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);
    setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, Custom);

    // Expand truncating stores from VT to every narrower integer type.
    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // NOTE(review): as above, the exclusive bound means only f32 is visited;
  // f64 LOAD/STORE are not marked Custom by this loop.
  for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);

    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // Expand the jumptable branches
  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);

  // Custom lower SELECT_CC for most cases, but expand by default
  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);

  // SPU has no intrinsics for these particular operations:
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);

  // SPU has no division/remainder instructions
  setOperationAction(ISD::SREM,    MVT::i8,   Expand);
  setOperationAction(ISD::UREM,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::SREM,    MVT::i16,  Expand);
  setOperationAction(ISD::UREM,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::SREM,    MVT::i32,  Expand);
  setOperationAction(ISD::UREM,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::SREM,    MVT::i64,  Expand);
  setOperationAction(ISD::UREM,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::SREM,    MVT::i128, Expand);
  setOperationAction(ISD::UREM,    MVT::i128, Expand);
  setOperationAction(ISD::SDIV,    MVT::i128, Expand);
  setOperationAction(ISD::UDIV,    MVT::i128, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i128, Expand);

  // We don't support sin/cos/sqrt/fmod
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);

  // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
  // for f32!)
  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

  // SPU can do rotate right and left, so legalize it... but customize for i8
  // because instructions don't exist.

  // FIXME: Change from "expand" to appropriate type once ROTR is supported in
  //        .td files.
  setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);

  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
  setOperationAction(ISD::ROTL, MVT::i8,     Custom);

  // SPU has no native version of shift left/right for i8
  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRA,  MVT::i8,     Custom);

  // Make these operations legal and handle them during instruction selection:
  setOperationAction(ISD::SHL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRA,  MVT::i64,    Legal);

  // Custom lower i8 multiplications; i32 and i64 are selected directly.
  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
  setOperationAction(ISD::MUL,  MVT::i32,    Legal);
  setOperationAction(ISD::MUL,  MVT::i64,    Legal);

  // Expand double-width multiplication
  // FIXME: It would probably be reasonable to support some of these operations
  setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
  setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::MULHU,     MVT::i16, Expand);
  setOperationAction(ISD::MULHS,     MVT::i16, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::MULHU,     MVT::i64, Expand);
  setOperationAction(ISD::MULHS,     MVT::i64, Expand);

  // Need to custom handle (some) common i8, i64 math ops
  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
  setOperationAction(ISD::ADD,  MVT::i64,    Legal);
  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
  setOperationAction(ISD::SUB,  MVT::i64,    Legal);

  // SPU does not have BSWAP. It does have i32 support CTLZ.
  // CTPOP has to be custom lowered.
  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);

  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i128,  Expand);

  setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
  setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8,    Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i128,  Expand);

  setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
  setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
  setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8,    Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i128,  Expand);

  // SPU has a version of select that implements (a&~c)|(b&c), just like
  // select ought to work:
  setOperationAction(ISD::SELECT, MVT::i8,   Legal);
  setOperationAction(ISD::SELECT, MVT::i16,  Legal);
  setOperationAction(ISD::SELECT, MVT::i32,  Legal);
  setOperationAction(ISD::SELECT, MVT::i64,  Legal);

  setOperationAction(ISD::SETCC, MVT::i8,    Legal);
  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
  setOperationAction(ISD::SETCC, MVT::i64,   Legal);
  setOperationAction(ISD::SETCC, MVT::f64,   Custom);

  // Custom lower i128 -> i64 truncates
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

  // Custom lower i32/i64 -> i128 sign extend
  setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);

  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
  // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
  // to expand to a libcall, hence the custom lowering:
  // NOTE(review): FP_TO_UINT/i32 is overridden to Promote further down;
  // FP_TO_SINT/FP_TO_UINT for i64 are overridden to Custom/unchanged below.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);

  // FDIV on SPU requires custom lowering
  setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall

  // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  setOperationAction(ISD::BITCAST, MVT::i32, Legal);
  setOperationAction(ISD::BITCAST, MVT::f32, Legal);
  setOperationAction(ISD::BITCAST, MVT::i64, Legal);
  setOperationAction(ISD::BITCAST, MVT::f64, Legal);

  // We cannot sextinreg(i1).  Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::GlobalAddress,  VT, Custom);
    setOperationAction(ISD::ConstantPool,   VT, Custom);
    setOperationAction(ISD::JumpTable,      VT, Custom);
  }

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);

  // Use the default implementation.
  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);

  // Cell SPU has instructions for converting between i64 and fp.
  // NOTE(review): these override the Expand actions set for i64 above; the
  // last registered action wins.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
  // NOTE(review): overrides the earlier Custom action for FP_TO_UINT/i32.
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);

  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // First set operation action for all vector types to expand. Then we
  // will selectively turn on ones that can be effectively codegen'd.
  addRegisterClass(MVT::v16i8, &SPU::VECREGRegClass);
  addRegisterClass(MVT::v8i16, &SPU::VECREGRegClass);
  addRegisterClass(MVT::v4i32, &SPU::VECREGRegClass);
  addRegisterClass(MVT::v2i64, &SPU::VECREGRegClass);
  addRegisterClass(MVT::v4f32, &SPU::VECREGRegClass);
  addRegisterClass(MVT::v2f64, &SPU::VECREGRegClass);

  for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)i;

    // Set operation actions to legal types only.
    if (!isTypeLegal(VT)) continue;

    // add/sub are legal for all supported vector VT's.
    setOperationAction(ISD::ADD,     VT, Legal);
    setOperationAction(ISD::SUB,     VT, Legal);
    // mul is registered Legal here (an older note claimed it must be custom
    // lowered; only scalar i8 MUL is Custom, see above).
    setOperationAction(ISD::MUL,     VT, Legal);

    setOperationAction(ISD::AND,     VT, Legal);
    setOperationAction(ISD::OR,      VT, Legal);
    setOperationAction(ISD::XOR,     VT, Legal);
    setOperationAction(ISD::LOAD,    VT, Custom);
    setOperationAction(ISD::SELECT,  VT, Legal);
    setOperationAction(ISD::STORE,   VT, Custom);

    // These operations need to be expanded:
    setOperationAction(ISD::SDIV,    VT, Expand);
    setOperationAction(ISD::SREM,    VT, Expand);
    setOperationAction(ISD::UDIV,    VT, Expand);
    setOperationAction(ISD::UREM,    VT, Expand);

    // Expand all trunc stores
    for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
      MVT::SimpleValueType TargetVT = (MVT::SimpleValueType)j;
      setTruncStoreAction(VT, TargetVT, Expand);
    }

    // Custom lower build_vector, constant pool spills, insert and
    // extract vector elements:
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }

  setOperationAction(ISD::SHL, MVT::v2i64, Expand);

  setOperationAction(ISD::AND, MVT::v16i8, Custom);
  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);

  setOperationAction(ISD::FDIV, MVT::v4f32, Legal);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct?

  setStackPointerRegisterToSaveRestore(SPU::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  // Functions aligned to 2^3 = 8 bytes.
  setMinFunctionAlignment(3);

  computeRegisterProperties();

  // Set pre-RA register scheduler default to BURR, which produces slightly
  // better code than the default (could also be TDRR, but TargetLowering.h
  // needs a mod to support that model):
  setSchedulingPreference(Sched::RegPressure);
}
    481 
    482 const char *SPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    483   switch (Opcode) {
    484   default: return 0;
    485   case SPUISD::RET_FLAG: return "SPUISD::RET_FLAG";
    486   case SPUISD::Hi: return "SPUISD::Hi";
    487   case SPUISD::Lo: return "SPUISD::Lo";
    488   case SPUISD::PCRelAddr: return "SPUISD::PCRelAddr";
    489   case SPUISD::AFormAddr: return "SPUISD::AFormAddr";
    490   case SPUISD::IndirectAddr: return "SPUISD::IndirectAddr";
    491   case SPUISD::LDRESULT: return "SPUISD::LDRESULT";
    492   case SPUISD::CALL: return "SPUISD::CALL";
    493   case SPUISD::SHUFB: return "SPUISD::SHUFB";
    494   case SPUISD::SHUFFLE_MASK: return "SPUISD::SHUFFLE_MASK";
    495   case SPUISD::CNTB: return "SPUISD::CNTB";
    496   case SPUISD::PREFSLOT2VEC: return "SPUISD::PREFSLOT2VEC";
    497   case SPUISD::VEC2PREFSLOT: return "SPUISD::VEC2PREFSLOT";
    498   case SPUISD::SHL_BITS: return "SPUISD::SHL_BITS";
    499   case SPUISD::SHL_BYTES: return "SPUISD::SHL_BYTES";
    500   case SPUISD::VEC_ROTL: return "SPUISD::VEC_ROTL";
    501   case SPUISD::VEC_ROTR: return "SPUISD::VEC_ROTR";
    502   case SPUISD::ROTBYTES_LEFT: return "SPUISD::ROTBYTES_LEFT";
    503   case SPUISD::ROTBYTES_LEFT_BITS: return "SPUISD::ROTBYTES_LEFT_BITS";
    504   case SPUISD::SELECT_MASK: return "SPUISD::SELECT_MASK";
    505   case SPUISD::SELB: return "SPUISD::SELB";
    506   case SPUISD::ADD64_MARKER: return "SPUISD::ADD64_MARKER";
    507   case SPUISD::SUB64_MARKER: return "SPUISD::SUB64_MARKER";
    508   case SPUISD::MUL64_MARKER: return "SPUISD::MUL64_MARKER";
    509   }
    510 }
    511 
    512 //===----------------------------------------------------------------------===//
    513 // Return the Cell SPU's SETCC result type
    514 //===----------------------------------------------------------------------===//
    515 
    516 EVT SPUTargetLowering::getSetCCResultType(EVT VT) const {
    517   // i8, i16 and i32 are valid SETCC result types
    518   MVT::SimpleValueType retval;
    519 
    520   switch(VT.getSimpleVT().SimpleTy){
    521     case MVT::i1:
    522     case MVT::i8:
    523       retval = MVT::i8; break;
    524     case MVT::i16:
    525       retval = MVT::i16; break;
    526     case MVT::i32:
    527     default:
    528       retval = MVT::i32;
    529   }
    530   return retval;
    531 }
    532 
    533 //===----------------------------------------------------------------------===//
    534 // Calling convention code:
    535 //===----------------------------------------------------------------------===//
    536 
    537 #include "SPUGenCallingConv.inc"
    538 
    539 //===----------------------------------------------------------------------===//
    540 //  LowerOperation implementation
    541 //===----------------------------------------------------------------------===//
    542 
/// Custom lower loads for CellSPU
/*!
 All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to rotate to extract the requested element.

 For extending loads, we also want to ensure that the following sequence is
 emitted, e.g. for MVT::f32 extending load to MVT::f64:

\verbatim
%1  v16i8,ch = load
%2  v16i8,ch = rotate %1
%3  v4f32,ch = bitconvert %2
%4  f32      = vec2prefslot %3
%5  f64      = fp_extend %4
\endverbatim
*/
    559 static SDValue
    560 LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    561   LoadSDNode *LN = cast<LoadSDNode>(Op);
    562   SDValue the_chain = LN->getChain();
    563   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    564   EVT InVT = LN->getMemoryVT();
    565   EVT OutVT = Op.getValueType();
    566   ISD::LoadExtType ExtType = LN->getExtensionType();
    567   unsigned alignment = LN->getAlignment();
    568   int pso = prefslotOffset(InVT);
    569   DebugLoc dl = Op.getDebugLoc();
    570   EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
    571                                                   (128 / InVT.getSizeInBits()));
    572 
    573   // two sanity checks
    574   assert( LN->getAddressingMode() == ISD::UNINDEXED
    575           && "we should get only UNINDEXED adresses");
    576   // clean aligned loads can be selected as-is
    577   if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
    578     return SDValue();
    579 
    580   // Get pointerinfos to the memory chunk(s) that contain the data to load
    581   uint64_t mpi_offset = LN->getPointerInfo().Offset;
    582   mpi_offset -= mpi_offset%16;
    583   MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
    584   MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
    585 
    586   SDValue result;
    587   SDValue basePtr = LN->getBasePtr();
    588   SDValue rotate;
    589 
    590   if ((alignment%16) == 0) {
    591     ConstantSDNode *CN;
    592 
    593     // Special cases for a known aligned load to simplify the base pointer
    594     // and the rotation amount:
    595     if (basePtr.getOpcode() == ISD::ADD
    596         && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
    597       // Known offset into basePtr
    598       int64_t offset = CN->getSExtValue();
    599       int64_t rotamt = int64_t((offset & 0xf) - pso);
    600 
    601       if (rotamt < 0)
    602         rotamt += 16;
    603 
    604       rotate = DAG.getConstant(rotamt, MVT::i16);
    605 
    606       // Simplify the base pointer for this case:
    607       basePtr = basePtr.getOperand(0);
    608       if ((offset & ~0xf) > 0) {
    609         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    610                               basePtr,
    611                               DAG.getConstant((offset & ~0xf), PtrVT));
    612       }
    613     } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
    614                || (basePtr.getOpcode() == SPUISD::IndirectAddr
    615                    && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
    616                    && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
    617       // Plain aligned a-form address: rotate into preferred slot
    618       // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
    619       int64_t rotamt = -pso;
    620       if (rotamt < 0)
    621         rotamt += 16;
    622       rotate = DAG.getConstant(rotamt, MVT::i16);
    623     } else {
    624       // Offset the rotate amount by the basePtr and the preferred slot
    625       // byte offset
    626       int64_t rotamt = -pso;
    627       if (rotamt < 0)
    628         rotamt += 16;
    629       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
    630                            basePtr,
    631                            DAG.getConstant(rotamt, PtrVT));
    632     }
    633   } else {
    634     // Unaligned load: must be more pessimistic about addressing modes:
    635     if (basePtr.getOpcode() == ISD::ADD) {
    636       MachineFunction &MF = DAG.getMachineFunction();
    637       MachineRegisterInfo &RegInfo = MF.getRegInfo();
    638       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    639       SDValue Flag;
    640 
    641       SDValue Op0 = basePtr.getOperand(0);
    642       SDValue Op1 = basePtr.getOperand(1);
    643 
    644       if (isa<ConstantSDNode>(Op1)) {
    645         // Convert the (add <ptr>, <const>) to an indirect address contained
    646         // in a register. Note that this is done because we need to avoid
    647         // creating a 0(reg) d-form address due to the SPU's block loads.
    648         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    649         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
    650         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
    651       } else {
    652         // Convert the (add <arg1>, <arg2>) to an indirect address, which
    653         // will likely be lowered as a reg(reg) x-form address.
    654         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    655       }
    656     } else {
    657       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    658                             basePtr,
    659                             DAG.getConstant(0, PtrVT));
    660    }
    661 
    662     // Offset the rotate amount by the basePtr and the preferred slot
    663     // byte offset
    664     rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
    665                          basePtr,
    666                          DAG.getConstant(-pso, PtrVT));
    667   }
    668 
    669   // Do the load as a i128 to allow possible shifting
    670   SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
    671                        lowMemPtr,
    672                        LN->isVolatile(), LN->isNonTemporal(), false, 16);
    673 
    674   // When the size is not greater than alignment we get all data with just
    675   // one load
    676   if (alignment >= InVT.getSizeInBits()/8) {
    677     // Update the chain
    678     the_chain = low.getValue(1);
    679 
    680     // Rotate into the preferred slot:
    681     result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
    682                          low.getValue(0), rotate);
    683 
    684     // Convert the loaded v16i8 vector to the appropriate vector type
    685     // specified by the operand:
    686     EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
    687                                  InVT, (128 / InVT.getSizeInBits()));
    688     result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
    689                          DAG.getNode(ISD::BITCAST, dl, vecVT, result));
    690   }
    691   // When alignment is less than the size, we might need (known only at
    692   // run-time) two loads
    693   // TODO: if the memory address is composed only from constants, we have
    694   // extra kowledge, and might avoid the second load
    695   else {
    696     // storage position offset from lower 16 byte aligned memory chunk
    697     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
    698                                   basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
    699     // get a registerfull of ones. (this implementation is a workaround: LLVM
    700     // cannot handle 128 bit signed int constants)
    701     SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
    702     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
    703 
    704     SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
    705                                DAG.getNode(ISD::ADD, dl, PtrVT,
    706                                            basePtr,
    707                                            DAG.getConstant(16, PtrVT)),
    708                                highMemPtr,
    709                                LN->isVolatile(), LN->isNonTemporal(), false,
    710                                16);
    711 
    712     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
    713                                                               high.getValue(1));
    714 
    715     // Shift the (possible) high part right to compensate the misalignemnt.
    716     // if there is no highpart (i.e. value is i64 and offset is 4), this
    717     // will zero out the high value.
    718     high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
    719                                      DAG.getNode(ISD::SUB, dl, MVT::i32,
    720                                                  DAG.getConstant( 16, MVT::i32),
    721                                                  offset
    722                                                 ));
    723 
    724     // Shift the low similarly
    725     // TODO: add SPUISD::SHL_BYTES
    726     low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
    727 
    728     // Merge the two parts
    729     result = DAG.getNode(ISD::BITCAST, dl, vecVT,
    730                           DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
    731 
    732     if (!InVT.isVector()) {
    733       result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
    734      }
    735 
    736   }
    737     // Handle extending loads by extending the scalar result:
    738     if (ExtType == ISD::SEXTLOAD) {
    739       result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
    740     } else if (ExtType == ISD::ZEXTLOAD) {
    741       result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
    742     } else if (ExtType == ISD::EXTLOAD) {
    743       unsigned NewOpc = ISD::ANY_EXTEND;
    744 
    745       if (OutVT.isFloatingPoint())
    746         NewOpc = ISD::FP_EXTEND;
    747 
    748       result = DAG.getNode(NewOpc, dl, OutVT, result);
    749     }
    750 
    751     SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
    752     SDValue retops[2] = {
    753       result,
    754       the_chain
    755     };
    756 
    757     result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
    758                          retops, sizeof(retops) / sizeof(retops[0]));
    759     return result;
    760 }
    761 
/// Custom lower stores for CellSPU
/*!
 All CellSPU stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to generate a shuffle to insert the
 requested element into its place, then store the resulting block.

 \param Op  the STORE node being lowered (must be a StoreSDNode)
 \param DAG the selection DAG under construction
 \param ST  the current SPU subtarget (unused here; kept for signature
            symmetry with the other Lower* helpers)
 \return the lowered store (or TokenFactor of two stores), or an empty
         SDValue when the store is a naturally aligned 128-bit store
         that can be selected as-is
 */
static SDValue
LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  StoreSDNode *SN = cast<StoreSDNode>(Op);
  SDValue Value = SN->getValue();
  EVT VT = Value.getValueType();
  // For truncating stores the in-memory type differs from the value type.
  EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  unsigned alignment = SN->getAlignment();
  SDValue result;
  // Vector type filling a full 128-bit quadword with elements of StVT.
  EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
                                                 (128 / StVT.getSizeInBits()));
  // Get pointerinfos to the memory chunk(s) that contain the data to load
  // (offset rounded down to the containing 16-byte quadword; the "high"
  // info names the following quadword for a possible boundary crossing).
  uint64_t mpi_offset = SN->getPointerInfo().Offset;
  mpi_offset -= mpi_offset%16;
  MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
  MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);


  // two sanity checks
  assert( SN->getAddressingMode() == ISD::UNINDEXED
          && "we should get only UNINDEXED adresses");
  // clean aligned loads can be selected as-is
  if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
    return SDValue();

  SDValue alignLoadVec;
  SDValue basePtr = SN->getBasePtr();
  SDValue the_chain = SN->getChain();
  SDValue insertEltOffs;

  if ((alignment%16) == 0) {
    ConstantSDNode *CN;
    // Special cases for a known aligned load to simplify the base pointer
    // and insertion byte:
    if (basePtr.getOpcode() == ISD::ADD
        && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
      // Known offset into basePtr
      int64_t offset = CN->getSExtValue();

      // Simplify the base pointer for this case:
      basePtr = basePtr.getOperand(0);
      // Insertion byte is the offset within the quadword (low 4 bits).
      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant((offset & 0xf), PtrVT));

      // Fold the quadword-aligned part of the offset back into the base.
      if ((offset & ~0xf) > 0) {
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                              basePtr,
                              DAG.getConstant((offset & ~0xf), PtrVT));
      }
    } else {
      // Otherwise, assume it's at byte 0 of basePtr
      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant(0, PtrVT));
      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant(0, PtrVT));
    }
  } else {
    // Unaligned load: must be more pessimistic about addressing modes:
    if (basePtr.getOpcode() == ISD::ADD) {
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &RegInfo = MF.getRegInfo();
      unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
      SDValue Flag;

      SDValue Op0 = basePtr.getOperand(0);
      SDValue Op1 = basePtr.getOperand(1);

      if (isa<ConstantSDNode>(Op1)) {
        // Convert the (add <ptr>, <const>) to an indirect address contained
        // in a register. Note that this is done because we need to avoid
        // creating a 0(reg) d-form address due to the SPU's block loads.
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
        the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
        basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
      } else {
        // Convert the (add <arg1>, <arg2>) to an indirect address, which
        // will likely be lowered as a reg(reg) x-form address.
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
      }
    } else {
      // Wrap a plain pointer in a zero-offset indirect address.
      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                            basePtr,
                            DAG.getConstant(0, PtrVT));
    }

    // Insertion point is solely determined by basePtr's contents
    insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
                                basePtr,
                                DAG.getConstant(0, PtrVT));
  }

  // Load the lower part of the memory to which to store.
  SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
                          lowMemPtr, SN->isVolatile(), SN->isNonTemporal(),
                            false, 16);

  // if we don't need to store over the 16 byte boundary, one store suffices
  if (alignment >= StVT.getSizeInBits()/8) {
    // Update the chain
    the_chain = low.getValue(1);

    LoadSDNode *LN = cast<LoadSDNode>(low);
    SDValue theValue = SN->getValue();

    if (StVT != VT
        && (theValue.getOpcode() == ISD::AssertZext
            || theValue.getOpcode() == ISD::AssertSext)) {
      // Drill down and get the value for zero- and sign-extended
      // quantities
      theValue = theValue.getOperand(0);
    }

    // If the base pointer is already a D-form address, then just create
    // a new D-form address with a slot offset and the orignal base pointer.
    // Otherwise generate a D-form address with the slot offset relative
    // to the stack pointer, which is always aligned.
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "CellSPU LowerSTORE: basePtr = ";
        basePtr.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

    // Build a shuffle mask that will insert the value's bytes at the
    // computed byte offset, then merge it into the loaded quadword.
    SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
                                      insertEltOffs);
    SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
                                      theValue);

    result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
                         vectorizeOp, low,
                         DAG.getNode(ISD::BITCAST, dl,
                                     MVT::v4i32, insertEltOp));

    // Store the merged quadword back to the (aligned) low chunk.
    result = DAG.getStore(the_chain, dl, result, basePtr,
                          lowMemPtr,
                          LN->isVolatile(), LN->isNonTemporal(),
                          16);

  }
  // do the store when it might cross the 16 byte memory access boundary.
  else {
    // TODO issue a warning if SN->isVolatile()== true? This is likely not
    // what the user wanted.

    // address offset from nearest lower 16byte alinged address
    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
                                    SN->getBasePtr(),
                                    DAG.getConstant(0xf, MVT::i32));
    // 16 - offset
    SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                           DAG.getConstant( 16, MVT::i32),
                                           offset);
    // 16 - sizeof(Value)
    SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                     DAG.getConstant( 16, MVT::i32),
                                     DAG.getConstant( VT.getSizeInBits()/8,
                                                      MVT::i32));
    // get a registerfull of ones
    SDValue ones = DAG.getConstant(-1, MVT::v4i32);
    ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);

    // Create the 128 bit masks that have ones where the data to store is
    // located.
    SDValue lowmask, himask;
    // if the value to store don't fill up the an entire 128 bits, zero
    // out the last bits of the mask so that only the value we want to store
    // is masked.
    // this is e.g. in the case of store i32, align 2
    if (!VT.isVector()){
      // Scalars: splat into the preferred slot, then build a mask with
      // ones exactly over the value's bytes (shift right then left by the
      // surplus byte count to clear the trailing bytes).
      Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
      lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
      lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
                                                               surplus);
      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
      Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);

    }
    else {
      // Full-width vectors cover the whole quadword.
      lowmask = ones;
      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
    }
    // this will zero, if there are no data that goes to the high quad
    himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
                                                            offset_compl);
    lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
                                                             offset);

    // Load in the old data and zero out the parts that will be overwritten with
    // the new data to store.
    SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
                               DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                           DAG.getConstant( 16, PtrVT)),
                               highMemPtr,
                               SN->isVolatile(), SN->isNonTemporal(),
                               false, 16);
    // Join the two loads' chains so neither can be reordered past us.
    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
                                                              hi.getValue(1));

    // Punch holes in the old data where the new bytes will land
    // (AND with the complement of the masks).
    low = DAG.getNode(ISD::AND, dl, MVT::i128,
                        DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
    hi = DAG.getNode(ISD::AND, dl, MVT::i128,
                        DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));

    // Shift the Value to store into place. rlow contains the parts that go to
    // the lower memory chunk, rhi has the parts that go to the upper one.
    SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
    rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
    SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
                                                            offset_compl);

    // Merge the old data and the new data and store the results
    // Need to convert vectors here to integer as 'OR'ing floats assert
    rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
    rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
                         DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
                         DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));

    low = DAG.getStore(the_chain, dl, rlow, basePtr,
                          lowMemPtr,
                          SN->isVolatile(), SN->isNonTemporal(), 16);
    hi  = DAG.getStore(the_chain, dl, rhi,
                            DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                        DAG.getConstant( 16, PtrVT)),
                            highMemPtr,
                            SN->isVolatile(), SN->isNonTemporal(), 16);
    // The result is the join of both stores' chains.
    result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
                                                           hi.getValue(0));
  }

  return result;
}
   1008 
   1009 //! Generate the address of a constant pool entry.
   1010 static SDValue
   1011 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1012   EVT PtrVT = Op.getValueType();
   1013   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   1014   const Constant *C = CP->getConstVal();
   1015   SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
   1016   SDValue Zero = DAG.getConstant(0, PtrVT);
   1017   const TargetMachine &TM = DAG.getTarget();
   1018   // FIXME there is no actual debug info here
   1019   DebugLoc dl = Op.getDebugLoc();
   1020 
   1021   if (TM.getRelocationModel() == Reloc::Static) {
   1022     if (!ST->usingLargeMem()) {
   1023       // Just return the SDValue with the constant pool address in it.
   1024       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
   1025     } else {
   1026       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
   1027       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
   1028       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1029     }
   1030   }
   1031 
   1032   llvm_unreachable("LowerConstantPool: Relocation model other than static"
   1033                    " not supported.");
   1034 }
   1035 
   1036 //! Alternate entry point for generating the address of a constant pool entry
   1037 SDValue
   1038 SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
   1039   return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
   1040 }
   1041 
   1042 static SDValue
   1043 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1044   EVT PtrVT = Op.getValueType();
   1045   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   1046   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
   1047   SDValue Zero = DAG.getConstant(0, PtrVT);
   1048   const TargetMachine &TM = DAG.getTarget();
   1049   // FIXME there is no actual debug info here
   1050   DebugLoc dl = Op.getDebugLoc();
   1051 
   1052   if (TM.getRelocationModel() == Reloc::Static) {
   1053     if (!ST->usingLargeMem()) {
   1054       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
   1055     } else {
   1056       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
   1057       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
   1058       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1059     }
   1060   }
   1061 
   1062   llvm_unreachable("LowerJumpTable: Relocation model other than static"
   1063                    " not supported.");
   1064 }
   1065 
   1066 static SDValue
   1067 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1068   EVT PtrVT = Op.getValueType();
   1069   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
   1070   const GlobalValue *GV = GSDN->getGlobal();
   1071   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
   1072                                           PtrVT, GSDN->getOffset());
   1073   const TargetMachine &TM = DAG.getTarget();
   1074   SDValue Zero = DAG.getConstant(0, PtrVT);
   1075   // FIXME there is no actual debug info here
   1076   DebugLoc dl = Op.getDebugLoc();
   1077 
   1078   if (TM.getRelocationModel() == Reloc::Static) {
   1079     if (!ST->usingLargeMem()) {
   1080       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
   1081     } else {
   1082       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
   1083       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
   1084       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1085     }
   1086   } else {
   1087     report_fatal_error("LowerGlobalAddress: Relocation model other than static"
   1088                       "not supported.");
   1089     /*NOTREACHED*/
   1090   }
   1091 }
   1092 
   1093 //! Custom lower double precision floating point constants
   1094 static SDValue
   1095 LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
   1096   EVT VT = Op.getValueType();
   1097   // FIXME there is no actual debug info here
   1098   DebugLoc dl = Op.getDebugLoc();
   1099 
   1100   if (VT == MVT::f64) {
   1101     ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
   1102 
   1103     assert((FP != 0) &&
   1104            "LowerConstantFP: Node is not ConstantFPSDNode");
   1105 
   1106     uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
   1107     SDValue T = DAG.getConstant(dbits, MVT::i64);
   1108     SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
   1109     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   1110                        DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
   1111   }
   1112 
   1113   return SDValue();
   1114 }
   1115 
   1116 SDValue
   1117 SPUTargetLowering::LowerFormalArguments(SDValue Chain,
   1118                                         CallingConv::ID CallConv, bool isVarArg,
   1119                                         const SmallVectorImpl<ISD::InputArg>
   1120                                           &Ins,
   1121                                         DebugLoc dl, SelectionDAG &DAG,
   1122                                         SmallVectorImpl<SDValue> &InVals)
   1123                                           const {
   1124 
   1125   MachineFunction &MF = DAG.getMachineFunction();
   1126   MachineFrameInfo *MFI = MF.getFrameInfo();
   1127   MachineRegisterInfo &RegInfo = MF.getRegInfo();
   1128   SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
   1129 
   1130   unsigned ArgOffset = SPUFrameLowering::minStackSize();
   1131   unsigned ArgRegIdx = 0;
   1132   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
   1133 
   1134   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   1135 
   1136   SmallVector<CCValAssign, 16> ArgLocs;
   1137   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1138                  getTargetMachine(), ArgLocs, *DAG.getContext());
   1139   // FIXME: allow for other calling conventions
   1140   CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
   1141 
   1142   // Add DAG nodes to load the arguments or copy them out of registers.
   1143   for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
   1144     EVT ObjectVT = Ins[ArgNo].VT;
   1145     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
   1146     SDValue ArgVal;
   1147     CCValAssign &VA = ArgLocs[ArgNo];
   1148 
   1149     if (VA.isRegLoc()) {
   1150       const TargetRegisterClass *ArgRegClass;
   1151 
   1152       switch (ObjectVT.getSimpleVT().SimpleTy) {
   1153       default:
   1154         report_fatal_error("LowerFormalArguments Unhandled argument type: " +
   1155                            Twine(ObjectVT.getEVTString()));
   1156       case MVT::i8:
   1157         ArgRegClass = &SPU::R8CRegClass;
   1158         break;
   1159       case MVT::i16:
   1160         ArgRegClass = &SPU::R16CRegClass;
   1161         break;
   1162       case MVT::i32:
   1163         ArgRegClass = &SPU::R32CRegClass;
   1164         break;
   1165       case MVT::i64:
   1166         ArgRegClass = &SPU::R64CRegClass;
   1167         break;
   1168       case MVT::i128:
   1169         ArgRegClass = &SPU::GPRCRegClass;
   1170         break;
   1171       case MVT::f32:
   1172         ArgRegClass = &SPU::R32FPRegClass;
   1173         break;
   1174       case MVT::f64:
   1175         ArgRegClass = &SPU::R64FPRegClass;
   1176         break;
   1177       case MVT::v2f64:
   1178       case MVT::v4f32:
   1179       case MVT::v2i64:
   1180       case MVT::v4i32:
   1181       case MVT::v8i16:
   1182       case MVT::v16i8:
   1183         ArgRegClass = &SPU::VECREGRegClass;
   1184         break;
   1185       }
   1186 
   1187       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
   1188       RegInfo.addLiveIn(VA.getLocReg(), VReg);
   1189       ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
   1190       ++ArgRegIdx;
   1191     } else {
   1192       // We need to load the argument to a virtual register if we determined
   1193       // above that we ran out of physical registers of the appropriate type
   1194       // or we're forced to do vararg
   1195       int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
   1196       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
   1197       ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
   1198                            false, false, false, 0);
   1199       ArgOffset += StackSlotSize;
   1200     }
   1201 
   1202     InVals.push_back(ArgVal);
   1203     // Update the chain
   1204     Chain = ArgVal.getOperand(0);
   1205   }
   1206 
   1207   // vararg handling:
   1208   if (isVarArg) {
   1209     // FIXME: we should be able to query the argument registers from
   1210     //        tablegen generated code.
   1211     static const uint16_t ArgRegs[] = {
   1212       SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
   1213       SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
   1214       SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
   1215       SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
   1216       SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
   1217       SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
   1218       SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
   1219       SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
   1220       SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
   1221       SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
   1222       SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
   1223     };
   1224     // size of ArgRegs array
   1225     const unsigned NumArgRegs = 77;
   1226 
   1227     // We will spill (79-3)+1 registers to the stack
   1228     SmallVector<SDValue, 79-3+1> MemOps;
   1229 
   1230     // Create the frame slot
   1231     for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
   1232       FuncInfo->setVarArgsFrameIndex(
   1233         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
   1234       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   1235       unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
   1236       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
   1237       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
   1238                                    false, false, 0);
   1239       Chain = Store.getOperand(0);
   1240       MemOps.push_back(Store);
   1241 
   1242       // Increment address by stack slot size for the next stored argument
   1243       ArgOffset += StackSlotSize;
   1244     }
   1245     if (!MemOps.empty())
   1246       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   1247                           &MemOps[0], MemOps.size());
   1248   }
   1249 
   1250   return Chain;
   1251 }
   1252 
   1253 /// isLSAAddress - Return the immediate to use if the specified
   1254 /// value is representable as a LSA address.
   1255 static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
   1256   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
   1257   if (!C) return 0;
   1258 
   1259   int Addr = C->getZExtValue();
   1260   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
   1261       (Addr << 14 >> 14) != Addr)
   1262     return 0;  // Top 14 bits have to be sext of immediate.
   1263 
   1264   return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
   1265 }
   1266 
/// LowerCall - Lower an outgoing call for the CellSPU target.
///
/// Marshals the outgoing arguments into argument registers (per CCC_SPU)
/// and stack slots, rewrites the callee into a BRSL/BRASL-style or indirect
/// address node depending on the code model, emits the CALLSEQ_START /
/// SPUISD::CALL / CALLSEQ_END sequence, and finally copies any return
/// values out of the return-value registers into InVals.
SDValue
SPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG                     = CLI.DAG;
  DebugLoc &dl                          = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
  SDValue Chain                         = CLI.Chain;
  SDValue Callee                        = CLI.Callee;
  bool &isTailCall                      = CLI.IsTailCall;
  CallingConv::ID CallConv              = CLI.CallConv;
  bool isVarArg                         = CLI.IsVarArg;

  // CellSPU target does not yet support tail call optimization.
  isTailCall = false;

  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  unsigned NumOps     = Outs.size();
  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();

  // Run the outgoing arguments through the calling convention; the number
  // of resulting locations (NumArgRegs) is the number of register-passed
  // arguments — everything past that index spills to the stack below.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  // FIXME: allow for other calling conventions
  CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);

  const unsigned NumArgRegs = ArgLocs.size();


  // Handy pointer type
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory.
  unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
  unsigned ArgRegIdx = 0;

  // Keep track of registers passing arguments
  std::vector<std::pair<unsigned, SDValue> > RegsToPass;
  // And the arguments passed on the stack
  SmallVector<SDValue, 8> MemOpChains;

  for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
    SDValue Arg = OutVals[ArgRegIdx];
    CCValAssign &VA = ArgLocs[ArgRegIdx];

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i8:
    case MVT::i16:
    case MVT::i32:
    case MVT::i64:
    case MVT::i128:
    case MVT::f32:
    case MVT::f64:
    case MVT::v2i64:
    case MVT::v2f64:
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      // First NumArgRegs operands go in registers (as assigned by the CC
      // analysis above); the remainder are stored to stack slots.
      if (ArgRegIdx != NumArgRegs) {
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      } else {
        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
                                           MachinePointerInfo(),
                                           false, false, 0));
        ArgOffset += StackSlotSize;
      }
      break;
    }
  }

  // Accumulate how many bytes are to be pushed on the stack, including the
  // linkage area, and parameter passing area.  According to the SPU ABI,
  // we minimally need space for [LR] and [SP].
  unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();

  // Insert a call sequence start
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
                                                            true));

  if (!MemOpChains.empty()) {
    // Adjust the stack pointer for the stack arguments.
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  SmallVector<SDValue, 8> Ops;
  unsigned CallOpc = SPUISD::CALL;

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);

    if (!ST->usingLargeMem()) {
      // Turn calls to targets that are defined (i.e., have bodies) into BRSL
      // style calls, otherwise, external symbols are BRASL calls. This assumes
      // that declared/defined symbols are in the same compilation unit and can
      // be reached through PC-relative jumps.
      //
      // NOTE:
      // This may be an unsafe assumption for JIT and really large compilation
      // units.
      if (GV->isDeclaration()) {
        Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
      } else {
        Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
      }
    } else {
      // "Large memory" mode: Turn all calls into indirect calls with a X-form
      // address pairs:
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
        Callee.getValueType());

    // External symbols: A-form (absolute) in small-memory mode, indirect
    // X-form otherwise — same policy as for global addresses above.
    if (!ST->usingLargeMem()) {
      Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
    } else {
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
    }
  } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
    // If this is an absolute destination address that appears to be a legal
    // local store address, use the munged value.
    Callee = SDValue(Dest, 0);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.getNode())
    Ops.push_back(InFlag);
  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
                      &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
                             DAG.getIntPtrConstant(0, true), InFlag);
  // Only thread the glue through to the result copies when there are
  // results to copy.
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // If the function returns void, just return the chain.
  if (Ins.empty())
    return Chain;

  // Now handle the return value(s)
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                    getTargetMachine(), RVLocs, *DAG.getContext());
  CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);


  // If the call has results, copy the values out of the ret val registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                                     InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);
    InVals.push_back(Val);
   }

  return Chain;
}
   1468 
   1469 SDValue
   1470 SPUTargetLowering::LowerReturn(SDValue Chain,
   1471                                CallingConv::ID CallConv, bool isVarArg,
   1472                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1473                                const SmallVectorImpl<SDValue> &OutVals,
   1474                                DebugLoc dl, SelectionDAG &DAG) const {
   1475 
   1476   SmallVector<CCValAssign, 16> RVLocs;
   1477   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1478                  getTargetMachine(), RVLocs, *DAG.getContext());
   1479   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
   1480 
   1481   // If this is the first return lowered for this function, add the regs to the
   1482   // liveout set for the function.
   1483   if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
   1484     for (unsigned i = 0; i != RVLocs.size(); ++i)
   1485       DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
   1486   }
   1487 
   1488   SDValue Flag;
   1489 
   1490   // Copy the result values into the output registers.
   1491   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1492     CCValAssign &VA = RVLocs[i];
   1493     assert(VA.isRegLoc() && "Can only return in registers!");
   1494     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
   1495                              OutVals[i], Flag);
   1496     Flag = Chain.getValue(1);
   1497   }
   1498 
   1499   if (Flag.getNode())
   1500     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
   1501   else
   1502     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
   1503 }
   1504 
   1505 
   1506 //===----------------------------------------------------------------------===//
   1507 // Vector related lowering:
   1508 //===----------------------------------------------------------------------===//
   1509 
   1510 static ConstantSDNode *
   1511 getVecImm(SDNode *N) {
   1512   SDValue OpVal(0, 0);
   1513 
   1514   // Check to see if this buildvec has a single non-undef value in its elements.
   1515   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
   1516     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
   1517     if (OpVal.getNode() == 0)
   1518       OpVal = N->getOperand(i);
   1519     else if (OpVal != N->getOperand(i))
   1520       return 0;
   1521   }
   1522 
   1523   if (OpVal.getNode() != 0) {
   1524     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
   1525       return CN;
   1526     }
   1527   }
   1528 
   1529   return 0;
   1530 }
   1531 
   1532 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
   1533 /// and the value fits into an unsigned 18-bit constant, and if so, return the
   1534 /// constant
   1535 SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
   1536                               EVT ValueType) {
   1537   if (ConstantSDNode *CN = getVecImm(N)) {
   1538     uint64_t Value = CN->getZExtValue();
   1539     if (ValueType == MVT::i64) {
   1540       uint64_t UValue = CN->getZExtValue();
   1541       uint32_t upper = uint32_t(UValue >> 32);
   1542       uint32_t lower = uint32_t(UValue);
   1543       if (upper != lower)
   1544         return SDValue();
   1545       Value = Value >> 32;
   1546     }
   1547     if (Value <= 0x3ffff)
   1548       return DAG.getTargetConstant(Value, ValueType);
   1549   }
   1550 
   1551   return SDValue();
   1552 }
   1553 
   1554 /// get_vec_i16imm - Test if this vector is a vector filled with the same value
   1555 /// and the value fits into a signed 16-bit constant, and if so, return the
   1556 /// constant
   1557 SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
   1558                               EVT ValueType) {
   1559   if (ConstantSDNode *CN = getVecImm(N)) {
   1560     int64_t Value = CN->getSExtValue();
   1561     if (ValueType == MVT::i64) {
   1562       uint64_t UValue = CN->getZExtValue();
   1563       uint32_t upper = uint32_t(UValue >> 32);
   1564       uint32_t lower = uint32_t(UValue);
   1565       if (upper != lower)
   1566         return SDValue();
   1567       Value = Value >> 32;
   1568     }
   1569     if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
   1570       return DAG.getTargetConstant(Value, ValueType);
   1571     }
   1572   }
   1573 
   1574   return SDValue();
   1575 }
   1576 
   1577 /// get_vec_i10imm - Test if this vector is a vector filled with the same value
   1578 /// and the value fits into a signed 10-bit constant, and if so, return the
   1579 /// constant
   1580 SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
   1581                               EVT ValueType) {
   1582   if (ConstantSDNode *CN = getVecImm(N)) {
   1583     int64_t Value = CN->getSExtValue();
   1584     if (ValueType == MVT::i64) {
   1585       uint64_t UValue = CN->getZExtValue();
   1586       uint32_t upper = uint32_t(UValue >> 32);
   1587       uint32_t lower = uint32_t(UValue);
   1588       if (upper != lower)
   1589         return SDValue();
   1590       Value = Value >> 32;
   1591     }
   1592     if (isInt<10>(Value))
   1593       return DAG.getTargetConstant(Value, ValueType);
   1594   }
   1595 
   1596   return SDValue();
   1597 }
   1598 
/// get_vec_i8imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 8-bit constant, and if so, return the
/// constant.
///
/// @note: The incoming vector is v16i8 because that's the only way we can load
/// constant vectors. Thus, we test to see if the upper and lower bytes are the
/// same value.
SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
                             EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    int Value = (int) CN->getZExtValue();
    // i16 request: the 16-bit splat must look like one byte duplicated.
    // Note the asymmetry: ((short) Value >> 8) sign-extends the high byte,
    // while ((short) Value & 0xff) zero-extends the low byte.
    // NOTE(review): as a consequence, byte values with the high bit set
    // (e.g. 0x8080) fail this comparison and are not recognized as byte
    // splats — presumably a conservative rejection; confirm intentional.
    if (ValueType == MVT::i16
        && Value <= 0xffff                 /* truncated from uint64_t */
        && ((short) Value >> 8) == ((short) Value & 0xff))
      return DAG.getTargetConstant(Value & 0xff, ValueType);
    else if (ValueType == MVT::i8
             && (Value & 0xff) == Value)
      // i8 request: the value already fits in a single unsigned byte.
      return DAG.getTargetConstant(Value, ValueType);
  }

  return SDValue();
}
   1621 
   1622 /// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
   1623 /// and the value fits into a signed 16-bit constant, and if so, return the
   1624 /// constant
   1625 SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
   1626                                EVT ValueType) {
   1627   if (ConstantSDNode *CN = getVecImm(N)) {
   1628     uint64_t Value = CN->getZExtValue();
   1629     if ((ValueType == MVT::i32
   1630           && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
   1631         || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
   1632       return DAG.getTargetConstant(Value >> 16, ValueType);
   1633   }
   1634 
   1635   return SDValue();
   1636 }
   1637 
   1638 /// get_v4i32_imm - Catch-all for general 32-bit constant vectors
   1639 SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
   1640   if (ConstantSDNode *CN = getVecImm(N)) {
   1641     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
   1642   }
   1643 
   1644   return SDValue();
   1645 }
   1646 
   1647 /// get_v4i32_imm - Catch-all for general 64-bit constant vectors
   1648 SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
   1649   if (ConstantSDNode *CN = getVecImm(N)) {
   1650     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
   1651   }
   1652 
   1653   return SDValue();
   1654 }
   1655 
//! Lower a BUILD_VECTOR instruction creatively:
//! Constant splats are rebuilt as integer BUILD_VECTORs (bitcast back for FP
//! types) so they can be matched by the IL/ILH/ILHU family of immediate-load
//! instructions; v2i64 splats are delegated to SPU::LowerV2I64Splat.
static SDValue
LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();
  BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
  unsigned minSplatBits = EltVT.getSizeInBits();

  // Ask isConstantSplat for at least a 16-bit splat element; narrower
  // elements are widened below (see the v16i8 case).
  if (minSplatBits < 16)
    minSplatBits = 16;

  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;

  // Only constant splats are handled here; anything else is left for the
  // default expansion.
  if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                            HasAnyUndefs, minSplatBits)
      || minSplatBits < SplatBitSize)
    return SDValue();   // Wasn't a constant vector or splat exceeded min

  uint64_t SplatBits = APSplatBits.getZExtValue();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
                       Twine(VT.getEVTString()));
    /*NOTREACHED*/
  case MVT::v4f32: {
    uint32_t Value32 = uint32_t(SplatBits);
    assert(SplatBitSize == 32
           && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(Value32, MVT::i32);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
  }
  case MVT::v2f64: {
    uint64_t f64val = uint64_t(SplatBits);
    assert(SplatBitSize == 64
           && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(f64val, MVT::i64);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
  }
  case MVT::v16i8: {
   // 8-bit constants have to be expanded to 16-bits
   unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
   SmallVector<SDValue, 8> Ops;

   // Build as a v8i16 splat, then bitcast back to v16i8.
   Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
  }
  case MVT::v8i16: {
    unsigned short Value16 = SplatBits;
    SDValue T = DAG.getConstant(Value16, EltVT);
    SmallVector<SDValue, 8> Ops;

    Ops.assign(8, T);
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
  }
  case MVT::v4i32: {
    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
  }
  case MVT::v2i64: {
    // 64-bit splats may need shuffle-byte tricks; handled separately.
    return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
  }
  }
}
   1729 
   1730 /*!
   1731  */
   1732 SDValue
   1733 SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
   1734                      DebugLoc dl) {
   1735   uint32_t upper = uint32_t(SplatVal >> 32);
   1736   uint32_t lower = uint32_t(SplatVal);
   1737 
   1738   if (upper == lower) {
   1739     // Magic constant that can be matched by IL, ILA, et. al.
   1740     SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
   1741     return DAG.getNode(ISD::BITCAST, dl, OpVT,
   1742                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1743                                    Val, Val, Val, Val));
   1744   } else {
   1745     bool upper_special, lower_special;
   1746 
   1747     // NOTE: This code creates common-case shuffle masks that can be easily
   1748     // detected as common expressions. It is not attempting to create highly
   1749     // specialized masks to replace any and all 0's, 0xff's and 0x80's.
   1750 
   1751     // Detect if the upper or lower half is a special shuffle mask pattern:
   1752     upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
   1753     lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
   1754 
   1755     // Both upper and lower are special, lower to a constant pool load:
   1756     if (lower_special && upper_special) {
   1757       SDValue UpperVal = DAG.getConstant(upper, MVT::i32);
   1758       SDValue LowerVal = DAG.getConstant(lower, MVT::i32);
   1759       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1760                          UpperVal, LowerVal, UpperVal, LowerVal);
   1761       return DAG.getNode(ISD::BITCAST, dl, OpVT, BV);
   1762     }
   1763 
   1764     SDValue LO32;
   1765     SDValue HI32;
   1766     SmallVector<SDValue, 16> ShufBytes;
   1767     SDValue Result;
   1768 
   1769     // Create lower vector if not a special pattern
   1770     if (!lower_special) {
   1771       SDValue LO32C = DAG.getConstant(lower, MVT::i32);
   1772       LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
   1773                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1774                                      LO32C, LO32C, LO32C, LO32C));
   1775     }
   1776 
   1777     // Create upper vector if not a special pattern
   1778     if (!upper_special) {
   1779       SDValue HI32C = DAG.getConstant(upper, MVT::i32);
   1780       HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
   1781                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1782                                      HI32C, HI32C, HI32C, HI32C));
   1783     }
   1784 
   1785     // If either upper or lower are special, then the two input operands are
   1786     // the same (basically, one of them is a "don't care")
   1787     if (lower_special)
   1788       LO32 = HI32;
   1789     if (upper_special)
   1790       HI32 = LO32;
   1791 
   1792     for (int i = 0; i < 4; ++i) {
   1793       uint64_t val = 0;
   1794       for (int j = 0; j < 4; ++j) {
   1795         SDValue V;
   1796         bool process_upper, process_lower;
   1797         val <<= 8;
   1798         process_upper = (upper_special && (i & 1) == 0);
   1799         process_lower = (lower_special && (i & 1) == 1);
   1800 
   1801         if (process_upper || process_lower) {
   1802           if ((process_upper && upper == 0)
   1803                   || (process_lower && lower == 0))
   1804             val |= 0x80;
   1805           else if ((process_upper && upper == 0xffffffff)
   1806                   || (process_lower && lower == 0xffffffff))
   1807             val |= 0xc0;
   1808           else if ((process_upper && upper == 0x80000000)
   1809                   || (process_lower && lower == 0x80000000))
   1810             val |= (j == 0 ? 0xe0 : 0x80);
   1811         } else
   1812           val |= i * 4 + j + ((i & 1) * 16);
   1813       }
   1814 
   1815       ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
   1816     }
   1817 
   1818     return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
   1819                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1820                                    &ShufBytes[0], ShufBytes.size()));
   1821   }
   1822 }
   1823 
/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
/// which the Cell can operate. The code inspects V3 to ascertain whether the
/// permutation vector, V3, is monotonically increasing with one "exception"
/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
/// In either case, the net result is going to eventually invoke SHUFB to
/// permute/shuffle the bytes from V1 and V2.
/// \note
/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
/// control word for byte/halfword/word insertion. This takes care of a single
/// element move from V2 into V1.
/// \note
/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();

  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;

  // If we have a single element being moved from V1 to V2, this can be handled
  // using the C*[DX] compute mask instructions, but the vector elements have
  // to be monotonically increasing with one exception element, and the source
  // slot of the element to move must be the same as the destination.
  EVT VecVT = V1.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned EltsFromV2 = 0;       // count of mask elements taken from V2
  unsigned V2EltOffset = 0;      // byte offset of the single V2 element
  unsigned V2EltIdx0 = 0;        // first mask index that refers to V2
  unsigned CurrElt = 0;          // expected next index in a monotonic mask
  unsigned MaxElts = VecVT.getVectorNumElements();
  unsigned PrevElt = 0;
  bool monotonic = true;         // candidate for SHUFFLE_MASK (C*D) lowering
  bool rotate = true;            // candidate for ROTBYTES_LEFT lowering
  int rotamt=0;
  EVT maskVT;             // which of the c?d instructions to use

  if (EltVT == MVT::i8) {
    V2EltIdx0 = 16;
    maskVT = MVT::v16i8;
  } else if (EltVT == MVT::i16) {
    V2EltIdx0 = 8;
    maskVT = MVT::v8i16;
  } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
    V2EltIdx0 = 4;
    maskVT = MVT::v4i32;
  } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
    V2EltIdx0 = 2;
    maskVT = MVT::v2i64;
  } else
    llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");

  // Single pass over the mask, tracking both candidate patterns at once.
  for (unsigned i = 0; i != MaxElts; ++i) {
    if (SVN->getMaskElt(i) < 0)
      continue;                  // undef mask element: compatible with both

    unsigned SrcElt = SVN->getMaskElt(i);

    if (monotonic) {
      if (SrcElt >= V2EltIdx0) {
        // TODO: optimize for the monotonic case when several consecutive
        // elements are taken form V2. Do we ever get such a case?
        if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
          V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
        else
          monotonic = false;
        ++EltsFromV2;
      } else if (CurrElt != SrcElt) {
        monotonic = false;
      }

      ++CurrElt;
    }

    if (rotate) {
      if (PrevElt > 0 && SrcElt < MaxElts) {
        // Continue the rotation: each index must follow its predecessor
        // (with wrap-around from MaxElts-1 back to 0).
        if ((PrevElt == SrcElt - 1)
            || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
          PrevElt = SrcElt;
        } else {
          rotate = false;
        }
      } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
        // First time or after a "wrap around"
        rotamt = SrcElt-i;
        PrevElt = SrcElt;
      } else {
        // This isn't a rotation, takes elements from vector 2
        rotate = false;
      }
    }
  }

  if (EltsFromV2 == 1 && monotonic) {
    // Compute mask and shuffle
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

    // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
    // R1 ($sp) is used here only as it is guaranteed to have last bits zero
    SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                DAG.getRegister(SPU::R1, PtrVT),
                                DAG.getConstant(V2EltOffset, MVT::i32));
    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
                                     maskVT, Pointer);

    // Use shuffle mask in SHUFB synthetic instruction:
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
                       ShufMaskOp);
  } else if (rotate) {
    // Convert the rotate amount from elements to bytes, normalizing a
    // negative amount (possible after a wrap-around) first.
    if (rotamt < 0)
      rotamt +=MaxElts;
    rotamt *= EltVT.getSizeInBits()/8;
    return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
                       V1, DAG.getConstant(rotamt, MVT::i16));
  } else {
   // Convert the SHUFFLE_VECTOR mask's input element units to the
   // actual bytes.
    unsigned BytesPerElement = EltVT.getSizeInBits()/8;

    SmallVector<SDValue, 16> ResultMask;
    for (unsigned i = 0, e = MaxElts; i != e; ++i) {
      unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);

      for (unsigned j = 0; j < BytesPerElement; ++j)
        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
    }
    SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
                                    &ResultMask[0], ResultMask.size());
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
  }
}
   1956 
   1957 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   1958   SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
   1959   DebugLoc dl = Op.getDebugLoc();
   1960 
   1961   if (Op0.getNode()->getOpcode() == ISD::Constant) {
   1962     // For a constant, build the appropriate constant vector, which will
   1963     // eventually simplify to a vector register load.
   1964 
   1965     ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
   1966     SmallVector<SDValue, 16> ConstVecValues;
   1967     EVT VT;
   1968     size_t n_copies;
   1969 
   1970     // Create a constant vector:
   1971     switch (Op.getValueType().getSimpleVT().SimpleTy) {
   1972     default: llvm_unreachable("Unexpected constant value type in "
   1973                               "LowerSCALAR_TO_VECTOR");
   1974     case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
   1975     case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
   1976     case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
   1977     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
   1978     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
   1979     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
   1980     }
   1981 
   1982     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
   1983     for (size_t j = 0; j < n_copies; ++j)
   1984       ConstVecValues.push_back(CValue);
   1985 
   1986     return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
   1987                        &ConstVecValues[0], ConstVecValues.size());
   1988   } else {
   1989     // Otherwise, copy the value from one register to another:
   1990     switch (Op0.getValueType().getSimpleVT().SimpleTy) {
   1991     default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
   1992     case MVT::i8:
   1993     case MVT::i16:
   1994     case MVT::i32:
   1995     case MVT::i64:
   1996     case MVT::f32:
   1997     case MVT::f64:
   1998       return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
   1999     }
   2000   }
   2001 }
   2002 
   2003 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   2004   EVT VT = Op.getValueType();
   2005   SDValue N = Op.getOperand(0);
   2006   SDValue Elt = Op.getOperand(1);
   2007   DebugLoc dl = Op.getDebugLoc();
   2008   SDValue retval;
   2009 
   2010   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
   2011     // Constant argument:
   2012     int EltNo = (int) C->getZExtValue();
   2013 
   2014     // sanity checks:
   2015     if (VT == MVT::i8 && EltNo >= 16)
   2016       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
   2017     else if (VT == MVT::i16 && EltNo >= 8)
   2018       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
   2019     else if (VT == MVT::i32 && EltNo >= 4)
   2020       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
   2021     else if (VT == MVT::i64 && EltNo >= 2)
   2022       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
   2023 
   2024     if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
   2025       // i32 and i64: Element 0 is the preferred slot
   2026       return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
   2027     }
   2028 
   2029     // Need to generate shuffle mask and extract:
   2030     int prefslot_begin = -1, prefslot_end = -1;
   2031     int elt_byte = EltNo * VT.getSizeInBits() / 8;
   2032 
   2033     switch (VT.getSimpleVT().SimpleTy) {
   2034     default: llvm_unreachable("Invalid value type!");
   2035     case MVT::i8: {
   2036       prefslot_begin = prefslot_end = 3;
   2037       break;
   2038     }
   2039     case MVT::i16: {
   2040       prefslot_begin = 2; prefslot_end = 3;
   2041       break;
   2042     }
   2043     case MVT::i32:
   2044     case MVT::f32: {
   2045       prefslot_begin = 0; prefslot_end = 3;
   2046       break;
   2047     }
   2048     case MVT::i64:
   2049     case MVT::f64: {
   2050       prefslot_begin = 0; prefslot_end = 7;
   2051       break;
   2052     }
   2053     }
   2054 
   2055     assert(prefslot_begin != -1 && prefslot_end != -1 &&
   2056            "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
   2057 
   2058     unsigned int ShufBytes[16] = {
   2059       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
   2060     };
   2061     for (int i = 0; i < 16; ++i) {
   2062       // zero fill uppper part of preferred slot, don't care about the
   2063       // other slots:
   2064       unsigned int mask_val;
   2065       if (i <= prefslot_end) {
   2066         mask_val =
   2067           ((i < prefslot_begin)
   2068            ? 0x80
   2069            : elt_byte + (i - prefslot_begin));
   2070 
   2071         ShufBytes[i] = mask_val;
   2072       } else
   2073         ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
   2074     }
   2075 
   2076     SDValue ShufMask[4];
   2077     for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
   2078       unsigned bidx = i * 4;
   2079       unsigned int bits = ((ShufBytes[bidx] << 24) |
   2080                            (ShufBytes[bidx+1] << 16) |
   2081                            (ShufBytes[bidx+2] << 8) |
   2082                            ShufBytes[bidx+3]);
   2083       ShufMask[i] = DAG.getConstant(bits, MVT::i32);
   2084     }
   2085 
   2086     SDValue ShufMaskVec =
   2087       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2088                   &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
   2089 
   2090     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   2091                          DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
   2092                                      N, N, ShufMaskVec));
   2093   } else {
   2094     // Variable index: Rotate the requested element into slot 0, then replicate
   2095     // slot 0 across the vector
   2096     EVT VecVT = N.getValueType();
   2097     if (!VecVT.isSimple() || !VecVT.isVector()) {
   2098       report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
   2099                         "vector type!");
   2100     }
   2101 
   2102     // Make life easier by making sure the index is zero-extended to i32
   2103     if (Elt.getValueType() != MVT::i32)
   2104       Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
   2105 
   2106     // Scale the index to a bit/byte shift quantity
   2107     APInt scaleFactor =
   2108             APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
   2109     unsigned scaleShift = scaleFactor.logBase2();
   2110     SDValue vecShift;
   2111 
   2112     if (scaleShift > 0) {
   2113       // Scale the shift factor:
   2114       Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
   2115                         DAG.getConstant(scaleShift, MVT::i32));
   2116     }
   2117 
   2118     vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
   2119 
   2120     // Replicate the bytes starting at byte 0 across the entire vector (for
   2121     // consistency with the notion of a unified register set)
   2122     SDValue replicate;
   2123 
   2124     switch (VT.getSimpleVT().SimpleTy) {
   2125     default:
   2126       report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
   2127                         "type");
   2128       /*NOTREACHED*/
   2129     case MVT::i8: {
   2130       SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
   2131       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2132                               factor, factor, factor, factor);
   2133       break;
   2134     }
   2135     case MVT::i16: {
   2136       SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
   2137       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2138                               factor, factor, factor, factor);
   2139       break;
   2140     }
   2141     case MVT::i32:
   2142     case MVT::f32: {
   2143       SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
   2144       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2145                               factor, factor, factor, factor);
   2146       break;
   2147     }
   2148     case MVT::i64:
   2149     case MVT::f64: {
   2150       SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
   2151       SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
   2152       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2153                               loFactor, hiFactor, loFactor, hiFactor);
   2154       break;
   2155     }
   2156     }
   2157 
   2158     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   2159                          DAG.getNode(SPUISD::SHUFB, dl, VecVT,
   2160                                      vecShift, vecShift, replicate));
   2161   }
   2162 
   2163   return retval;
   2164 }
   2165 
   2166 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   2167   SDValue VecOp = Op.getOperand(0);
   2168   SDValue ValOp = Op.getOperand(1);
   2169   SDValue IdxOp = Op.getOperand(2);
   2170   DebugLoc dl = Op.getDebugLoc();
   2171   EVT VT = Op.getValueType();
   2172   EVT eltVT = ValOp.getValueType();
   2173 
   2174   // use 0 when the lane to insert to is 'undef'
   2175   int64_t Offset=0;
   2176   if (IdxOp.getOpcode() != ISD::UNDEF) {
   2177     ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
   2178     assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
   2179     Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
   2180   }
   2181 
   2182   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   2183   // Use $sp ($1) because it's always 16-byte aligned and it's available:
   2184   SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
   2185                                 DAG.getRegister(SPU::R1, PtrVT),
   2186                                 DAG.getConstant(Offset, PtrVT));
   2187   // widen the mask when dealing with half vectors
   2188   EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
   2189                                 128/ VT.getVectorElementType().getSizeInBits());
   2190   SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
   2191 
   2192   SDValue result =
   2193     DAG.getNode(SPUISD::SHUFB, dl, VT,
   2194                 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
   2195                 VecOp,
   2196                 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));
   2197 
   2198   return result;
   2199 }
   2200 
   2201 static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
   2202                            const TargetLowering &TLI)
   2203 {
   2204   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
   2205   DebugLoc dl = Op.getDebugLoc();
   2206   EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
   2207 
   2208   assert(Op.getValueType() == MVT::i8);
   2209   switch (Opc) {
   2210   default:
   2211     llvm_unreachable("Unhandled i8 math operator");
   2212   case ISD::ADD: {
   2213     // 8-bit addition: Promote the arguments up to 16-bits and truncate
   2214     // the result:
   2215     SDValue N1 = Op.getOperand(1);
   2216     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2217     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2218     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2219                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2220 
   2221   }
   2222 
   2223   case ISD::SUB: {
   2224     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
   2225     // the result:
   2226     SDValue N1 = Op.getOperand(1);
   2227     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2228     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2229     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2230                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2231   }
   2232   case ISD::ROTR:
   2233   case ISD::ROTL: {
   2234     SDValue N1 = Op.getOperand(1);
   2235     EVT N1VT = N1.getValueType();
   2236 
   2237     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
   2238     if (!N1VT.bitsEq(ShiftVT)) {
   2239       unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
   2240                        ? ISD::ZERO_EXTEND
   2241                        : ISD::TRUNCATE;
   2242       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2243     }
   2244 
   2245     // Replicate lower 8-bits into upper 8:
   2246     SDValue ExpandArg =
   2247       DAG.getNode(ISD::OR, dl, MVT::i16, N0,
   2248                   DAG.getNode(ISD::SHL, dl, MVT::i16,
   2249                               N0, DAG.getConstant(8, MVT::i32)));
   2250 
   2251     // Truncate back down to i8
   2252     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2253                        DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
   2254   }
   2255   case ISD::SRL:
   2256   case ISD::SHL: {
   2257     SDValue N1 = Op.getOperand(1);
   2258     EVT N1VT = N1.getValueType();
   2259 
   2260     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
   2261     if (!N1VT.bitsEq(ShiftVT)) {
   2262       unsigned N1Opc = ISD::ZERO_EXTEND;
   2263 
   2264       if (N1.getValueType().bitsGT(ShiftVT))
   2265         N1Opc = ISD::TRUNCATE;
   2266 
   2267       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2268     }
   2269 
   2270     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2271                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2272   }
   2273   case ISD::SRA: {
   2274     SDValue N1 = Op.getOperand(1);
   2275     EVT N1VT = N1.getValueType();
   2276 
   2277     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2278     if (!N1VT.bitsEq(ShiftVT)) {
   2279       unsigned N1Opc = ISD::SIGN_EXTEND;
   2280 
   2281       if (N1VT.bitsGT(ShiftVT))
   2282         N1Opc = ISD::TRUNCATE;
   2283       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2284     }
   2285 
   2286     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2287                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2288   }
   2289   case ISD::MUL: {
   2290     SDValue N1 = Op.getOperand(1);
   2291 
   2292     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2293     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2294     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2295                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2296   }
   2297   }
   2298 }
   2299 
//! Lower byte immediate operations for v16i8 vectors:
/*!
 If one operand of a v16i8 logical operation (AND/OR/XOR) is a constant
 splat, rebuild it as a splat of target constants so that instruction
 selection can fold it into the immediate form of the instruction. If no
 such constant operand is found, the original (legal) operation is
 returned unchanged.
 */
static SDValue
LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
  SDValue ConstVec;                  // candidate constant operand
  SDValue Arg;                       // the other (non-constant) operand
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  // Assume operand 0 is the constant; peel a BITCAST if present, otherwise
  // swap and try operand 1 instead:
  ConstVec = Op.getOperand(0);
  Arg = Op.getOperand(1);
  if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
    if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
      ConstVec = ConstVec.getOperand(0);
    } else {
      ConstVec = Op.getOperand(1);
      Arg = Op.getOperand(0);
      if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
        ConstVec = ConstVec.getOperand(0);
      }
    }
  }

  if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
    BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
    assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");

    APInt APSplatBits, APSplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();

    // Only rewrite when the BUILD_VECTOR is a splat of at least
    // element-sized runs of the same value:
    if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                              HasAnyUndefs, minSplatBits)
        && minSplatBits <= SplatBitSize) {
      uint64_t SplatBits = APSplatBits.getZExtValue();
      // Rebuild the operand as 16 copies of the low byte, as target
      // constants so isel can use the immediate instruction form:
      SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);

      SmallVector<SDValue, 16> tcVec;
      tcVec.assign(16, tc);
      return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
                         DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
    }
  }

  // These operations (AND, OR, XOR) are legal, they just couldn't be custom
  // lowered.  Return the operation, rather than a null SDValue.
  return Op;
}
   2348 
   2349 //! Custom lowering for CTPOP (count population)
   2350 /*!
   2351   Custom lowering code that counts the number ones in the input
   2352   operand. SPU has such an instruction, but it counts the number of
   2353   ones per byte, which then have to be accumulated.
   2354 */
   2355 static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
   2356   EVT VT = Op.getValueType();
   2357   EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
   2358                                VT, (128 / VT.getSizeInBits()));
   2359   DebugLoc dl = Op.getDebugLoc();
   2360 
   2361   switch (VT.getSimpleVT().SimpleTy) {
   2362   default: llvm_unreachable("Invalid value type!");
   2363   case MVT::i8: {
   2364     SDValue N = Op.getOperand(0);
   2365     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
   2366 
   2367     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2368     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2369 
   2370     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
   2371   }
   2372 
   2373   case MVT::i16: {
   2374     MachineFunction &MF = DAG.getMachineFunction();
   2375     MachineRegisterInfo &RegInfo = MF.getRegInfo();
   2376 
   2377     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
   2378 
   2379     SDValue N = Op.getOperand(0);
   2380     SDValue Elt0 = DAG.getConstant(0, MVT::i16);
   2381     SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
   2382     SDValue Shift1 = DAG.getConstant(8, MVT::i32);
   2383 
   2384     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2385     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2386 
   2387     // CNTB_result becomes the chain to which all of the virtual registers
   2388     // CNTB_reg, SUM1_reg become associated:
   2389     SDValue CNTB_result =
   2390       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
   2391 
   2392     SDValue CNTB_rescopy =
   2393       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
   2394 
   2395     SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
   2396 
   2397     return DAG.getNode(ISD::AND, dl, MVT::i16,
   2398                        DAG.getNode(ISD::ADD, dl, MVT::i16,
   2399                                    DAG.getNode(ISD::SRL, dl, MVT::i16,
   2400                                                Tmp1, Shift1),
   2401                                    Tmp1),
   2402                        Mask0);
   2403   }
   2404 
   2405   case MVT::i32: {
   2406     MachineFunction &MF = DAG.getMachineFunction();
   2407     MachineRegisterInfo &RegInfo = MF.getRegInfo();
   2408 
   2409     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
   2410     unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
   2411 
   2412     SDValue N = Op.getOperand(0);
   2413     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
   2414     SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
   2415     SDValue Shift1 = DAG.getConstant(16, MVT::i32);
   2416     SDValue Shift2 = DAG.getConstant(8, MVT::i32);
   2417 
   2418     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2419     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2420 
   2421     // CNTB_result becomes the chain to which all of the virtual registers
   2422     // CNTB_reg, SUM1_reg become associated:
   2423     SDValue CNTB_result =
   2424       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
   2425 
   2426     SDValue CNTB_rescopy =
   2427       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
   2428 
   2429     SDValue Comp1 =
   2430       DAG.getNode(ISD::SRL, dl, MVT::i32,
   2431                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
   2432                   Shift1);
   2433 
   2434     SDValue Sum1 =
   2435       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
   2436                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
   2437 
   2438     SDValue Sum1_rescopy =
   2439       DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
   2440 
   2441     SDValue Comp2 =
   2442       DAG.getNode(ISD::SRL, dl, MVT::i32,
   2443                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
   2444                   Shift2);
   2445     SDValue Sum2 =
   2446       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
   2447                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
   2448 
   2449     return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
   2450   }
   2451 
   2452   case MVT::i64:
   2453     break;
   2454   }
   2455 
   2456   return SDValue();
   2457 }
   2458 
   2459 //! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
   2460 /*!
   2461  f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
   2462  All conversions to i64 are expanded to a libcall.
   2463  */
   2464 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   2465                               const SPUTargetLowering &TLI) {
   2466   EVT OpVT = Op.getValueType();
   2467   SDValue Op0 = Op.getOperand(0);
   2468   EVT Op0VT = Op0.getValueType();
   2469 
   2470   if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
   2471       || OpVT == MVT::i64) {
   2472     // Convert f32 / f64 to i32 / i64 via libcall.
   2473     RTLIB::Libcall LC =
   2474             (Op.getOpcode() == ISD::FP_TO_SINT)
   2475              ? RTLIB::getFPTOSINT(Op0VT, OpVT)
   2476              : RTLIB::getFPTOUINT(Op0VT, OpVT);
   2477     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
   2478     SDValue Dummy;
   2479     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
   2480   }
   2481 
   2482   return Op;
   2483 }
   2484 
   2485 //! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
   2486 /*!
   2487  i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
   2488  All conversions from i64 are expanded to a libcall.
   2489  */
   2490 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
   2491                               const SPUTargetLowering &TLI) {
   2492   EVT OpVT = Op.getValueType();
   2493   SDValue Op0 = Op.getOperand(0);
   2494   EVT Op0VT = Op0.getValueType();
   2495 
   2496   if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
   2497       || Op0VT == MVT::i64) {
   2498     // Convert i32, i64 to f64 via libcall:
   2499     RTLIB::Libcall LC =
   2500             (Op.getOpcode() == ISD::SINT_TO_FP)
   2501              ? RTLIB::getSINTTOFP(Op0VT, OpVT)
   2502              : RTLIB::getUINTTOFP(Op0VT, OpVT);
   2503     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
   2504     SDValue Dummy;
   2505     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
   2506   }
   2507 
   2508   return Op;
   2509 }
   2510 
   2511 //! Lower ISD::SETCC
   2512 /*!
   2513  This handles MVT::f64 (double floating point) condition lowering
   2514  */
   2515 static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
   2516                           const TargetLowering &TLI) {
   2517   CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
   2518   DebugLoc dl = Op.getDebugLoc();
   2519   assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
   2520 
   2521   SDValue lhs = Op.getOperand(0);
   2522   SDValue rhs = Op.getOperand(1);
   2523   EVT lhsVT = lhs.getValueType();
   2524   assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
   2525 
   2526   EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
   2527   APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
   2528   EVT IntVT(MVT::i64);
   2529 
   2530   // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
   2531   // selected to a NOP:
   2532   SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
   2533   SDValue lhsHi32 =
   2534           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   2535                       DAG.getNode(ISD::SRL, dl, IntVT,
   2536                                   i64lhs, DAG.getConstant(32, MVT::i32)));
   2537   SDValue lhsHi32abs =
   2538           DAG.getNode(ISD::AND, dl, MVT::i32,
   2539                       lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
   2540   SDValue lhsLo32 =
   2541           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
   2542 
   2543   // SETO and SETUO only use the lhs operand:
   2544   if (CC->get() == ISD::SETO) {
   2545     // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
   2546     // SETUO
   2547     APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
   2548     return DAG.getNode(ISD::XOR, dl, ccResultVT,
   2549                        DAG.getSetCC(dl, ccResultVT,
   2550                                     lhs, DAG.getConstantFP(0.0, lhsVT),
   2551                                     ISD::SETUO),
   2552                        DAG.getConstant(ccResultAllOnes, ccResultVT));
   2553   } else if (CC->get() == ISD::SETUO) {
   2554     // Evaluates to true if Op0 is [SQ]NaN
   2555     return DAG.getNode(ISD::AND, dl, ccResultVT,
   2556                        DAG.getSetCC(dl, ccResultVT,
   2557                                     lhsHi32abs,
   2558                                     DAG.getConstant(0x7ff00000, MVT::i32),
   2559                                     ISD::SETGE),
   2560                        DAG.getSetCC(dl, ccResultVT,
   2561                                     lhsLo32,
   2562                                     DAG.getConstant(0, MVT::i32),
   2563                                     ISD::SETGT));
   2564   }
   2565 
   2566   SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
   2567   SDValue rhsHi32 =
   2568           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   2569                       DAG.getNode(ISD::SRL, dl, IntVT,
   2570                                   i64rhs, DAG.getConstant(32, MVT::i32)));
   2571 
   2572   // If a value is negative, subtract from the sign magnitude constant:
   2573   SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
   2574 
   2575   // Convert the sign-magnitude representation into 2's complement:
   2576   SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
   2577                                       lhsHi32, DAG.getConstant(31, MVT::i32));
   2578   SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
   2579   SDValue lhsSelect =
   2580           DAG.getNode(ISD::SELECT, dl, IntVT,
   2581                       lhsSelectMask, lhsSignMag2TC, i64lhs);
   2582 
   2583   SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
   2584                                       rhsHi32, DAG.getConstant(31, MVT::i32));
   2585   SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
   2586   SDValue rhsSelect =
   2587           DAG.getNode(ISD::SELECT, dl, IntVT,
   2588                       rhsSelectMask, rhsSignMag2TC, i64rhs);
   2589 
   2590   unsigned compareOp;
   2591 
   2592   switch (CC->get()) {
   2593   case ISD::SETOEQ:
   2594   case ISD::SETUEQ:
   2595     compareOp = ISD::SETEQ; break;
   2596   case ISD::SETOGT:
   2597   case ISD::SETUGT:
   2598     compareOp = ISD::SETGT; break;
   2599   case ISD::SETOGE:
   2600   case ISD::SETUGE:
   2601     compareOp = ISD::SETGE; break;
   2602   case ISD::SETOLT:
   2603   case ISD::SETULT:
   2604     compareOp = ISD::SETLT; break;
   2605   case ISD::SETOLE:
   2606   case ISD::SETULE:
   2607     compareOp = ISD::SETLE; break;
   2608   case ISD::SETUNE:
   2609   case ISD::SETONE:
   2610     compareOp = ISD::SETNE; break;
   2611   default:
   2612     report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
   2613   }
   2614 
   2615   SDValue result =
   2616           DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
   2617                        (ISD::CondCode) compareOp);
   2618 
   2619   if ((CC->get() & 0x8) == 0) {
   2620     // Ordered comparison:
   2621     SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
   2622                                   lhs, DAG.getConstantFP(0.0, MVT::f64),
   2623                                   ISD::SETO);
   2624     SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
   2625                                   rhs, DAG.getConstantFP(0.0, MVT::f64),
   2626                                   ISD::SETO);
   2627     SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
   2628 
   2629     result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
   2630   }
   2631 
   2632   return result;
   2633 }
   2634 
   2635 //! Lower ISD::SELECT_CC
   2636 /*!
   2637   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
   2638   SELB instruction.
   2639 
   2640   \note Need to revisit this in the future: if the code path through the true
   2641   and false value computations is longer than the latency of a branch (6
   2642   cycles), then it would be more advantageous to branch and insert a new basic
   2643   block and branch on the condition. However, this code does not make that
   2644   assumption, given the simplisitc uses so far.
   2645  */
   2646 
   2647 static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
   2648                               const TargetLowering &TLI) {
   2649   EVT VT = Op.getValueType();
   2650   SDValue lhs = Op.getOperand(0);
   2651   SDValue rhs = Op.getOperand(1);
   2652   SDValue trueval = Op.getOperand(2);
   2653   SDValue falseval = Op.getOperand(3);
   2654   SDValue condition = Op.getOperand(4);
   2655   DebugLoc dl = Op.getDebugLoc();
   2656 
   2657   // NOTE: SELB's arguments: $rA, $rB, $mask
   2658   //
   2659   // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
   2660   // where bits in $mask are 1. CCond will be inverted, having 1s where the
   2661   // condition was true and 0s where the condition was false. Hence, the
   2662   // arguments to SELB get reversed.
   2663 
   2664   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
   2665   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
   2666   // with another "cannot select select_cc" assert:
   2667 
   2668   SDValue compare = DAG.getNode(ISD::SETCC, dl,
   2669                                 TLI.getSetCCResultType(Op.getValueType()),
   2670                                 lhs, rhs, condition);
   2671   return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
   2672 }
   2673 
   2674 //! Custom lower ISD::TRUNCATE
   2675 static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
   2676 {
   2677   // Type to truncate to
   2678   EVT VT = Op.getValueType();
   2679   MVT simpleVT = VT.getSimpleVT();
   2680   EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
   2681                                VT, (128 / VT.getSizeInBits()));
   2682   DebugLoc dl = Op.getDebugLoc();
   2683 
   2684   // Type to truncate from
   2685   SDValue Op0 = Op.getOperand(0);
   2686   EVT Op0VT = Op0.getValueType();
   2687 
   2688   if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
   2689     // Create shuffle mask, least significant doubleword of quadword
   2690     unsigned maskHigh = 0x08090a0b;
   2691     unsigned maskLow = 0x0c0d0e0f;
   2692     // Use a shuffle to perform the truncation
   2693     SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2694                                    DAG.getConstant(maskHigh, MVT::i32),
   2695                                    DAG.getConstant(maskLow, MVT::i32),
   2696                                    DAG.getConstant(maskHigh, MVT::i32),
   2697                                    DAG.getConstant(maskLow, MVT::i32));
   2698 
   2699     SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
   2700                                        Op0, Op0, shufMask);
   2701 
   2702     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
   2703   }
   2704 
   2705   return SDValue();             // Leave the truncate unmolested
   2706 }
   2707 
   2708 /*!
   2709  * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
   2710  * algorithm is to duplicate the sign bit using rotmai to generate at
   2711  * least one byte full of sign bits. Then propagate the "sign-byte" into
   2712  * the leftmost words and the i64/i32 into the rightmost words using shufb.
   2713  *
   2714  * @param Op The sext operand
   2715  * @param DAG The current DAG
   2716  * @return The SDValue with the entire instruction sequence
   2717  */
static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
{
  DebugLoc dl = Op.getDebugLoc();

  // Type to extend to (asserted below to be i128)
  MVT OpVT = Op.getValueType().getSimpleVT();

  // Type to extend from
  SDValue Op0 = Op.getOperand(0);
  MVT Op0VT = Op0.getValueType().getSimpleVT();

  // extend i8 & i16 via i32 first, so only the i32 and i64 source cases
  // need to be handled below
  if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
    Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
    Op0VT = MVT::i32;
  }

  // The type to extend to needs to be a i128 and
  // the type to extend from needs to be i64 or i32.
  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
          "LowerSIGN_EXTEND: input and/or output operand have wrong size");
  (void)OpVT;

  // Create shuffle mask. Byte selector 0x10 reads from the "sign" input of
  // SHUFB (the second operand, sraVal below); selectors 0x00-0x07 copy the
  // original value from the first operand.
  unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7: all sign bytes
  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask2, MVT::i32),
                                 DAG.getConstant(mask3, MVT::i32));

  // Word wise arithmetic right shift (by 31) to generate at least one byte
  // that contains sign bits.
  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
  SDValue sraVal = DAG.getNode(ISD::SRA,
                 dl,
                 mvt,
                 DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
                 DAG.getConstant(31, MVT::i32));

  // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
  // NOTE: the COPY_TO_REGCLASS node is created with the scalar Op0VT; the
  // GPRC register class is what makes it usable as a full quadword input.
  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                        dl, Op0VT, Op0,
                                        DAG.getTargetConstant(
                                                  SPU::GPRCRegClass.getID(),
                                                  MVT::i32)), 0);
  // Shuffle bytes - Copy the sign bits into the upper 64 bits
  // and the input value into the lower 64 bits.
  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
        extended, sraVal, shufMask);
  return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
}
   2772 
   2773 //! Custom (target-specific) lowering entry point
   2774 /*!
   2775   This is where LLVM's DAG selection process calls to do target-specific
   2776   lowering of nodes.
   2777  */
   2778 SDValue
   2779 SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
   2780 {
   2781   unsigned Opc = (unsigned) Op.getOpcode();
   2782   EVT VT = Op.getValueType();
   2783 
   2784   switch (Opc) {
   2785   default: {
   2786 #ifndef NDEBUG
   2787     errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
   2788     errs() << "Op.getOpcode() = " << Opc << "\n";
   2789     errs() << "*Op.getNode():\n";
   2790     Op.getNode()->dump();
   2791 #endif
   2792     llvm_unreachable(0);
   2793   }
   2794   case ISD::LOAD:
   2795   case ISD::EXTLOAD:
   2796   case ISD::SEXTLOAD:
   2797   case ISD::ZEXTLOAD:
   2798     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
   2799   case ISD::STORE:
   2800     return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
   2801   case ISD::ConstantPool:
   2802     return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
   2803   case ISD::GlobalAddress:
   2804     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
   2805   case ISD::JumpTable:
   2806     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
   2807   case ISD::ConstantFP:
   2808     return LowerConstantFP(Op, DAG);
   2809 
   2810   // i8, i64 math ops:
   2811   case ISD::ADD:
   2812   case ISD::SUB:
   2813   case ISD::ROTR:
   2814   case ISD::ROTL:
   2815   case ISD::SRL:
   2816   case ISD::SHL:
   2817   case ISD::SRA: {
   2818     if (VT == MVT::i8)
   2819       return LowerI8Math(Op, DAG, Opc, *this);
   2820     break;
   2821   }
   2822 
   2823   case ISD::FP_TO_SINT:
   2824   case ISD::FP_TO_UINT:
   2825     return LowerFP_TO_INT(Op, DAG, *this);
   2826 
   2827   case ISD::SINT_TO_FP:
   2828   case ISD::UINT_TO_FP:
   2829     return LowerINT_TO_FP(Op, DAG, *this);
   2830 
   2831   // Vector-related lowering.
   2832   case ISD::BUILD_VECTOR:
   2833     return LowerBUILD_VECTOR(Op, DAG);
   2834   case ISD::SCALAR_TO_VECTOR:
   2835     return LowerSCALAR_TO_VECTOR(Op, DAG);
   2836   case ISD::VECTOR_SHUFFLE:
   2837     return LowerVECTOR_SHUFFLE(Op, DAG);
   2838   case ISD::EXTRACT_VECTOR_ELT:
   2839     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   2840   case ISD::INSERT_VECTOR_ELT:
   2841     return LowerINSERT_VECTOR_ELT(Op, DAG);
   2842 
   2843   // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
   2844   case ISD::AND:
   2845   case ISD::OR:
   2846   case ISD::XOR:
   2847     return LowerByteImmed(Op, DAG);
   2848 
   2849   // Vector and i8 multiply:
   2850   case ISD::MUL:
   2851     if (VT == MVT::i8)
   2852       return LowerI8Math(Op, DAG, Opc, *this);
   2853 
   2854   case ISD::CTPOP:
   2855     return LowerCTPOP(Op, DAG);
   2856 
   2857   case ISD::SELECT_CC:
   2858     return LowerSELECT_CC(Op, DAG, *this);
   2859 
   2860   case ISD::SETCC:
   2861     return LowerSETCC(Op, DAG, *this);
   2862 
   2863   case ISD::TRUNCATE:
   2864     return LowerTRUNCATE(Op, DAG);
   2865 
   2866   case ISD::SIGN_EXTEND:
   2867     return LowerSIGN_EXTEND(Op, DAG);
   2868   }
   2869 
   2870   return SDValue();
   2871 }
   2872 
   2873 void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
   2874                                            SmallVectorImpl<SDValue>&Results,
   2875                                            SelectionDAG &DAG) const
   2876 {
   2877 #if 0
   2878   unsigned Opc = (unsigned) N->getOpcode();
   2879   EVT OpVT = N->getValueType(0);
   2880 
   2881   switch (Opc) {
   2882   default: {
   2883     errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
   2884     errs() << "Op.getOpcode() = " << Opc << "\n";
   2885     errs() << "*Op.getNode():\n";
   2886     N->dump();
   2887     abort();
   2888     /*NOTREACHED*/
   2889   }
   2890   }
   2891 #endif
   2892 
   2893   /* Otherwise, return unchanged */
   2894 }
   2895 
   2896 //===----------------------------------------------------------------------===//
   2897 // Target Optimization Hooks
   2898 //===----------------------------------------------------------------------===//
   2899 
//! Perform target-specific DAG combines on SPU nodes.
/*!
  Folds constant-offset arithmetic into SPUISD::IndirectAddr, removes
  degenerate extends/shifts, and cancels PREFSLOT2VEC/VEC2PREFSLOT pairs.
  Returns an empty SDValue when no combine applies.
 */
SDValue
SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
{
#if 0
  TargetMachine &TM = getTargetMachine();
#endif
  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
  EVT NodeVT = N->getValueType(0);      // The node's value type
  EVT Op0VT = Op0.getValueType();       // The first operand's result
  SDValue Result;                       // Initially, empty result
  DebugLoc dl = N->getDebugLoc();

  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD: {
    // Fold additions of a constant into an IndirectAddr operand.
    SDValue Op1 = N->getOperand(1);

    if (Op0.getOpcode() == SPUISD::IndirectAddr
        || Op1.getOpcode() == SPUISD::IndirectAddr) {
      // Normalize the operands to reduce repeated code
      SDValue IndirectArg = Op0, AddArg = Op1;

      if (Op1.getOpcode() == SPUISD::IndirectAddr) {
        IndirectArg = Op1;
        AddArg = Op0;
      }

      if (isa<ConstantSDNode>(AddArg)) {
        ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
        SDValue IndOp1 = IndirectArg.getOperand(1);

        if (CN0->isNullValue()) {
          // (add (SPUindirect <arg>, <arg>), 0) ->
          // (SPUindirect <arg>, <arg>)

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return IndirectArg;
        } else if (isa<ConstantSDNode>(IndOp1)) {
          // (add (SPUindirect <arg>, <const>), <const>) ->
          // (SPUindirect <arg>, <const + const>)
          ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
          int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
          SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
                 << "), " << CN0->getSExtValue() << ")\n"
                 << "With:    (SPUindirect <arg>, "
                 << combinedConst << ")\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             IndirectArg, combinedValue);
        }
      }
    }
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: {
    // Extending a VEC2PREFSLOT to the type it already has is a no-op.
    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
      // (any_extend (SPUextract_elt0 <arg>)) ->
      // (SPUextract_elt0 <arg>)
      // Types must match, however...
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "\nReplace: ";
        N->dump(&DAG);
        errs() << "\nWith:    ";
        Op0.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

      return Op0;
    }
    break;
  }
  case SPUISD::IndirectAddr: {
    // Simplify IndirectAddr nodes with a zero offset.
    if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
      if (CN != 0 && CN->isNullValue()) {
        // (SPUindirect (SPUaform <addr>, 0), 0) ->
        // (SPUaform <addr>, 0)

        DEBUG(errs() << "Replace: ");
        DEBUG(N->dump(&DAG));
        DEBUG(errs() << "\nWith:    ");
        DEBUG(Op0.getNode()->dump(&DAG));
        DEBUG(errs() << "\n");

        return Op0;
      }
    } else if (Op0.getOpcode() == ISD::ADD) {
      SDValue Op1 = N->getOperand(1);
      if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
        // (SPUindirect (add <arg>, <arg>), 0) ->
        // (SPUindirect <arg>, <arg>)
        if (CN1->isNullValue()) {

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             Op0.getOperand(0), Op0.getOperand(1));
        }
      }
    }
    break;
  }
  case SPUISD::SHL_BITS:
  case SPUISD::SHL_BYTES:
  case SPUISD::ROTBYTES_LEFT: {
    SDValue Op1 = N->getOperand(1);

    // Kill degenerate vector shifts (shift amount of zero):
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
      if (CN->isNullValue()) {
        Result = Op0;
      }
    }
    break;
  }
  case SPUISD::PREFSLOT2VEC: {
    // Cancel PREFSLOT2VEC against a matching VEC2PREFSLOT, optionally
    // through an extend node of the same type.
    switch (Op0.getOpcode()) {
    default:
      break;
    case ISD::ANY_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND: {
      // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
      // <arg>
      // but only if the SPUprefslot2vec and <arg> types match.
      SDValue Op00 = Op0.getOperand(0);
      if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
        SDValue Op000 = Op00.getOperand(0);
        if (Op000.getValueType() == NodeVT) {
          Result = Op000;
        }
      }
      break;
    }
    case SPUISD::VEC2PREFSLOT: {
      // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
      // <arg>
      Result = Op0.getOperand(0);
      break;
    }
    }
    break;
  }
  }

  // Otherwise, return unchanged.
#ifndef NDEBUG
  if (Result.getNode()) {
    DEBUG(errs() << "\nReplace.SPU: ");
    DEBUG(N->dump(&DAG));
    DEBUG(errs() << "\nWith:        ");
    DEBUG(Result.getNode()->dump(&DAG));
    DEBUG(errs() << "\n");
  }
#endif

  return Result;
}
   3084 
   3085 //===----------------------------------------------------------------------===//
   3086 // Inline Assembly Support
   3087 //===----------------------------------------------------------------------===//
   3088 
   3089 /// getConstraintType - Given a constraint letter, return the type of
   3090 /// constraint it is for this target.
   3091 SPUTargetLowering::ConstraintType
   3092 SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
   3093   if (ConstraintLetter.size() == 1) {
   3094     switch (ConstraintLetter[0]) {
   3095     default: break;
   3096     case 'b':
   3097     case 'r':
   3098     case 'f':
   3099     case 'v':
   3100     case 'y':
   3101       return C_RegisterClass;
   3102     }
   3103   }
   3104   return TargetLowering::getConstraintType(ConstraintLetter);
   3105 }
   3106 
   3107 /// Examine constraint type and operand type and determine a weight value.
   3108 /// This object must already have been set up with the operand type
   3109 /// and the current alternative constraint selected.
   3110 TargetLowering::ConstraintWeight
   3111 SPUTargetLowering::getSingleConstraintMatchWeight(
   3112     AsmOperandInfo &info, const char *constraint) const {
   3113   ConstraintWeight weight = CW_Invalid;
   3114   Value *CallOperandVal = info.CallOperandVal;
   3115     // If we don't have a value, we can't do a match,
   3116     // but allow it at the lowest weight.
   3117   if (CallOperandVal == NULL)
   3118     return CW_Default;
   3119   // Look at the constraint type.
   3120   switch (*constraint) {
   3121   default:
   3122     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   3123     break;
   3124     //FIXME: Seems like the supported constraint letters were just copied
   3125     // from PPC, as the following doesn't correspond to the GCC docs.
   3126     // I'm leaving it so until someone adds the corresponding lowering support.
   3127   case 'b':
   3128   case 'r':
   3129   case 'f':
   3130   case 'd':
   3131   case 'v':
   3132   case 'y':
   3133     weight = CW_Register;
   3134     break;
   3135   }
   3136   return weight;
   3137 }
   3138 
   3139 std::pair<unsigned, const TargetRegisterClass*>
   3140 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   3141                                                 EVT VT) const
   3142 {
   3143   if (Constraint.size() == 1) {
   3144     // GCC RS6000 Constraint Letters
   3145     switch (Constraint[0]) {
   3146     case 'b':   // R1-R31
   3147     case 'r':   // R0-R31
   3148       if (VT == MVT::i64)
   3149         return std::make_pair(0U, &SPU::R64CRegClass);
   3150       return std::make_pair(0U, &SPU::R32CRegClass);
   3151     case 'f':
   3152       if (VT == MVT::f32)
   3153         return std::make_pair(0U, &SPU::R32FPRegClass);
   3154       if (VT == MVT::f64)
   3155         return std::make_pair(0U, &SPU::R64FPRegClass);
   3156       break;
   3157     case 'v':
   3158       return std::make_pair(0U, &SPU::GPRCRegClass);
   3159     }
   3160   }
   3161 
   3162   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
   3163 }
   3164 
   3165 //! Compute used/known bits for a SPU operand
   3166 void
   3167 SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   3168                                                   APInt &KnownZero,
   3169                                                   APInt &KnownOne,
   3170                                                   const SelectionDAG &DAG,
   3171                                                   unsigned Depth ) const {
   3172 #if 0
   3173   const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
   3174 
   3175   switch (Op.getOpcode()) {
   3176   default:
   3177     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
   3178     break;
   3179   case CALL:
   3180   case SHUFB:
   3181   case SHUFFLE_MASK:
   3182   case CNTB:
   3183   case SPUISD::PREFSLOT2VEC:
   3184   case SPUISD::LDRESULT:
   3185   case SPUISD::VEC2PREFSLOT:
   3186   case SPUISD::SHLQUAD_L_BITS:
   3187   case SPUISD::SHLQUAD_L_BYTES:
   3188   case SPUISD::VEC_ROTL:
   3189   case SPUISD::VEC_ROTR:
   3190   case SPUISD::ROTBYTES_LEFT:
   3191   case SPUISD::SELECT_MASK:
   3192   case SPUISD::SELB:
   3193   }
   3194 #endif
   3195 }
   3196 
   3197 unsigned
   3198 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
   3199                                                    unsigned Depth) const {
   3200   switch (Op.getOpcode()) {
   3201   default:
   3202     return 1;
   3203 
   3204   case ISD::SETCC: {
   3205     EVT VT = Op.getValueType();
   3206 
   3207     if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
   3208       VT = MVT::i32;
   3209     }
   3210     return VT.getSizeInBits();
   3211   }
   3212   }
   3213 }
   3214 
   3215 // LowerAsmOperandForConstraint
void
SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                std::string &Constraint,
                                                std::vector<SDValue> &Ops,
                                                SelectionDAG &DAG) const {
  // No SPU-specific inline-asm operand lowering exists; default, for the
  // time being, to the base class handler.
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
   3224 
   3225 /// isLegalAddressImmediate - Return true if the integer value can be used
   3226 /// as the offset of the target addressing mode.
   3227 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
   3228                                                 Type *Ty) const {
   3229   // SPU's addresses are 256K:
   3230   return (V > -(1 << 18) && V < (1 << 18) - 1);
   3231 }
   3232 
bool SPUTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const {
  // A global value is never usable directly as an address immediate here.
  return false;
}
   3236 
bool
SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The SPU target isn't yet aware of offsets, so folding a constant offset
  // into a global address node is never legal.
  return false;
}
   3242 
   3243 // can we compare to Imm without writing it into a register?
bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // ceqi, cgti, etc. all take a signed 10-bit immediate operand, so any
  // value representable as s10 can be compared against without first
  // materializing it in a register.
  return isInt<10>(Imm);
}
   3248 
   3249 bool
   3250 SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
   3251                                          Type * ) const{
   3252 
   3253   // A-form: 18bit absolute address.
   3254   if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
   3255     return true;
   3256 
   3257   // D-form: reg + 14bit offset
   3258   if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
   3259     return true;
   3260 
   3261   // X-form: reg+reg
   3262   if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
   3263     return true;
   3264 
   3265   return false;
   3266 }
   3267