Home | History | Annotate | Download | only in CellSPU
      1 //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
      2 //                     The LLVM Compiler Infrastructure
      3 //
      4 // This file is distributed under the University of Illinois Open Source
      5 // License. See LICENSE.TXT for details.
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the SPUTargetLowering class.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "SPUISelLowering.h"
     14 #include "SPUTargetMachine.h"
     15 #include "SPUFrameLowering.h"
     16 #include "SPUMachineFunction.h"
     17 #include "llvm/Constants.h"
     18 #include "llvm/Function.h"
     19 #include "llvm/Intrinsics.h"
     20 #include "llvm/CallingConv.h"
     21 #include "llvm/Type.h"
     22 #include "llvm/CodeGen/CallingConvLower.h"
     23 #include "llvm/CodeGen/MachineFrameInfo.h"
     24 #include "llvm/CodeGen/MachineFunction.h"
     25 #include "llvm/CodeGen/MachineInstrBuilder.h"
     26 #include "llvm/CodeGen/MachineRegisterInfo.h"
     27 #include "llvm/CodeGen/SelectionDAG.h"
     28 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
     29 #include "llvm/Target/TargetOptions.h"
     30 #include "llvm/ADT/VectorExtras.h"
     31 #include "llvm/Support/Debug.h"
     32 #include "llvm/Support/ErrorHandling.h"
     33 #include "llvm/Support/MathExtras.h"
     34 #include "llvm/Support/raw_ostream.h"
     35 #include <map>
     36 
     37 using namespace llvm;
     38 
     39 // Used in getTargetNodeName() below
     40 namespace {
     41   std::map<unsigned, const char *> node_names;
     42 
     43   // Byte offset of the preferred slot (counted from the MSB)
     44   int prefslotOffset(EVT VT) {
     45     int retval=0;
     46     if (VT==MVT::i1) retval=3;
     47     if (VT==MVT::i8) retval=3;
     48     if (VT==MVT::i16) retval=2;
     49 
     50     return retval;
     51   }
     52 
     53   //! Expand a library call into an actual call DAG node
     54   /*!
     55    \note
     56    This code is taken from SelectionDAGLegalize, since it is not exposed as
     57    part of the LLVM SelectionDAG API.
     58    */
     59 
     60   SDValue
     61   ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
     62                 bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
     63     // The input chain to this libcall is the entry node of the function.
     64     // Legalizing the call will automatically add the previous call to the
     65     // dependence.
     66     SDValue InChain = DAG.getEntryNode();
     67 
     68     TargetLowering::ArgListTy Args;
     69     TargetLowering::ArgListEntry Entry;
     70     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
     71       EVT ArgVT = Op.getOperand(i).getValueType();
     72       Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     73       Entry.Node = Op.getOperand(i);
     74       Entry.Ty = ArgTy;
     75       Entry.isSExt = isSigned;
     76       Entry.isZExt = !isSigned;
     77       Args.push_back(Entry);
     78     }
     79     SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
     80                                            TLI.getPointerTy());
     81 
     82     // Splice the libcall in wherever FindInputOutputChains tells us to.
     83     Type *RetTy =
     84                 Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
     85     std::pair<SDValue, SDValue> CallInfo =
     86             TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
     87                             0, TLI.getLibcallCallingConv(LC), false,
     88                             /*isReturnValueUsed=*/true,
     89                             Callee, Args, DAG, Op.getDebugLoc());
     90 
     91     return CallInfo.first;
     92   }
     93 }
     94 
     95 SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
     96   : TargetLowering(TM, new TargetLoweringObjectFileELF()),
     97     SPUTM(TM) {
     98 
     99   // Use _setjmp/_longjmp instead of setjmp/longjmp.
    100   setUseUnderscoreSetJmp(true);
    101   setUseUnderscoreLongJmp(true);
    102 
    103   // Set RTLIB libcall names as used by SPU:
    104   setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
    105 
    106   // Set up the SPU's register classes:
    107   addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
    108   addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
    109   addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
    110   addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
    111   addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
    112   addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
    113   addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
    114 
    115   // SPU has no sign or zero extended loads for i1, i8, i16:
    116   setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
    117   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
    118   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
    119 
    120   setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
    121   setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
    122 
    123   setTruncStoreAction(MVT::i128, MVT::i64, Expand);
    124   setTruncStoreAction(MVT::i128, MVT::i32, Expand);
    125   setTruncStoreAction(MVT::i128, MVT::i16, Expand);
    126   setTruncStoreAction(MVT::i128, MVT::i8, Expand);
    127 
    128   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    129 
    130   // SPU constant load actions are custom lowered:
    131   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    132   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
    133 
    134   // SPU's loads and stores have to be custom lowered:
    135   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
    136        ++sctype) {
    137     MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
    138 
    139     setOperationAction(ISD::LOAD,   VT, Custom);
    140     setOperationAction(ISD::STORE,  VT, Custom);
    141     setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
    142     setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
    143     setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
    144 
    145     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
    146       MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
    147       setTruncStoreAction(VT, StoreVT, Expand);
    148     }
    149   }
    150 
    151   for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
    152        ++sctype) {
    153     MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;
    154 
    155     setOperationAction(ISD::LOAD,   VT, Custom);
    156     setOperationAction(ISD::STORE,  VT, Custom);
    157 
    158     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
    159       MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
    160       setTruncStoreAction(VT, StoreVT, Expand);
    161     }
    162   }
    163 
    164   // Expand the jumptable branches
    165   setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
    166   setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
    167 
    168   // Custom lower SELECT_CC for most cases, but expand by default
    169   setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
    170   setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
    171   setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
    172   setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
    173   setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
    174 
    175   // SPU has no intrinsics for these particular operations:
    176   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
    177 
    178   // SPU has no division/remainder instructions
    179   setOperationAction(ISD::SREM,    MVT::i8,   Expand);
    180   setOperationAction(ISD::UREM,    MVT::i8,   Expand);
    181   setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
    182   setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
    183   setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
    184   setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
    185   setOperationAction(ISD::SREM,    MVT::i16,  Expand);
    186   setOperationAction(ISD::UREM,    MVT::i16,  Expand);
    187   setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
    188   setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
    189   setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
    190   setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
    191   setOperationAction(ISD::SREM,    MVT::i32,  Expand);
    192   setOperationAction(ISD::UREM,    MVT::i32,  Expand);
    193   setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
    194   setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
    195   setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
    196   setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
    197   setOperationAction(ISD::SREM,    MVT::i64,  Expand);
    198   setOperationAction(ISD::UREM,    MVT::i64,  Expand);
    199   setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
    200   setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
    201   setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
    202   setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
    203   setOperationAction(ISD::SREM,    MVT::i128, Expand);
    204   setOperationAction(ISD::UREM,    MVT::i128, Expand);
    205   setOperationAction(ISD::SDIV,    MVT::i128, Expand);
    206   setOperationAction(ISD::UDIV,    MVT::i128, Expand);
    207   setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
    208   setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
    209 
    210   // We don't support sin/cos/sqrt/fmod
    211   setOperationAction(ISD::FSIN , MVT::f64, Expand);
    212   setOperationAction(ISD::FCOS , MVT::f64, Expand);
    213   setOperationAction(ISD::FREM , MVT::f64, Expand);
    214   setOperationAction(ISD::FSIN , MVT::f32, Expand);
    215   setOperationAction(ISD::FCOS , MVT::f32, Expand);
    216   setOperationAction(ISD::FREM , MVT::f32, Expand);
    217 
    218   // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
    219   // for f32!)
    220   setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    221   setOperationAction(ISD::FSQRT, MVT::f32, Expand);
    222 
    223   setOperationAction(ISD::FMA, MVT::f64, Expand);
    224   setOperationAction(ISD::FMA, MVT::f32, Expand);
    225 
    226   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    227   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    228 
    229   // SPU can do rotate right and left, so legalize it... but customize for i8
    230   // because instructions don't exist.
    231 
    232   // FIXME: Change from "expand" to appropriate type once ROTR is supported in
    233   //        .td files.
    234   setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
    235   setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
    236   setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);
    237 
    238   setOperationAction(ISD::ROTL, MVT::i32,    Legal);
    239   setOperationAction(ISD::ROTL, MVT::i16,    Legal);
    240   setOperationAction(ISD::ROTL, MVT::i8,     Custom);
    241 
    242   // SPU has no native version of shift left/right for i8
    243   setOperationAction(ISD::SHL,  MVT::i8,     Custom);
    244   setOperationAction(ISD::SRL,  MVT::i8,     Custom);
    245   setOperationAction(ISD::SRA,  MVT::i8,     Custom);
    246 
    247   // Make these operations legal and handle them during instruction selection:
    248   setOperationAction(ISD::SHL,  MVT::i64,    Legal);
    249   setOperationAction(ISD::SRL,  MVT::i64,    Legal);
    250   setOperationAction(ISD::SRA,  MVT::i64,    Legal);
    251 
    252   // Custom lower i8, i32 and i64 multiplications
    253   setOperationAction(ISD::MUL,  MVT::i8,     Custom);
    254   setOperationAction(ISD::MUL,  MVT::i32,    Legal);
    255   setOperationAction(ISD::MUL,  MVT::i64,    Legal);
    256 
    257   // Expand double-width multiplication
    258   // FIXME: It would probably be reasonable to support some of these operations
    259   setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
    260   setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
    261   setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
    262   setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
    263   setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
    264   setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
    265   setOperationAction(ISD::MULHU,     MVT::i16, Expand);
    266   setOperationAction(ISD::MULHS,     MVT::i16, Expand);
    267   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    268   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
    269   setOperationAction(ISD::MULHU,     MVT::i32, Expand);
    270   setOperationAction(ISD::MULHS,     MVT::i32, Expand);
    271   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
    272   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
    273   setOperationAction(ISD::MULHU,     MVT::i64, Expand);
    274   setOperationAction(ISD::MULHS,     MVT::i64, Expand);
    275 
    276   // Need to custom handle (some) common i8, i64 math ops
    277   setOperationAction(ISD::ADD,  MVT::i8,     Custom);
    278   setOperationAction(ISD::ADD,  MVT::i64,    Legal);
    279   setOperationAction(ISD::SUB,  MVT::i8,     Custom);
    280   setOperationAction(ISD::SUB,  MVT::i64,    Legal);
    281 
    282   // SPU does not have BSWAP. It does have i32 support CTLZ.
    283   // CTPOP has to be custom lowered.
    284   setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
    285   setOperationAction(ISD::BSWAP, MVT::i64,   Expand);
    286 
    287   setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
    288   setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
    289   setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
    290   setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
    291   setOperationAction(ISD::CTPOP, MVT::i128,  Expand);
    292 
    293   setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
    294   setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
    295   setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
    296   setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
    297   setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
    298 
    299   setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
    300   setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
    301   setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
    302   setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
    303   setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
    304 
    305   // SPU has a version of select that implements (a&~c)|(b&c), just like
    306   // select ought to work:
    307   setOperationAction(ISD::SELECT, MVT::i8,   Legal);
    308   setOperationAction(ISD::SELECT, MVT::i16,  Legal);
    309   setOperationAction(ISD::SELECT, MVT::i32,  Legal);
    310   setOperationAction(ISD::SELECT, MVT::i64,  Legal);
    311 
    312   setOperationAction(ISD::SETCC, MVT::i8,    Legal);
    313   setOperationAction(ISD::SETCC, MVT::i16,   Legal);
    314   setOperationAction(ISD::SETCC, MVT::i32,   Legal);
    315   setOperationAction(ISD::SETCC, MVT::i64,   Legal);
    316   setOperationAction(ISD::SETCC, MVT::f64,   Custom);
    317 
    318   // Custom lower i128 -> i64 truncates
    319   setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
    320 
    321   // Custom lower i32/i64 -> i128 sign extend
    322   setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);
    323 
    324   setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
    325   setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
    326   setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    327   setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    328   // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
    329   // to expand to a libcall, hence the custom lowering:
    330   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    331   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    332   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
    333   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    334   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
    335   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);
    336 
    337   // FDIV on SPU requires custom lowering
    338   setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall
    339 
    340   // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
    341   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    342   setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    343   setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
    344   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    345   setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
    346   setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
    347   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    348   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    349 
    350   setOperationAction(ISD::BITCAST, MVT::i32, Legal);
    351   setOperationAction(ISD::BITCAST, MVT::f32, Legal);
    352   setOperationAction(ISD::BITCAST, MVT::i64, Legal);
    353   setOperationAction(ISD::BITCAST, MVT::f64, Legal);
    354 
    355   // We cannot sextinreg(i1).  Expand to shifts.
    356   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
    357 
    358   // We want to legalize GlobalAddress and ConstantPool nodes into the
    359   // appropriate instructions to materialize the address.
    360   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
    361        ++sctype) {
    362     MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
    363 
    364     setOperationAction(ISD::GlobalAddress,  VT, Custom);
    365     setOperationAction(ISD::ConstantPool,   VT, Custom);
    366     setOperationAction(ISD::JumpTable,      VT, Custom);
    367   }
    368 
    369   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    370   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    371 
    372   // Use the default implementation.
    373   setOperationAction(ISD::VAARG             , MVT::Other, Expand);
    374   setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
    375   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    376   setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
    377   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
    378   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
    379   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);
    380 
    381   // Cell SPU has instructions for converting between i64 and fp.
    382   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    383   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    384 
    385   // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
    386   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
    387 
    388   // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
    389   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
    390 
    391   // First set operation action for all vector types to expand. Then we
    392   // will selectively turn on ones that can be effectively codegen'd.
    393   addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
    394   addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
    395   addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
    396   addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
    397   addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
    398   addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
    399 
    400   for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
    401        i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
    402     MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
    403 
    404     // add/sub are legal for all supported vector VT's.
    405     setOperationAction(ISD::ADD,     VT, Legal);
    406     setOperationAction(ISD::SUB,     VT, Legal);
    407     // mul has to be custom lowered.
    408     setOperationAction(ISD::MUL,     VT, Legal);
    409 
    410     setOperationAction(ISD::AND,     VT, Legal);
    411     setOperationAction(ISD::OR,      VT, Legal);
    412     setOperationAction(ISD::XOR,     VT, Legal);
    413     setOperationAction(ISD::LOAD,    VT, Custom);
    414     setOperationAction(ISD::SELECT,  VT, Legal);
    415     setOperationAction(ISD::STORE,   VT, Custom);
    416 
    417     // These operations need to be expanded:
    418     setOperationAction(ISD::SDIV,    VT, Expand);
    419     setOperationAction(ISD::SREM,    VT, Expand);
    420     setOperationAction(ISD::UDIV,    VT, Expand);
    421     setOperationAction(ISD::UREM,    VT, Expand);
    422 
    423     // Custom lower build_vector, constant pool spills, insert and
    424     // extract vector elements:
    425     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    426     setOperationAction(ISD::ConstantPool, VT, Custom);
    427     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
    428     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    429     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    430     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    431   }
    432 
    433   setOperationAction(ISD::AND, MVT::v16i8, Custom);
    434   setOperationAction(ISD::OR,  MVT::v16i8, Custom);
    435   setOperationAction(ISD::XOR, MVT::v16i8, Custom);
    436   setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    437 
    438   setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    439 
    440   setBooleanContents(ZeroOrNegativeOneBooleanContent);
    441 
    442   setStackPointerRegisterToSaveRestore(SPU::R1);
    443 
    444   // We have target-specific dag combine patterns for the following nodes:
    445   setTargetDAGCombine(ISD::ADD);
    446   setTargetDAGCombine(ISD::ZERO_EXTEND);
    447   setTargetDAGCombine(ISD::SIGN_EXTEND);
    448   setTargetDAGCombine(ISD::ANY_EXTEND);
    449 
    450   setMinFunctionAlignment(3);
    451 
    452   computeRegisterProperties();
    453 
    454   // Set pre-RA register scheduler default to BURR, which produces slightly
    455   // better code than the default (could also be TDRR, but TargetLowering.h
    456   // needs a mod to support that model):
    457   setSchedulingPreference(Sched::RegPressure);
    458 }
    459 
    460 const char *
    461 SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
    462 {
    463   if (node_names.empty()) {
    464     node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
    465     node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
    466     node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
    467     node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
    468     node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
    469     node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
    470     node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
    471     node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
    472     node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
    473     node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
    474     node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
    475     node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
    476     node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
    477     node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
    478     node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
    479     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
    480     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
    481     node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
    482     node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
    483             "SPUISD::ROTBYTES_LEFT_BITS";
    484     node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
    485     node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
    486     node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
    487     node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
    488     node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
    489   }
    490 
    491   std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
    492 
    493   return ((i != node_names.end()) ? i->second : 0);
    494 }
    495 
    496 //===----------------------------------------------------------------------===//
    497 // Return the Cell SPU's SETCC result type
    498 //===----------------------------------------------------------------------===//
    499 
    500 MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
    501   // i8, i16 and i32 are valid SETCC result types
    502   MVT::SimpleValueType retval;
    503 
    504   switch(VT.getSimpleVT().SimpleTy){
    505     case MVT::i1:
    506     case MVT::i8:
    507       retval = MVT::i8; break;
    508     case MVT::i16:
    509       retval = MVT::i16; break;
    510     case MVT::i32:
    511     default:
    512       retval = MVT::i32;
    513   }
    514   return retval;
    515 }
    516 
    517 //===----------------------------------------------------------------------===//
    518 // Calling convention code:
    519 //===----------------------------------------------------------------------===//
    520 
    521 #include "SPUGenCallingConv.inc"
    522 
    523 //===----------------------------------------------------------------------===//
    524 //  LowerOperation implementation
    525 //===----------------------------------------------------------------------===//
    526 
    527 /// Custom lower loads for CellSPU
    528 /*!
    529  All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
    530  within a 16-byte block, we have to rotate to extract the requested element.
    531 
    532  For extending loads, we also want to ensure that the following sequence is
    533  emitted, e.g. for MVT::f32 extending load to MVT::f64:
    534 
    535 \verbatim
    536 %1  v16i8,ch = load
    537 %2  v16i8,ch = rotate %1
    538 %3  v4f32,ch = bitconvert %2
    539 %4  f32      = vec2prefslot %3
    540 %5  f64      = fp_extend %4
    541 \endverbatim
    542 */
    543 static SDValue
    544 LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    545   LoadSDNode *LN = cast<LoadSDNode>(Op);
    546   SDValue the_chain = LN->getChain();
    547   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    548   EVT InVT = LN->getMemoryVT();
    549   EVT OutVT = Op.getValueType();
    550   ISD::LoadExtType ExtType = LN->getExtensionType();
    551   unsigned alignment = LN->getAlignment();
    552   int pso = prefslotOffset(InVT);
    553   DebugLoc dl = Op.getDebugLoc();
    554   EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
    555                                                   (128 / InVT.getSizeInBits()));
    556 
    557   // two sanity checks
    558   assert( LN->getAddressingMode() == ISD::UNINDEXED
    559           && "we should get only UNINDEXED adresses");
    560   // clean aligned loads can be selected as-is
    561   if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
    562     return SDValue();
    563 
    564   // Get pointerinfos to the memory chunk(s) that contain the data to load
    565   uint64_t mpi_offset = LN->getPointerInfo().Offset;
    566   mpi_offset -= mpi_offset%16;
    567   MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
    568   MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
    569 
    570   SDValue result;
    571   SDValue basePtr = LN->getBasePtr();
    572   SDValue rotate;
    573 
    574   if ((alignment%16) == 0) {
    575     ConstantSDNode *CN;
    576 
    577     // Special cases for a known aligned load to simplify the base pointer
    578     // and the rotation amount:
    579     if (basePtr.getOpcode() == ISD::ADD
    580         && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
    581       // Known offset into basePtr
    582       int64_t offset = CN->getSExtValue();
    583       int64_t rotamt = int64_t((offset & 0xf) - pso);
    584 
    585       if (rotamt < 0)
    586         rotamt += 16;
    587 
    588       rotate = DAG.getConstant(rotamt, MVT::i16);
    589 
    590       // Simplify the base pointer for this case:
    591       basePtr = basePtr.getOperand(0);
    592       if ((offset & ~0xf) > 0) {
    593         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    594                               basePtr,
    595                               DAG.getConstant((offset & ~0xf), PtrVT));
    596       }
    597     } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
    598                || (basePtr.getOpcode() == SPUISD::IndirectAddr
    599                    && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
    600                    && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
    601       // Plain aligned a-form address: rotate into preferred slot
    602       // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
    603       int64_t rotamt = -pso;
    604       if (rotamt < 0)
    605         rotamt += 16;
    606       rotate = DAG.getConstant(rotamt, MVT::i16);
    607     } else {
    608       // Offset the rotate amount by the basePtr and the preferred slot
    609       // byte offset
    610       int64_t rotamt = -pso;
    611       if (rotamt < 0)
    612         rotamt += 16;
    613       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
    614                            basePtr,
    615                            DAG.getConstant(rotamt, PtrVT));
    616     }
    617   } else {
    618     // Unaligned load: must be more pessimistic about addressing modes:
    619     if (basePtr.getOpcode() == ISD::ADD) {
    620       MachineFunction &MF = DAG.getMachineFunction();
    621       MachineRegisterInfo &RegInfo = MF.getRegInfo();
    622       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    623       SDValue Flag;
    624 
    625       SDValue Op0 = basePtr.getOperand(0);
    626       SDValue Op1 = basePtr.getOperand(1);
    627 
    628       if (isa<ConstantSDNode>(Op1)) {
    629         // Convert the (add <ptr>, <const>) to an indirect address contained
    630         // in a register. Note that this is done because we need to avoid
    631         // creating a 0(reg) d-form address due to the SPU's block loads.
    632         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    633         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
    634         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
    635       } else {
    636         // Convert the (add <arg1>, <arg2>) to an indirect address, which
    637         // will likely be lowered as a reg(reg) x-form address.
    638         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    639       }
    640     } else {
    641       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    642                             basePtr,
    643                             DAG.getConstant(0, PtrVT));
    644    }
    645 
    646     // Offset the rotate amount by the basePtr and the preferred slot
    647     // byte offset
    648     rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
    649                          basePtr,
    650                          DAG.getConstant(-pso, PtrVT));
    651   }
    652 
    653   // Do the load as a i128 to allow possible shifting
    654   SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
    655                        lowMemPtr,
    656                        LN->isVolatile(), LN->isNonTemporal(), 16);
    657 
    658   // When the size is not greater than alignment we get all data with just
    659   // one load
    660   if (alignment >= InVT.getSizeInBits()/8) {
    661     // Update the chain
    662     the_chain = low.getValue(1);
    663 
    664     // Rotate into the preferred slot:
    665     result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
    666                          low.getValue(0), rotate);
    667 
    668     // Convert the loaded v16i8 vector to the appropriate vector type
    669     // specified by the operand:
    670     EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
    671                                  InVT, (128 / InVT.getSizeInBits()));
    672     result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
    673                          DAG.getNode(ISD::BITCAST, dl, vecVT, result));
    674   }
    675   // When alignment is less than the size, we might need (known only at
    676   // run-time) two loads
    677   // TODO: if the memory address is composed only from constants, we have
    678   // extra knowledge, and might avoid the second load
    679   else {
    680     // storage position offset from lower 16 byte aligned memory chunk
    681     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
    682                                   basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
    683     // get a registerfull of ones. (this implementation is a workaround: LLVM
    684     // cannot handle 128 bit signed int constants)
    685     SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
    686     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
    687 
    688     SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
    689                                DAG.getNode(ISD::ADD, dl, PtrVT,
    690                                            basePtr,
    691                                            DAG.getConstant(16, PtrVT)),
    692                                highMemPtr,
    693                                LN->isVolatile(), LN->isNonTemporal(), 16);
    694 
    695     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
    696                                                               high.getValue(1));
    697 
    698     // Shift the (possible) high part right to compensate the misalignment.
    699     // if there is no highpart (i.e. value is i64 and offset is 4), this
    700     // will zero out the high value.
    701     high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
    702                                      DAG.getNode(ISD::SUB, dl, MVT::i32,
    703                                                  DAG.getConstant( 16, MVT::i32),
    704                                                  offset
    705                                                 ));
    706 
    707     // Shift the low similarly
    708     // TODO: add SPUISD::SHL_BYTES
    709     low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
    710 
    711     // Merge the two parts
    712     result = DAG.getNode(ISD::BITCAST, dl, vecVT,
    713                           DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
    714 
    715     if (!InVT.isVector()) {
    716       result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
    717      }
    718 
    719   }
    720     // Handle extending loads by extending the scalar result:
    721     if (ExtType == ISD::SEXTLOAD) {
    722       result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
    723     } else if (ExtType == ISD::ZEXTLOAD) {
    724       result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
    725     } else if (ExtType == ISD::EXTLOAD) {
    726       unsigned NewOpc = ISD::ANY_EXTEND;
    727 
    728       if (OutVT.isFloatingPoint())
    729         NewOpc = ISD::FP_EXTEND;
    730 
    731       result = DAG.getNode(NewOpc, dl, OutVT, result);
    732     }
    733 
    734     SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
    735     SDValue retops[2] = {
    736       result,
    737       the_chain
    738     };
    739 
    740     result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
    741                          retops, sizeof(retops) / sizeof(retops[0]));
    742     return result;
    743 }
    744 
    745 /// Custom lower stores for CellSPU
    746 /*!
    747  All CellSPU stores are aligned to 16-byte boundaries, so for elements
    748  within a 16-byte block, we have to generate a shuffle to insert the
    749  requested element into its place, then store the resulting block.

         Three cases are distinguished below:
         - a 128-bit store with 16-byte alignment is returned unlowered (an empty
           SDValue is returned);
         - when the alignment is at least the stored size, one quadword is loaded,
           the value is shuffled into it and the quadword is stored back;
         - otherwise the value may straddle a 16-byte boundary, so two quadwords
           are loaded, masked, merged with the shifted value and stored back.
    750  */
    751 static SDValue
    752 LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    753   StoreSDNode *SN = cast<StoreSDNode>(Op);
    754   SDValue Value = SN->getValue();
    755   EVT VT = Value.getValueType();
          // For a truncating store the in-memory type differs from the value type.
    756   EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
    757   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    758   DebugLoc dl = Op.getDebugLoc();
    759   unsigned alignment = SN->getAlignment();
    760   SDValue result;
          // Vector type wrapping the stored type: StVT itself when it is a vector,
          // otherwise a vector of StVT elements spanning 128 bits.
    761   EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
    762                                                  (128 / StVT.getSizeInBits()));
    763   // Get pointerinfos to the memory chunk(s) the store touches: the pointer
          // offset rounded down to a multiple of 16, and the 16 bytes after it.
    764   uint64_t mpi_offset = SN->getPointerInfo().Offset;
    765   mpi_offset -= mpi_offset%16;
    766   MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
    767   MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);
    768
    769
    770   // Sanity check: only unindexed addressing is expected here
    771   assert( SN->getAddressingMode() == ISD::UNINDEXED
    772           && "we should get only UNINDEXED adresses");
    773   // clean aligned 128-bit stores can be selected as-is
    774   if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
    775     return SDValue();
    776
    777   SDValue alignLoadVec;
    778   SDValue basePtr = SN->getBasePtr();
    779   SDValue the_chain = SN->getChain();
          // Address handed to SHUFFLE_MASK in the single-quadword path below.
    780   SDValue insertEltOffs;
    781
    782   if ((alignment%16) == 0) {
    783     ConstantSDNode *CN;
    784     // Special cases for a known aligned store to simplify the base pointer
    785     // and insertion byte:
    786     if (basePtr.getOpcode() == ISD::ADD
    787         && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
    788       // Known offset into basePtr
    789       int64_t offset = CN->getSExtValue();
    790
    791       // Simplify the base pointer for this case:
    792       basePtr = basePtr.getOperand(0);
              // The low four bits of the offset select the byte within the quadword.
    793       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    794                                   basePtr,
    795                                   DAG.getConstant((offset & 0xf), PtrVT));
    796
              // Fold the 16-byte-aligned part of the offset back into basePtr.
              // NOTE(review): the test is "> 0", so a negative aligned part is not
              // folded back in -- confirm whether "!= 0" was intended.
    797       if ((offset & ~0xf) > 0) {
    798         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    799                               basePtr,
    800                               DAG.getConstant((offset & ~0xf), PtrVT));
    801       }
    802     } else {
    803       // Otherwise, assume it's at byte 0 of basePtr
    804       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    805                                   basePtr,
    806                                   DAG.getConstant(0, PtrVT));
    807       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    808                                   basePtr,
    809                                   DAG.getConstant(0, PtrVT));
    810     }
    811   } else {
    812     // Unaligned store: must be more pessimistic about addressing modes:
    813     if (basePtr.getOpcode() == ISD::ADD) {
    814       MachineFunction &MF = DAG.getMachineFunction();
    815       MachineRegisterInfo &RegInfo = MF.getRegInfo();
    816       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    817       SDValue Flag;
    818
    819       SDValue Op0 = basePtr.getOperand(0);
    820       SDValue Op1 = basePtr.getOperand(1);
    821
    822       if (isa<ConstantSDNode>(Op1)) {
    823         // Convert the (add <ptr>, <const>) to an indirect address contained
    824         // in a register. Note that this is done because we need to avoid
    825         // creating a 0(reg) d-form address due to the SPU's block loads.
    826         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    827         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
    828         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
    829       } else {
    830         // Convert the (add <arg1>, <arg2>) to an indirect address, which
    831         // will likely be lowered as a reg(reg) x-form address.
    832         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    833       }
    834     } else {
    835       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    836                             basePtr,
    837                             DAG.getConstant(0, PtrVT));
    838     }
    839
    840     // Insertion point is solely determined by basePtr's contents
    841     insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
    842                                 basePtr,
    843                                 DAG.getConstant(0, PtrVT));
    844   }
    845
    846   // Load the lower part of the memory to which to store.
    847   SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
    848                           lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);
    849
    850   // if we don't need to store over the 16 byte boundary, one store suffices
    851   if (alignment >= StVT.getSizeInBits()/8) {
    852     // Update the chain
    853     the_chain = low.getValue(1);
    854
            // 'low' was built by getLoad above from SN's volatile/non-temporal flags;
            // LN is used to read them back for the final store.
    855     LoadSDNode *LN = cast<LoadSDNode>(low);
    856     SDValue theValue = SN->getValue();
    857
    858     if (StVT != VT
    859         && (theValue.getOpcode() == ISD::AssertZext
    860             || theValue.getOpcode() == ISD::AssertSext)) {
    861       // Drill down and get the value for zero- and sign-extended
    862       // quantities
    863       theValue = theValue.getOperand(0);
    864     }
    865
    866     // If the base pointer is already a D-form address, then just create
    867     // a new D-form address with a slot offset and the original base pointer.
    868     // Otherwise generate a D-form address with the slot offset relative
    869     // to the stack pointer, which is always aligned.
            // NOTE(review): the code below does not inspect the address form; it
            // only dumps basePtr in debug builds. The comment above may be stale.
    870 #if !defined(NDEBUG)
    871       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
    872         errs() << "CellSPU LowerSTORE: basePtr = ";
    873         basePtr.getNode()->dump(&DAG);
    874         errs() << "\n";
    875       }
    876 #endif
    877
            // Build the shuffle mask from the insertion address, put the scalar in a
            // vector, and let SHUFB merge it into the quadword that was just loaded.
    878     SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
    879                                       insertEltOffs);
    880     SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
    881                                       theValue);
    882
    883     result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
    884                          vectorizeOp, low,
    885                          DAG.getNode(ISD::BITCAST, dl,
    886                                      MVT::v4i32, insertEltOp));
    887
    888     result = DAG.getStore(the_chain, dl, result, basePtr,
    889                           lowMemPtr,
    890                           LN->isVolatile(), LN->isNonTemporal(),
    891                           16);
    892
    893   }
    894   // do the store when it might cross the 16 byte memory access boundary.
    895   else {
    896     // TODO issue a warning if SN->isVolatile()== true? This is likely not
    897     // what the user wanted.
    898
    899     // address offset from nearest lower 16-byte aligned address
    900     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
    901                                     SN->getBasePtr(),
    902                                     DAG.getConstant(0xf, MVT::i32));
    903     // 16 - offset
    904     SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
    905                                            DAG.getConstant( 16, MVT::i32),
    906                                            offset);
    907     // 16 - sizeof(Value)
            // NOTE(review): the size is taken from VT, not StVT; the two differ for
            // truncating stores -- confirm this is intended.
    908     SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
    909                                      DAG.getConstant( 16, MVT::i32),
    910                                      DAG.getConstant( VT.getSizeInBits()/8,
    911                                                       MVT::i32));
    912     // get a registerfull of ones
    913     SDValue ones = DAG.getConstant(-1, MVT::v4i32);
    914     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
    915
    916     // Create the 128 bit masks that have ones where the data to store is
    917     // located.
    918     SDValue lowmask, himask;
    919     // if the value to store doesn't fill up an entire 128 bits, zero
    920     // out the last bits of the mask so that only the value we want to store
    921     // is masked.
    922     // this is e.g. in the case of store i32, align 2
    923     if (!VT.isVector()){
    924       Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
    925       lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
    926       lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
    927                                                                surplus);
    928       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
    929       Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
    930
    931     }
    932     else {
    933       lowmask = ones;
    934       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
    935     }
    936     // this will zero, if there are no data that goes to the high quad
    937     himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
    938                                                             offset_compl);
    939     lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
    940                                                              offset);
    941
    942     // Load in the old data and zero out the parts that will be overwritten with
    943     // the new data to store.
    944     SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
    945                                DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
    946                                            DAG.getConstant( 16, PtrVT)),
    947                                highMemPtr,
    948                                SN->isVolatile(), SN->isNonTemporal(), 16);
            // Both stores below are chained after both loads.
    949     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
    950                                                               hi.getValue(1));
    951
    952     low = DAG.getNode(ISD::AND, dl, MVT::i128,
    953                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
    954                         DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
    955     hi = DAG.getNode(ISD::AND, dl, MVT::i128,
    956                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
    957                         DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));
    958
    959     // Shift the Value to store into place. rlow contains the parts that go to
    960     // the lower memory chunk, rhi has the parts that go to the upper one.
    961     SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
    962     rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
    963     SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
    964                                                             offset_compl);
    965
    966     // Merge the old data and the new data and store the results
    967     // Need to convert vectors here to integer as 'OR'ing floats assert
    968     rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
    969                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
    970                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
    971     rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
    972                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
    973                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));
    974
    975     low = DAG.getStore(the_chain, dl, rlow, basePtr,
    976                           lowMemPtr,
    977                           SN->isVolatile(), SN->isNonTemporal(), 16);
    978     hi  = DAG.getStore(the_chain, dl, rhi,
    979                             DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
    980                                         DAG.getConstant( 16, PtrVT)),
    981                             highMemPtr,
    982                             SN->isVolatile(), SN->isNonTemporal(), 16);
            // The lowered result is the join of the two store chains.
    983     result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
    984                                                            hi.getValue(0));
    985   }
    986
    987   return result;
    988 }
    989 
    990 //! Generate the address of a constant pool entry.
    991 //! Only Reloc::Static is handled: an A-form address node when the subtarget
    992 //! is not using large memory, otherwise a Hi/Lo pair joined by IndirectAddr.
    993 static SDValue
    994 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    995   // FIXME there is no actual debug info here
    996   DebugLoc dl = Op.getDebugLoc();
    997   EVT PtrVT = Op.getValueType();
    998   ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);
    999   SDValue PoolAddr = DAG.getTargetConstantPool(PoolNode->getConstVal(), PtrVT,
   1000                                                PoolNode->getAlignment());
   1001   SDValue ZeroOffs = DAG.getConstant(0, PtrVT);
   1002
   1003   if (DAG.getTarget().getRelocationModel() != Reloc::Static) {
   1004     llvm_unreachable("LowerConstantPool: Relocation model other than static"
   1005                      " not supported.");
   1006     return SDValue();
   1007   }
   1008
   1009   // Not using large memory: a single A-form address node is the result.
   1010   if (!ST->usingLargeMem())
   1011     return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, PoolAddr, ZeroOffs);
   1012
   1013   // Large memory: combine the Hi and Lo nodes into an indirect address.
   1014   SDValue HiPart = DAG.getNode(SPUISD::Hi, dl, PtrVT, PoolAddr, ZeroOffs);
   1015   SDValue LoPart = DAG.getNode(SPUISD::Lo, dl, PtrVT, PoolAddr, ZeroOffs);
   1016   return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, HiPart, LoPart);
        }
   1017 
   1018 //! Alternate entry point for generating the address of a constant pool entry:
   1019 //! fetches the subtarget from TM and forwards to the file-local function.
   1020 SDValue SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG,
   1021                                const SPUTargetMachine &TM) {
   1022   const SPUSubtarget *Subtarget = TM.getSubtargetImpl();
          return ::LowerConstantPool(Op, DAG, Subtarget);
        }
   1023 
   1024 //! Generate the address of a jump table.
   1025 //! Only Reloc::Static is handled: an A-form address node when the subtarget
   1026 //! is not using large memory, otherwise a Hi/Lo pair joined by IndirectAddr.
   1027 static SDValue
   1028 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1029   // FIXME there is no actual debug info here
   1030   DebugLoc dl = Op.getDebugLoc();
   1031   EVT PtrVT = Op.getValueType();
   1032   JumpTableSDNode *TableNode = cast<JumpTableSDNode>(Op);
   1033   SDValue TableAddr = DAG.getTargetJumpTable(TableNode->getIndex(), PtrVT);
   1034   SDValue ZeroOffs = DAG.getConstant(0, PtrVT);
   1035
   1036   if (DAG.getTarget().getRelocationModel() != Reloc::Static) {
   1037     llvm_unreachable("LowerJumpTable: Relocation model other than static"
   1038                      " not supported.");
   1039     return SDValue();
   1040   }
   1041
   1042   // Not using large memory: a single A-form address node is the result.
   1043   if (!ST->usingLargeMem())
   1044     return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, TableAddr, ZeroOffs);
   1045
   1046   // Large memory: combine the Hi and Lo nodes into an indirect address.
   1047   SDValue HiPart = DAG.getNode(SPUISD::Hi, dl, PtrVT, TableAddr, ZeroOffs);
          SDValue LoPart = DAG.getNode(SPUISD::Lo, dl, PtrVT, TableAddr, ZeroOffs);
          return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, HiPart, LoPart);
        }
   1048 
   1049 //! Generate the address of a global value.
   1050 //! Only Reloc::Static is handled: an A-form address node when the subtarget
   1051 //! is not using large memory, otherwise a Hi/Lo pair joined by IndirectAddr.
   1052 //! Any other relocation model is reported as a fatal error.
   1053 static SDValue
   1054 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1055   EVT PtrVT = Op.getValueType();
   1056   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
   1057   const GlobalValue *GV = GSDN->getGlobal();
   1058   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
   1059                                           PtrVT, GSDN->getOffset());
   1060   const TargetMachine &TM = DAG.getTarget();
   1061   SDValue Zero = DAG.getConstant(0, PtrVT);
   1062   // FIXME there is no actual debug info here
   1063   DebugLoc dl = Op.getDebugLoc();
   1064
   1065   if (TM.getRelocationModel() == Reloc::Static) {
   1066     if (!ST->usingLargeMem()) {
   1067       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
   1068     } else {
   1069       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
   1070       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
   1071       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1072     }
   1073   } else {
   1074     // The second literal starts with a space: adjacent literals are
   1075     // concatenated, and without it the message read "staticnot supported."
   1076     report_fatal_error("LowerGlobalAddress: Relocation model other than static"
                              " not supported.");
            /*NOTREACHED*/
          }

          return SDValue();
        }
   1077 
   1078 //! Custom lower double precision floating point constants
   1079 //! An f64 constant becomes the preferred slot of a v2f64 whose two lanes hold
   1080 //! the constant's 64-bit pattern; any other type yields an empty SDValue.
   1081 static SDValue
   1082 LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
   1083   // FIXME there is no actual debug info here
   1084   DebugLoc dl = Op.getDebugLoc();
   1085   EVT VT = Op.getValueType();
   1086
   1087   // Nothing to do for anything other than f64.
   1088   if (VT != MVT::f64)
   1089     return SDValue();
   1090
   1091   ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
   1092   assert((FP != 0) &&
   1093          "LowerConstantFP: Node is not ConstantFPSDNode");
   1094
   1095   // Take the double's bit pattern as an i64 and place it in both v2i64 lanes.
   1096   const double AsDouble = FP->getValueAPF().convertToDouble();
   1097   SDValue Bits = DAG.getConstant(DoubleToBits(AsDouble), MVT::i64);
   1098   SDValue Splat = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Bits, Bits);
   1099   SDValue AsV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Splat);
          return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, AsV2F64);
        }
   1100 
   1101 /// LowerFormalArguments - Produce the DAG values for the incoming arguments
        /// of the current function and append them to InVals.
        ///
        /// Arguments assigned to a register by CCC_SPU are copied out of a fresh
        /// virtual register that is registered as live-in; all others are loaded
        /// from a fixed stack object at ArgOffset. For vararg functions, the
        /// entries of ArgRegs from ArgRegIdx onwards are stored to consecutive
        /// stack slots and the stores are joined with a TokenFactor.
        ///
        /// \returns the chain to use after the arguments have been set up.
   1102 SDValue
   1103 SPUTargetLowering::LowerFormalArguments(SDValue Chain,
   1104                                         CallingConv::ID CallConv, bool isVarArg,
   1105                                         const SmallVectorImpl<ISD::InputArg>
   1106                                           &Ins,
   1107                                         DebugLoc dl, SelectionDAG &DAG,
   1108                                         SmallVectorImpl<SDValue> &InVals)
   1109                                           const {
   1110   MachineFunction &MF = DAG.getMachineFunction();
   1111   MachineFrameInfo *MFI = MF.getFrameInfo();
   1112   MachineRegisterInfo &RegInfo = MF.getRegInfo();
   1113   SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
   1114
          // Stack offset of the next memory-passed argument, and the number of
          // arguments that were found in registers so far.
   1115   unsigned ArgOffset = SPUFrameLowering::minStackSize();
   1116   unsigned ArgRegIdx = 0;
   1117   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
   1118
   1119   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   1120
   1121   SmallVector<CCValAssign, 16> ArgLocs;
   1122   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1123                  getTargetMachine(), ArgLocs, *DAG.getContext());
   1124   // FIXME: allow for other calling conventions
   1125   CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
   1126
   1127   // Add DAG nodes to load the arguments or copy them out of registers.
   1128   for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
   1129     EVT ObjectVT = Ins[ArgNo].VT;
   1130     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
   1131     SDValue ArgVal;
   1132     CCValAssign &VA = ArgLocs[ArgNo];
   1133
   1134     if (VA.isRegLoc()) {
   1135       const TargetRegisterClass *ArgRegClass;
   1136
              // Pick the register class that matches the argument's value type.
   1137       switch (ObjectVT.getSimpleVT().SimpleTy) {
   1138       default:
   1139         report_fatal_error("LowerFormalArguments Unhandled argument type: " +
   1140                            Twine(ObjectVT.getEVTString()));
   1141       case MVT::i8:
   1142         ArgRegClass = &SPU::R8CRegClass;
   1143         break;
   1144       case MVT::i16:
   1145         ArgRegClass = &SPU::R16CRegClass;
   1146         break;
   1147       case MVT::i32:
   1148         ArgRegClass = &SPU::R32CRegClass;
   1149         break;
   1150       case MVT::i64:
   1151         ArgRegClass = &SPU::R64CRegClass;
   1152         break;
   1153       case MVT::i128:
   1154         ArgRegClass = &SPU::GPRCRegClass;
   1155         break;
   1156       case MVT::f32:
   1157         ArgRegClass = &SPU::R32FPRegClass;
   1158         break;
   1159       case MVT::f64:
   1160         ArgRegClass = &SPU::R64FPRegClass;
   1161         break;
   1162       case MVT::v2f64:
   1163       case MVT::v4f32:
   1164       case MVT::v2i64:
   1165       case MVT::v4i32:
   1166       case MVT::v8i16:
   1167       case MVT::v16i8:
   1168         ArgRegClass = &SPU::VECREGRegClass;
   1169         break;
   1170       }
   1171
   1172       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
   1173       RegInfo.addLiveIn(VA.getLocReg(), VReg);
   1174       ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
   1175       ++ArgRegIdx;
   1176     } else {
   1177       // We need to load the argument to a virtual register if we determined
   1178       // above that we ran out of physical registers of the appropriate type
   1179       // or we're forced to do vararg
   1180       int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
   1181       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
   1182       ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
   1183                            false, false, 0);
   1184       ArgOffset += StackSlotSize;
   1185     }
   1186
   1187     InVals.push_back(ArgVal);
   1188     // Update the chain
            // NOTE(review): operand 0 of ArgVal is presumably the chain it was built
            // from, which would leave Chain unchanged -- confirm whether
            // ArgVal.getValue(1) was intended.
   1189     Chain = ArgVal.getOperand(0);
   1190   }
   1191
   1192   // vararg handling:
   1193   if (isVarArg) {
   1194     // FIXME: we should be able to query the argument registers from
   1195     //        tablegen generated code.
   1196     static const unsigned ArgRegs[] = {
   1197       SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
   1198       SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
   1199       SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
   1200       SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
   1201       SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
   1202       SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
   1203       SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
   1204       SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
   1205       SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
   1206       SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
   1207       SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
   1208     };
   1209     // Number of entries in ArgRegs, derived from the array so the two cannot
            // drift apart (the table above has 77 entries, R3 through R79).
   1210     unsigned NumArgRegs = sizeof(ArgRegs) / sizeof(ArgRegs[0]);
   1211
   1212     // We will spill (79-3)+1 registers to the stack
   1213     SmallVector<SDValue, 79-3+1> MemOps;
   1214
   1215     // Create the frame slot
            // The recorded vararg frame index is overwritten on every iteration, so
            // after the loop it refers to the last slot created.
   1216     for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
   1217       FuncInfo->setVarArgsFrameIndex(
   1218         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
   1219       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   1220       unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
   1221       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
   1222       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
   1223                                    false, false, 0);
   1224       Chain = Store.getOperand(0);
   1225       MemOps.push_back(Store);
   1226
   1227       // Increment address by stack slot size for the next stored argument
   1228       ArgOffset += StackSlotSize;
   1229     }
            // Join all spill stores so later nodes depend on every one of them.
   1230     if (!MemOps.empty())
   1231       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   1232                           &MemOps[0], MemOps.size());
   1233   }
   1234
   1235   return Chain;
   1236 }
   1237 
   1238 /// isLSAAddress - Return the immediate to use if the specified
   1239 /// value is representable as a LSA address.
        ///
        /// Op must be a constant whose low two bits are zero and whose value equals
        /// the sign extension of its low 18 bits. The result is an i32 constant node
        /// holding that value shifted right by two, or null when Op does not qualify.
   1240 static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
   1241   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
   1242   if (!C) return 0;
   1243
   1244   int Addr = C->getZExtValue();
          // Explicit range test instead of "(Addr << 14 >> 14) != Addr": shifting a
          // signed int left past its range is undefined behaviour. Both forms accept
          // exactly the values in [-2^17, 2^17).
          const int Limit = 1 << 17;
   1245   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
   1246       Addr < -Limit || Addr >= Limit)
   1247     return 0;  // Top 14 bits have to be sext of immediate.
   1248
   1249   return DAG.getConstant(Addr >> 2, MVT::i32).getNode();
   1250 }
   1251 
   1252 SDValue
   1253 SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   1254                              CallingConv::ID CallConv, bool isVarArg,
   1255                              bool &isTailCall,
   1256                              const SmallVectorImpl<ISD::OutputArg> &Outs,
   1257                              const SmallVectorImpl<SDValue> &OutVals,
   1258                              const SmallVectorImpl<ISD::InputArg> &Ins,
   1259                              DebugLoc dl, SelectionDAG &DAG,
   1260                              SmallVectorImpl<SDValue> &InVals) const {
   1261   // CellSPU target does not yet support tail call optimization.
   1262   isTailCall = false;
   1263 
   1264   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
   1265   unsigned NumOps     = Outs.size();
   1266   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
   1267 
   1268   SmallVector<CCValAssign, 16> ArgLocs;
   1269   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1270 		 getTargetMachine(), ArgLocs, *DAG.getContext());
   1271   // FIXME: allow for other calling conventions
   1272   CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
   1273 
   1274   const unsigned NumArgRegs = ArgLocs.size();
   1275 
   1276 
   1277   // Handy pointer type
   1278   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   1279 
   1280   // Set up a copy of the stack pointer for use loading and storing any
   1281   // arguments that may not fit in the registers available for argument
   1282   // passing.
   1283   SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);
   1284 
   1285   // Figure out which arguments are going to go in registers, and which in
   1286   // memory.
   1287   unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
   1288   unsigned ArgRegIdx = 0;
   1289 
   1290   // Keep track of registers passing arguments
   1291   std::vector<std::pair<unsigned, SDValue> > RegsToPass;
   1292   // And the arguments passed on the stack
   1293   SmallVector<SDValue, 8> MemOpChains;
   1294 
   1295   for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
   1296     SDValue Arg = OutVals[ArgRegIdx];
   1297     CCValAssign &VA = ArgLocs[ArgRegIdx];
   1298 
   1299     // PtrOff will be used to store the current argument to the stack if a
   1300     // register cannot be found for it.
   1301     SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
   1302     PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
   1303 
   1304     switch (Arg.getValueType().getSimpleVT().SimpleTy) {
   1305     default: llvm_unreachable("Unexpected ValueType for argument!");
   1306     case MVT::i8:
   1307     case MVT::i16:
   1308     case MVT::i32:
   1309     case MVT::i64:
   1310     case MVT::i128:
   1311     case MVT::f32:
   1312     case MVT::f64:
   1313     case MVT::v2i64:
   1314     case MVT::v2f64:
   1315     case MVT::v4f32:
   1316     case MVT::v4i32:
   1317     case MVT::v8i16:
   1318     case MVT::v16i8:
   1319       if (ArgRegIdx != NumArgRegs) {
   1320         RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   1321       } else {
   1322         MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
   1323                                            MachinePointerInfo(),
   1324                                            false, false, 0));
   1325         ArgOffset += StackSlotSize;
   1326       }
   1327       break;
   1328     }
   1329   }
   1330 
   1331   // Accumulate how many bytes are to be pushed on the stack, including the
   1332   // linkage area, and parameter passing area.  According to the SPU ABI,
   1333   // we minimally need space for [LR] and [SP].
   1334   unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();
   1335 
   1336   // Insert a call sequence start
   1337   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
   1338                                                             true));
   1339 
   1340   if (!MemOpChains.empty()) {
   1341     // Adjust the stack pointer for the stack arguments.
   1342     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   1343                         &MemOpChains[0], MemOpChains.size());
   1344   }
   1345 
   1346   // Build a sequence of copy-to-reg nodes chained together with token chain
   1347   // and flag operands which copy the outgoing args into the appropriate regs.
   1348   SDValue InFlag;
   1349   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   1350     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   1351                              RegsToPass[i].second, InFlag);
   1352     InFlag = Chain.getValue(1);
   1353   }
   1354 
   1355   SmallVector<SDValue, 8> Ops;
   1356   unsigned CallOpc = SPUISD::CALL;
   1357 
   1358   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   1359   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   1360   // node so that legalize doesn't hack it.
   1361   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   1362     const GlobalValue *GV = G->getGlobal();
   1363     EVT CalleeVT = Callee.getValueType();
   1364     SDValue Zero = DAG.getConstant(0, PtrVT);
   1365     SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);
   1366 
   1367     if (!ST->usingLargeMem()) {
   1368       // Turn calls to targets that are defined (i.e., have bodies) into BRSL
   1369       // style calls, otherwise, external symbols are BRASL calls. This assumes
   1370       // that declared/defined symbols are in the same compilation unit and can
   1371       // be reached through PC-relative jumps.
   1372       //
   1373       // NOTE:
   1374       // This may be an unsafe assumption for JIT and really large compilation
   1375       // units.
   1376       if (GV->isDeclaration()) {
   1377         Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
   1378       } else {
   1379         Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
   1380       }
   1381     } else {
   1382       // "Large memory" mode: Turn all calls into indirect calls with a X-form
   1383       // address pairs:
   1384       Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
   1385     }
   1386   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   1387     EVT CalleeVT = Callee.getValueType();
   1388     SDValue Zero = DAG.getConstant(0, PtrVT);
   1389     SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
   1390         Callee.getValueType());
   1391 
   1392     if (!ST->usingLargeMem()) {
   1393       Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
   1394     } else {
   1395       Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
   1396     }
   1397   } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
   1398     // If this is an absolute destination address that appears to be a legal
   1399     // local store address, use the munged value.
   1400     Callee = SDValue(Dest, 0);
   1401   }
   1402 
   1403   Ops.push_back(Chain);
   1404   Ops.push_back(Callee);
   1405 
   1406   // Add argument registers to the end of the list so that they are known live
   1407   // into the call.
   1408   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   1409     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   1410                                   RegsToPass[i].second.getValueType()));
   1411 
   1412   if (InFlag.getNode())
   1413     Ops.push_back(InFlag);
   1414   // Returns a chain and a flag for retval copy to use.
   1415   Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
   1416                       &Ops[0], Ops.size());
   1417   InFlag = Chain.getValue(1);
   1418 
   1419   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
   1420                              DAG.getIntPtrConstant(0, true), InFlag);
   1421   if (!Ins.empty())
   1422     InFlag = Chain.getValue(1);
   1423 
   1424   // If the function returns void, just return the chain.
   1425   if (Ins.empty())
   1426     return Chain;
   1427 
   1428   // Now handle the return value(s)
   1429   SmallVector<CCValAssign, 16> RVLocs;
   1430   CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1431 		    getTargetMachine(), RVLocs, *DAG.getContext());
   1432   CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
   1433 
   1434 
   1435   // If the call has results, copy the values out of the ret val registers.
   1436   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1437     CCValAssign VA = RVLocs[i];
   1438 
   1439     SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
   1440                                      InFlag);
   1441     Chain = Val.getValue(1);
   1442     InFlag = Val.getValue(2);
   1443     InVals.push_back(Val);
   1444    }
   1445 
   1446   return Chain;
   1447 }
   1448 
   1449 SDValue
   1450 SPUTargetLowering::LowerReturn(SDValue Chain,
   1451                                CallingConv::ID CallConv, bool isVarArg,
   1452                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1453                                const SmallVectorImpl<SDValue> &OutVals,
   1454                                DebugLoc dl, SelectionDAG &DAG) const {
   1455 
   1456   SmallVector<CCValAssign, 16> RVLocs;
   1457   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1458 		 getTargetMachine(), RVLocs, *DAG.getContext());
   1459   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
   1460 
   1461   // If this is the first return lowered for this function, add the regs to the
   1462   // liveout set for the function.
   1463   if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
   1464     for (unsigned i = 0; i != RVLocs.size(); ++i)
   1465       DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
   1466   }
   1467 
   1468   SDValue Flag;
   1469 
   1470   // Copy the result values into the output registers.
   1471   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1472     CCValAssign &VA = RVLocs[i];
   1473     assert(VA.isRegLoc() && "Can only return in registers!");
   1474     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
   1475                              OutVals[i], Flag);
   1476     Flag = Chain.getValue(1);
   1477   }
   1478 
   1479   if (Flag.getNode())
   1480     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
   1481   else
   1482     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
   1483 }
   1484 
   1485 
   1486 //===----------------------------------------------------------------------===//
   1487 // Vector related lowering:
   1488 //===----------------------------------------------------------------------===//
   1489 
   1490 static ConstantSDNode *
   1491 getVecImm(SDNode *N) {
   1492   SDValue OpVal(0, 0);
   1493 
   1494   // Check to see if this buildvec has a single non-undef value in its elements.
   1495   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
   1496     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
   1497     if (OpVal.getNode() == 0)
   1498       OpVal = N->getOperand(i);
   1499     else if (OpVal != N->getOperand(i))
   1500       return 0;
   1501   }
   1502 
   1503   if (OpVal.getNode() != 0) {
   1504     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
   1505       return CN;
   1506     }
   1507   }
   1508 
   1509   return 0;
   1510 }
   1511 
   1512 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
   1513 /// and the value fits into an unsigned 18-bit constant, and if so, return the
   1514 /// constant
   1515 SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
   1516                               EVT ValueType) {
   1517   if (ConstantSDNode *CN = getVecImm(N)) {
   1518     uint64_t Value = CN->getZExtValue();
   1519     if (ValueType == MVT::i64) {
   1520       uint64_t UValue = CN->getZExtValue();
   1521       uint32_t upper = uint32_t(UValue >> 32);
   1522       uint32_t lower = uint32_t(UValue);
   1523       if (upper != lower)
   1524         return SDValue();
   1525       Value = Value >> 32;
   1526     }
   1527     if (Value <= 0x3ffff)
   1528       return DAG.getTargetConstant(Value, ValueType);
   1529   }
   1530 
   1531   return SDValue();
   1532 }
   1533 
   1534 /// get_vec_i16imm - Test if this vector is a vector filled with the same value
   1535 /// and the value fits into a signed 16-bit constant, and if so, return the
   1536 /// constant
   1537 SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
   1538                               EVT ValueType) {
   1539   if (ConstantSDNode *CN = getVecImm(N)) {
   1540     int64_t Value = CN->getSExtValue();
   1541     if (ValueType == MVT::i64) {
   1542       uint64_t UValue = CN->getZExtValue();
   1543       uint32_t upper = uint32_t(UValue >> 32);
   1544       uint32_t lower = uint32_t(UValue);
   1545       if (upper != lower)
   1546         return SDValue();
   1547       Value = Value >> 32;
   1548     }
   1549     if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
   1550       return DAG.getTargetConstant(Value, ValueType);
   1551     }
   1552   }
   1553 
   1554   return SDValue();
   1555 }
   1556 
   1557 /// get_vec_i10imm - Test if this vector is a vector filled with the same value
   1558 /// and the value fits into a signed 10-bit constant, and if so, return the
   1559 /// constant
   1560 SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
   1561                               EVT ValueType) {
   1562   if (ConstantSDNode *CN = getVecImm(N)) {
   1563     int64_t Value = CN->getSExtValue();
   1564     if (ValueType == MVT::i64) {
   1565       uint64_t UValue = CN->getZExtValue();
   1566       uint32_t upper = uint32_t(UValue >> 32);
   1567       uint32_t lower = uint32_t(UValue);
   1568       if (upper != lower)
   1569         return SDValue();
   1570       Value = Value >> 32;
   1571     }
   1572     if (isInt<10>(Value))
   1573       return DAG.getTargetConstant(Value, ValueType);
   1574   }
   1575 
   1576   return SDValue();
   1577 }
   1578 
   1579 /// get_vec_i8imm - Test if this vector is a vector filled with the same value
   1580 /// and the value fits into a signed 8-bit constant, and if so, return the
   1581 /// constant.
   1582 ///
   1583 /// @note: The incoming vector is v16i8 because that's the only way we can load
   1584 /// constant vectors. Thus, we test to see if the upper and lower bytes are the
   1585 /// same value.
   1586 SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
   1587                              EVT ValueType) {
   1588   if (ConstantSDNode *CN = getVecImm(N)) {
   1589     int Value = (int) CN->getZExtValue();
   1590     if (ValueType == MVT::i16
   1591         && Value <= 0xffff                 /* truncated from uint64_t */
   1592         && ((short) Value >> 8) == ((short) Value & 0xff))
   1593       return DAG.getTargetConstant(Value & 0xff, ValueType);
   1594     else if (ValueType == MVT::i8
   1595              && (Value & 0xff) == Value)
   1596       return DAG.getTargetConstant(Value, ValueType);
   1597   }
   1598 
   1599   return SDValue();
   1600 }
   1601 
   1602 /// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
   1603 /// and the value fits into a signed 16-bit constant, and if so, return the
   1604 /// constant
   1605 SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
   1606                                EVT ValueType) {
   1607   if (ConstantSDNode *CN = getVecImm(N)) {
   1608     uint64_t Value = CN->getZExtValue();
   1609     if ((ValueType == MVT::i32
   1610           && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
   1611         || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
   1612       return DAG.getTargetConstant(Value >> 16, ValueType);
   1613   }
   1614 
   1615   return SDValue();
   1616 }
   1617 
   1618 /// get_v4i32_imm - Catch-all for general 32-bit constant vectors
   1619 SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
   1620   if (ConstantSDNode *CN = getVecImm(N)) {
   1621     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
   1622   }
   1623 
   1624   return SDValue();
   1625 }
   1626 
   1627 /// get_v4i32_imm - Catch-all for general 64-bit constant vectors
   1628 SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
   1629   if (ConstantSDNode *CN = getVecImm(N)) {
   1630     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
   1631   }
   1632 
   1633   return SDValue();
   1634 }
   1635 
   1636 //! Lower a BUILD_VECTOR instruction creatively:
   1637 static SDValue
   1638 LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   1639   EVT VT = Op.getValueType();
   1640   EVT EltVT = VT.getVectorElementType();
   1641   DebugLoc dl = Op.getDebugLoc();
   1642   BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
   1643   assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
   1644   unsigned minSplatBits = EltVT.getSizeInBits();
   1645 
   1646   if (minSplatBits < 16)
   1647     minSplatBits = 16;
   1648 
   1649   APInt APSplatBits, APSplatUndef;
   1650   unsigned SplatBitSize;
   1651   bool HasAnyUndefs;
   1652 
   1653   if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
   1654                             HasAnyUndefs, minSplatBits)
   1655       || minSplatBits < SplatBitSize)
   1656     return SDValue();   // Wasn't a constant vector or splat exceeded min
   1657 
   1658   uint64_t SplatBits = APSplatBits.getZExtValue();
   1659 
   1660   switch (VT.getSimpleVT().SimpleTy) {
   1661   default:
   1662     report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
   1663                        Twine(VT.getEVTString()));
   1664     /*NOTREACHED*/
   1665   case MVT::v4f32: {
   1666     uint32_t Value32 = uint32_t(SplatBits);
   1667     assert(SplatBitSize == 32
   1668            && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
   1669     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
   1670     SDValue T = DAG.getConstant(Value32, MVT::i32);
   1671     return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
   1672                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
   1673     break;
   1674   }
   1675   case MVT::v2f64: {
   1676     uint64_t f64val = uint64_t(SplatBits);
   1677     assert(SplatBitSize == 64
   1678            && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
   1679     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
   1680     SDValue T = DAG.getConstant(f64val, MVT::i64);
   1681     return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
   1682                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
   1683     break;
   1684   }
   1685   case MVT::v16i8: {
   1686    // 8-bit constants have to be expanded to 16-bits
   1687    unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
   1688    SmallVector<SDValue, 8> Ops;
   1689 
   1690    Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
   1691    return DAG.getNode(ISD::BITCAST, dl, VT,
   1692                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
   1693   }
   1694   case MVT::v8i16: {
   1695     unsigned short Value16 = SplatBits;
   1696     SDValue T = DAG.getConstant(Value16, EltVT);
   1697     SmallVector<SDValue, 8> Ops;
   1698 
   1699     Ops.assign(8, T);
   1700     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
   1701   }
   1702   case MVT::v4i32: {
   1703     SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
   1704     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
   1705   }
   1706   case MVT::v2i64: {
   1707     return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
   1708   }
   1709   }
   1710 
   1711   return SDValue();
   1712 }
   1713 
   1714 /*!
   1715  */
   1716 SDValue
   1717 SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
   1718                      DebugLoc dl) {
   1719   uint32_t upper = uint32_t(SplatVal >> 32);
   1720   uint32_t lower = uint32_t(SplatVal);
   1721 
   1722   if (upper == lower) {
   1723     // Magic constant that can be matched by IL, ILA, et. al.
   1724     SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
   1725     return DAG.getNode(ISD::BITCAST, dl, OpVT,
   1726                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1727                                    Val, Val, Val, Val));
   1728   } else {
   1729     bool upper_special, lower_special;
   1730 
   1731     // NOTE: This code creates common-case shuffle masks that can be easily
   1732     // detected as common expressions. It is not attempting to create highly
   1733     // specialized masks to replace any and all 0's, 0xff's and 0x80's.
   1734 
   1735     // Detect if the upper or lower half is a special shuffle mask pattern:
   1736     upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
   1737     lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
   1738 
   1739     // Both upper and lower are special, lower to a constant pool load:
   1740     if (lower_special && upper_special) {
   1741       SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64);
   1742       return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
   1743                          SplatValCN, SplatValCN);
   1744     }
   1745 
   1746     SDValue LO32;
   1747     SDValue HI32;
   1748     SmallVector<SDValue, 16> ShufBytes;
   1749     SDValue Result;
   1750 
   1751     // Create lower vector if not a special pattern
   1752     if (!lower_special) {
   1753       SDValue LO32C = DAG.getConstant(lower, MVT::i32);
   1754       LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
   1755                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1756                                      LO32C, LO32C, LO32C, LO32C));
   1757     }
   1758 
   1759     // Create upper vector if not a special pattern
   1760     if (!upper_special) {
   1761       SDValue HI32C = DAG.getConstant(upper, MVT::i32);
   1762       HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
   1763                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1764                                      HI32C, HI32C, HI32C, HI32C));
   1765     }
   1766 
   1767     // If either upper or lower are special, then the two input operands are
   1768     // the same (basically, one of them is a "don't care")
   1769     if (lower_special)
   1770       LO32 = HI32;
   1771     if (upper_special)
   1772       HI32 = LO32;
   1773 
   1774     for (int i = 0; i < 4; ++i) {
   1775       uint64_t val = 0;
   1776       for (int j = 0; j < 4; ++j) {
   1777         SDValue V;
   1778         bool process_upper, process_lower;
   1779         val <<= 8;
   1780         process_upper = (upper_special && (i & 1) == 0);
   1781         process_lower = (lower_special && (i & 1) == 1);
   1782 
   1783         if (process_upper || process_lower) {
   1784           if ((process_upper && upper == 0)
   1785                   || (process_lower && lower == 0))
   1786             val |= 0x80;
   1787           else if ((process_upper && upper == 0xffffffff)
   1788                   || (process_lower && lower == 0xffffffff))
   1789             val |= 0xc0;
   1790           else if ((process_upper && upper == 0x80000000)
   1791                   || (process_lower && lower == 0x80000000))
   1792             val |= (j == 0 ? 0xe0 : 0x80);
   1793         } else
   1794           val |= i * 4 + j + ((i & 1) * 16);
   1795       }
   1796 
   1797       ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
   1798     }
   1799 
   1800     return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
   1801                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1802                                    &ShufBytes[0], ShufBytes.size()));
   1803   }
   1804 }
   1805 
   1806 /// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
   1807 /// which the Cell can operate. The code inspects V3 to ascertain whether the
   1808 /// permutation vector, V3, is monotonically increasing with one "exception"
   1809 /// element, e.g., (0, 1, _, 3). If this is the case, then generate a
   1810 /// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
   1811 /// In either case, the net result is going to eventually invoke SHUFB to
   1812 /// permute/shuffle the bytes from V1 and V2.
   1813 /// \note
   1814 /// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
   1815 /// control word for byte/halfword/word insertion. This takes care of a single
   1816 /// element move from V2 into V1.
   1817 /// \note
   1818 /// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
   1819 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   1820   const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
   1821   SDValue V1 = Op.getOperand(0);
   1822   SDValue V2 = Op.getOperand(1);
   1823   DebugLoc dl = Op.getDebugLoc();
   1824 
   1825   if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
   1826 
   1827   // If we have a single element being moved from V1 to V2, this can be handled
   1828   // using the C*[DX] compute mask instructions, but the vector elements have
   1829   // to be monotonically increasing with one exception element, and the source
   1830   // slot of the element to move must be the same as the destination.
   1831   EVT VecVT = V1.getValueType();
   1832   EVT EltVT = VecVT.getVectorElementType();
   1833   unsigned EltsFromV2 = 0;
   1834   unsigned V2EltOffset = 0;
   1835   unsigned V2EltIdx0 = 0;
   1836   unsigned CurrElt = 0;
   1837   unsigned MaxElts = VecVT.getVectorNumElements();
   1838   unsigned PrevElt = 0;
   1839   bool monotonic = true;
   1840   bool rotate = true;
   1841   int rotamt=0;
   1842   EVT maskVT;             // which of the c?d instructions to use
   1843 
   1844   if (EltVT == MVT::i8) {
   1845     V2EltIdx0 = 16;
   1846     maskVT = MVT::v16i8;
   1847   } else if (EltVT == MVT::i16) {
   1848     V2EltIdx0 = 8;
   1849     maskVT = MVT::v8i16;
   1850   } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
   1851     V2EltIdx0 = 4;
   1852     maskVT = MVT::v4i32;
   1853   } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
   1854     V2EltIdx0 = 2;
   1855     maskVT = MVT::v2i64;
   1856   } else
   1857     llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");
   1858 
   1859   for (unsigned i = 0; i != MaxElts; ++i) {
   1860     if (SVN->getMaskElt(i) < 0)
   1861       continue;
   1862 
   1863     unsigned SrcElt = SVN->getMaskElt(i);
   1864 
   1865     if (monotonic) {
   1866       if (SrcElt >= V2EltIdx0) {
   1867         // TODO: optimize for the monotonic case when several consecutive
   1868         // elements are taken form V2. Do we ever get such a case?
   1869         if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
   1870           V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
   1871         else
   1872           monotonic = false;
   1873         ++EltsFromV2;
   1874       } else if (CurrElt != SrcElt) {
   1875         monotonic = false;
   1876       }
   1877 
   1878       ++CurrElt;
   1879     }
   1880 
   1881     if (rotate) {
   1882       if (PrevElt > 0 && SrcElt < MaxElts) {
   1883         if ((PrevElt == SrcElt - 1)
   1884             || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
   1885           PrevElt = SrcElt;
   1886         } else {
   1887           rotate = false;
   1888         }
   1889       } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
   1890         // First time or after a "wrap around"
   1891         rotamt = SrcElt-i;
   1892         PrevElt = SrcElt;
   1893       } else {
   1894         // This isn't a rotation, takes elements from vector 2
   1895         rotate = false;
   1896       }
   1897     }
   1898   }
   1899 
   1900   if (EltsFromV2 == 1 && monotonic) {
   1901     // Compute mask and shuffle
   1902     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   1903 
   1904     // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
   1905     // R1 ($sp) is used here only as it is guaranteed to have last bits zero
   1906     SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
   1907                                 DAG.getRegister(SPU::R1, PtrVT),
   1908                                 DAG.getConstant(V2EltOffset, MVT::i32));
   1909     SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
   1910                                      maskVT, Pointer);
   1911 
   1912     // Use shuffle mask in SHUFB synthetic instruction:
   1913     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
   1914                        ShufMaskOp);
   1915   } else if (rotate) {
   1916     if (rotamt < 0)
   1917       rotamt +=MaxElts;
   1918     rotamt *= EltVT.getSizeInBits()/8;
   1919     return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
   1920                        V1, DAG.getConstant(rotamt, MVT::i16));
   1921   } else {
   1922    // Convert the SHUFFLE_VECTOR mask's input element units to the
   1923    // actual bytes.
   1924     unsigned BytesPerElement = EltVT.getSizeInBits()/8;
   1925 
   1926     SmallVector<SDValue, 16> ResultMask;
   1927     for (unsigned i = 0, e = MaxElts; i != e; ++i) {
   1928       unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);
   1929 
   1930       for (unsigned j = 0; j < BytesPerElement; ++j)
   1931         ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
   1932     }
   1933     SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
   1934                                     &ResultMask[0], ResultMask.size());
   1935     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
   1936   }
   1937 }
   1938 
   1939 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   1940   SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
   1941   DebugLoc dl = Op.getDebugLoc();
   1942 
   1943   if (Op0.getNode()->getOpcode() == ISD::Constant) {
   1944     // For a constant, build the appropriate constant vector, which will
   1945     // eventually simplify to a vector register load.
   1946 
   1947     ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
   1948     SmallVector<SDValue, 16> ConstVecValues;
   1949     EVT VT;
   1950     size_t n_copies;
   1951 
   1952     // Create a constant vector:
   1953     switch (Op.getValueType().getSimpleVT().SimpleTy) {
   1954     default: llvm_unreachable("Unexpected constant value type in "
   1955                               "LowerSCALAR_TO_VECTOR");
   1956     case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
   1957     case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
   1958     case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
   1959     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
   1960     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
   1961     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
   1962     }
   1963 
   1964     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
   1965     for (size_t j = 0; j < n_copies; ++j)
   1966       ConstVecValues.push_back(CValue);
   1967 
   1968     return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
   1969                        &ConstVecValues[0], ConstVecValues.size());
   1970   } else {
   1971     // Otherwise, copy the value from one register to another:
   1972     switch (Op0.getValueType().getSimpleVT().SimpleTy) {
   1973     default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
   1974     case MVT::i8:
   1975     case MVT::i16:
   1976     case MVT::i32:
   1977     case MVT::i64:
   1978     case MVT::f32:
   1979     case MVT::f64:
   1980       return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
   1981     }
   1982   }
   1983 
   1984   return SDValue();
   1985 }
   1986 
   1987 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   1988   EVT VT = Op.getValueType();
   1989   SDValue N = Op.getOperand(0);
   1990   SDValue Elt = Op.getOperand(1);
   1991   DebugLoc dl = Op.getDebugLoc();
   1992   SDValue retval;
   1993 
   1994   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
   1995     // Constant argument:
   1996     int EltNo = (int) C->getZExtValue();
   1997 
   1998     // sanity checks:
   1999     if (VT == MVT::i8 && EltNo >= 16)
   2000       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
   2001     else if (VT == MVT::i16 && EltNo >= 8)
   2002       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
   2003     else if (VT == MVT::i32 && EltNo >= 4)
   2004       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
   2005     else if (VT == MVT::i64 && EltNo >= 2)
   2006       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
   2007 
   2008     if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
   2009       // i32 and i64: Element 0 is the preferred slot
   2010       return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
   2011     }
   2012 
   2013     // Need to generate shuffle mask and extract:
   2014     int prefslot_begin = -1, prefslot_end = -1;
   2015     int elt_byte = EltNo * VT.getSizeInBits() / 8;
   2016 
   2017     switch (VT.getSimpleVT().SimpleTy) {
   2018     default:
   2019       assert(false && "Invalid value type!");
   2020     case MVT::i8: {
   2021       prefslot_begin = prefslot_end = 3;
   2022       break;
   2023     }
   2024     case MVT::i16: {
   2025       prefslot_begin = 2; prefslot_end = 3;
   2026       break;
   2027     }
   2028     case MVT::i32:
   2029     case MVT::f32: {
   2030       prefslot_begin = 0; prefslot_end = 3;
   2031       break;
   2032     }
   2033     case MVT::i64:
   2034     case MVT::f64: {
   2035       prefslot_begin = 0; prefslot_end = 7;
   2036       break;
   2037     }
   2038     }
   2039 
   2040     assert(prefslot_begin != -1 && prefslot_end != -1 &&
   2041            "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
   2042 
   2043     unsigned int ShufBytes[16] = {
   2044       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
   2045     };
   2046     for (int i = 0; i < 16; ++i) {
   2047       // zero fill uppper part of preferred slot, don't care about the
   2048       // other slots:
   2049       unsigned int mask_val;
   2050       if (i <= prefslot_end) {
   2051         mask_val =
   2052           ((i < prefslot_begin)
   2053            ? 0x80
   2054            : elt_byte + (i - prefslot_begin));
   2055 
   2056         ShufBytes[i] = mask_val;
   2057       } else
   2058         ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
   2059     }
   2060 
   2061     SDValue ShufMask[4];
   2062     for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
   2063       unsigned bidx = i * 4;
   2064       unsigned int bits = ((ShufBytes[bidx] << 24) |
   2065                            (ShufBytes[bidx+1] << 16) |
   2066                            (ShufBytes[bidx+2] << 8) |
   2067                            ShufBytes[bidx+3]);
   2068       ShufMask[i] = DAG.getConstant(bits, MVT::i32);
   2069     }
   2070 
   2071     SDValue ShufMaskVec =
   2072       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2073                   &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
   2074 
   2075     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   2076                          DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
   2077                                      N, N, ShufMaskVec));
   2078   } else {
   2079     // Variable index: Rotate the requested element into slot 0, then replicate
   2080     // slot 0 across the vector
   2081     EVT VecVT = N.getValueType();
   2082     if (!VecVT.isSimple() || !VecVT.isVector()) {
   2083       report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
   2084                         "vector type!");
   2085     }
   2086 
   2087     // Make life easier by making sure the index is zero-extended to i32
   2088     if (Elt.getValueType() != MVT::i32)
   2089       Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
   2090 
   2091     // Scale the index to a bit/byte shift quantity
   2092     APInt scaleFactor =
   2093             APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
   2094     unsigned scaleShift = scaleFactor.logBase2();
   2095     SDValue vecShift;
   2096 
   2097     if (scaleShift > 0) {
   2098       // Scale the shift factor:
   2099       Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
   2100                         DAG.getConstant(scaleShift, MVT::i32));
   2101     }
   2102 
   2103     vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
   2104 
   2105     // Replicate the bytes starting at byte 0 across the entire vector (for
   2106     // consistency with the notion of a unified register set)
   2107     SDValue replicate;
   2108 
   2109     switch (VT.getSimpleVT().SimpleTy) {
   2110     default:
   2111       report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
   2112                         "type");
   2113       /*NOTREACHED*/
   2114     case MVT::i8: {
   2115       SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
   2116       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2117                               factor, factor, factor, factor);
   2118       break;
   2119     }
   2120     case MVT::i16: {
   2121       SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
   2122       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2123                               factor, factor, factor, factor);
   2124       break;
   2125     }
   2126     case MVT::i32:
   2127     case MVT::f32: {
   2128       SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
   2129       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2130                               factor, factor, factor, factor);
   2131       break;
   2132     }
   2133     case MVT::i64:
   2134     case MVT::f64: {
   2135       SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
   2136       SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
   2137       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2138                               loFactor, hiFactor, loFactor, hiFactor);
   2139       break;
   2140     }
   2141     }
   2142 
   2143     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   2144                          DAG.getNode(SPUISD::SHUFB, dl, VecVT,
   2145                                      vecShift, vecShift, replicate));
   2146   }
   2147 
   2148   return retval;
   2149 }
   2150 
   2151 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   2152   SDValue VecOp = Op.getOperand(0);
   2153   SDValue ValOp = Op.getOperand(1);
   2154   SDValue IdxOp = Op.getOperand(2);
   2155   DebugLoc dl = Op.getDebugLoc();
   2156   EVT VT = Op.getValueType();
   2157   EVT eltVT = ValOp.getValueType();
   2158 
   2159   // use 0 when the lane to insert to is 'undef'
   2160   int64_t Offset=0;
   2161   if (IdxOp.getOpcode() != ISD::UNDEF) {
   2162     ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
   2163     assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
   2164     Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
   2165   }
   2166 
   2167   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   2168   // Use $sp ($1) because it's always 16-byte aligned and it's available:
   2169   SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
   2170                                 DAG.getRegister(SPU::R1, PtrVT),
   2171                                 DAG.getConstant(Offset, PtrVT));
   2172   // widen the mask when dealing with half vectors
   2173   EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
   2174                                 128/ VT.getVectorElementType().getSizeInBits());
   2175   SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
   2176 
   2177   SDValue result =
   2178     DAG.getNode(SPUISD::SHUFB, dl, VT,
   2179                 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
   2180                 VecOp,
   2181                 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));
   2182 
   2183   return result;
   2184 }
   2185 
   2186 static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
   2187                            const TargetLowering &TLI)
   2188 {
   2189   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
   2190   DebugLoc dl = Op.getDebugLoc();
   2191   EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
   2192 
   2193   assert(Op.getValueType() == MVT::i8);
   2194   switch (Opc) {
   2195   default:
   2196     llvm_unreachable("Unhandled i8 math operator");
   2197     /*NOTREACHED*/
   2198     break;
   2199   case ISD::ADD: {
   2200     // 8-bit addition: Promote the arguments up to 16-bits and truncate
   2201     // the result:
   2202     SDValue N1 = Op.getOperand(1);
   2203     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2204     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2205     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2206                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2207 
   2208   }
   2209 
   2210   case ISD::SUB: {
   2211     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
   2212     // the result:
   2213     SDValue N1 = Op.getOperand(1);
   2214     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2215     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2216     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2217                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2218   }
   2219   case ISD::ROTR:
   2220   case ISD::ROTL: {
   2221     SDValue N1 = Op.getOperand(1);
   2222     EVT N1VT = N1.getValueType();
   2223 
   2224     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
   2225     if (!N1VT.bitsEq(ShiftVT)) {
   2226       unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
   2227                        ? ISD::ZERO_EXTEND
   2228                        : ISD::TRUNCATE;
   2229       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2230     }
   2231 
   2232     // Replicate lower 8-bits into upper 8:
   2233     SDValue ExpandArg =
   2234       DAG.getNode(ISD::OR, dl, MVT::i16, N0,
   2235                   DAG.getNode(ISD::SHL, dl, MVT::i16,
   2236                               N0, DAG.getConstant(8, MVT::i32)));
   2237 
   2238     // Truncate back down to i8
   2239     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2240                        DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
   2241   }
   2242   case ISD::SRL:
   2243   case ISD::SHL: {
   2244     SDValue N1 = Op.getOperand(1);
   2245     EVT N1VT = N1.getValueType();
   2246 
   2247     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
   2248     if (!N1VT.bitsEq(ShiftVT)) {
   2249       unsigned N1Opc = ISD::ZERO_EXTEND;
   2250 
   2251       if (N1.getValueType().bitsGT(ShiftVT))
   2252         N1Opc = ISD::TRUNCATE;
   2253 
   2254       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2255     }
   2256 
   2257     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2258                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2259   }
   2260   case ISD::SRA: {
   2261     SDValue N1 = Op.getOperand(1);
   2262     EVT N1VT = N1.getValueType();
   2263 
   2264     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2265     if (!N1VT.bitsEq(ShiftVT)) {
   2266       unsigned N1Opc = ISD::SIGN_EXTEND;
   2267 
   2268       if (N1VT.bitsGT(ShiftVT))
   2269         N1Opc = ISD::TRUNCATE;
   2270       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2271     }
   2272 
   2273     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2274                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2275   }
   2276   case ISD::MUL: {
   2277     SDValue N1 = Op.getOperand(1);
   2278 
   2279     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2280     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2281     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2282                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2283     break;
   2284   }
   2285   }
   2286 
   2287   return SDValue();
   2288 }
   2289 
   2290 //! Lower byte immediate operations for v16i8 vectors:
   2291 static SDValue
   2292 LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
   2293   SDValue ConstVec;
   2294   SDValue Arg;
   2295   EVT VT = Op.getValueType();
   2296   DebugLoc dl = Op.getDebugLoc();
   2297 
   2298   ConstVec = Op.getOperand(0);
   2299   Arg = Op.getOperand(1);
   2300   if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
   2301     if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
   2302       ConstVec = ConstVec.getOperand(0);
   2303     } else {
   2304       ConstVec = Op.getOperand(1);
   2305       Arg = Op.getOperand(0);
   2306       if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
   2307         ConstVec = ConstVec.getOperand(0);
   2308       }
   2309     }
   2310   }
   2311 
   2312   if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
   2313     BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
   2314     assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");
   2315 
   2316     APInt APSplatBits, APSplatUndef;
   2317     unsigned SplatBitSize;
   2318     bool HasAnyUndefs;
   2319     unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
   2320 
   2321     if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
   2322                               HasAnyUndefs, minSplatBits)
   2323         && minSplatBits <= SplatBitSize) {
   2324       uint64_t SplatBits = APSplatBits.getZExtValue();
   2325       SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
   2326 
   2327       SmallVector<SDValue, 16> tcVec;
   2328       tcVec.assign(16, tc);
   2329       return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
   2330                          DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
   2331     }
   2332   }
   2333 
   2334   // These operations (AND, OR, XOR) are legal, they just couldn't be custom
   2335   // lowered.  Return the operation, rather than a null SDValue.
   2336   return Op;
   2337 }
   2338 
   2339 //! Custom lowering for CTPOP (count population)
   2340 /*!
   2341   Custom lowering code that counts the number ones in the input
   2342   operand. SPU has such an instruction, but it counts the number of
   2343   ones per byte, which then have to be accumulated.
   2344 */
   2345 static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
   2346   EVT VT = Op.getValueType();
   2347   EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
   2348                                VT, (128 / VT.getSizeInBits()));
   2349   DebugLoc dl = Op.getDebugLoc();
   2350 
   2351   switch (VT.getSimpleVT().SimpleTy) {
   2352   default:
   2353     assert(false && "Invalid value type!");
   2354   case MVT::i8: {
   2355     SDValue N = Op.getOperand(0);
   2356     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
   2357 
   2358     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2359     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2360 
   2361     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
   2362   }
   2363 
   2364   case MVT::i16: {
   2365     MachineFunction &MF = DAG.getMachineFunction();
   2366     MachineRegisterInfo &RegInfo = MF.getRegInfo();
   2367 
   2368     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
   2369 
   2370     SDValue N = Op.getOperand(0);
   2371     SDValue Elt0 = DAG.getConstant(0, MVT::i16);
   2372     SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
   2373     SDValue Shift1 = DAG.getConstant(8, MVT::i32);
   2374 
   2375     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2376     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2377 
   2378     // CNTB_result becomes the chain to which all of the virtual registers
   2379     // CNTB_reg, SUM1_reg become associated:
   2380     SDValue CNTB_result =
   2381       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
   2382 
   2383     SDValue CNTB_rescopy =
   2384       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
   2385 
   2386     SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
   2387 
   2388     return DAG.getNode(ISD::AND, dl, MVT::i16,
   2389                        DAG.getNode(ISD::ADD, dl, MVT::i16,
   2390                                    DAG.getNode(ISD::SRL, dl, MVT::i16,
   2391                                                Tmp1, Shift1),
   2392                                    Tmp1),
   2393                        Mask0);
   2394   }
   2395 
   2396   case MVT::i32: {
   2397     MachineFunction &MF = DAG.getMachineFunction();
   2398     MachineRegisterInfo &RegInfo = MF.getRegInfo();
   2399 
   2400     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
   2401     unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
   2402 
   2403     SDValue N = Op.getOperand(0);
   2404     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
   2405     SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
   2406     SDValue Shift1 = DAG.getConstant(16, MVT::i32);
   2407     SDValue Shift2 = DAG.getConstant(8, MVT::i32);
   2408 
   2409     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2410     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2411 
   2412     // CNTB_result becomes the chain to which all of the virtual registers
   2413     // CNTB_reg, SUM1_reg become associated:
   2414     SDValue CNTB_result =
   2415       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
   2416 
   2417     SDValue CNTB_rescopy =
   2418       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
   2419 
   2420     SDValue Comp1 =
   2421       DAG.getNode(ISD::SRL, dl, MVT::i32,
   2422                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
   2423                   Shift1);
   2424 
   2425     SDValue Sum1 =
   2426       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
   2427                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
   2428 
   2429     SDValue Sum1_rescopy =
   2430       DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
   2431 
   2432     SDValue Comp2 =
   2433       DAG.getNode(ISD::SRL, dl, MVT::i32,
   2434                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
   2435                   Shift2);
   2436     SDValue Sum2 =
   2437       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
   2438                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
   2439 
   2440     return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
   2441   }
   2442 
   2443   case MVT::i64:
   2444     break;
   2445   }
   2446 
   2447   return SDValue();
   2448 }
   2449 
   2450 //! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
   2451 /*!
   2452  f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
   2453  All conversions to i64 are expanded to a libcall.
   2454  */
   2455 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   2456                               const SPUTargetLowering &TLI) {
   2457   EVT OpVT = Op.getValueType();
   2458   SDValue Op0 = Op.getOperand(0);
   2459   EVT Op0VT = Op0.getValueType();
   2460 
   2461   if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
   2462       || OpVT == MVT::i64) {
   2463     // Convert f32 / f64 to i32 / i64 via libcall.
   2464     RTLIB::Libcall LC =
   2465             (Op.getOpcode() == ISD::FP_TO_SINT)
   2466              ? RTLIB::getFPTOSINT(Op0VT, OpVT)
   2467              : RTLIB::getFPTOUINT(Op0VT, OpVT);
   2468     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
   2469     SDValue Dummy;
   2470     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
   2471   }
   2472 
   2473   return Op;
   2474 }
   2475 
   2476 //! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
   2477 /*!
   2478  i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
   2479  All conversions from i64 are expanded to a libcall.
   2480  */
   2481 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
   2482                               const SPUTargetLowering &TLI) {
   2483   EVT OpVT = Op.getValueType();
   2484   SDValue Op0 = Op.getOperand(0);
   2485   EVT Op0VT = Op0.getValueType();
   2486 
   2487   if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
   2488       || Op0VT == MVT::i64) {
   2489     // Convert i32, i64 to f64 via libcall:
   2490     RTLIB::Libcall LC =
   2491             (Op.getOpcode() == ISD::SINT_TO_FP)
   2492              ? RTLIB::getSINTTOFP(Op0VT, OpVT)
   2493              : RTLIB::getUINTTOFP(Op0VT, OpVT);
   2494     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
   2495     SDValue Dummy;
   2496     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
   2497   }
   2498 
   2499   return Op;
   2500 }
   2501 
   2502 //! Lower ISD::SETCC
   2503 /*!
   2504  This handles MVT::f64 (double floating point) condition lowering
   2505  */
   2506 static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
   2507                           const TargetLowering &TLI) {
   2508   CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
   2509   DebugLoc dl = Op.getDebugLoc();
   2510   assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
   2511 
   2512   SDValue lhs = Op.getOperand(0);
   2513   SDValue rhs = Op.getOperand(1);
   2514   EVT lhsVT = lhs.getValueType();
   2515   assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
   2516 
   2517   EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
   2518   APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
   2519   EVT IntVT(MVT::i64);
   2520 
   2521   // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
   2522   // selected to a NOP:
   2523   SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
   2524   SDValue lhsHi32 =
   2525           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   2526                       DAG.getNode(ISD::SRL, dl, IntVT,
   2527                                   i64lhs, DAG.getConstant(32, MVT::i32)));
   2528   SDValue lhsHi32abs =
   2529           DAG.getNode(ISD::AND, dl, MVT::i32,
   2530                       lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
   2531   SDValue lhsLo32 =
   2532           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
   2533 
   2534   // SETO and SETUO only use the lhs operand:
   2535   if (CC->get() == ISD::SETO) {
   2536     // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
   2537     // SETUO
   2538     APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
   2539     return DAG.getNode(ISD::XOR, dl, ccResultVT,
   2540                        DAG.getSetCC(dl, ccResultVT,
   2541                                     lhs, DAG.getConstantFP(0.0, lhsVT),
   2542                                     ISD::SETUO),
   2543                        DAG.getConstant(ccResultAllOnes, ccResultVT));
   2544   } else if (CC->get() == ISD::SETUO) {
   2545     // Evaluates to true if Op0 is [SQ]NaN
   2546     return DAG.getNode(ISD::AND, dl, ccResultVT,
   2547                        DAG.getSetCC(dl, ccResultVT,
   2548                                     lhsHi32abs,
   2549                                     DAG.getConstant(0x7ff00000, MVT::i32),
   2550                                     ISD::SETGE),
   2551                        DAG.getSetCC(dl, ccResultVT,
   2552                                     lhsLo32,
   2553                                     DAG.getConstant(0, MVT::i32),
   2554                                     ISD::SETGT));
   2555   }
   2556 
   2557   SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
   2558   SDValue rhsHi32 =
   2559           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   2560                       DAG.getNode(ISD::SRL, dl, IntVT,
   2561                                   i64rhs, DAG.getConstant(32, MVT::i32)));
   2562 
   2563   // If a value is negative, subtract from the sign magnitude constant:
   2564   SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
   2565 
   2566   // Convert the sign-magnitude representation into 2's complement:
   2567   SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
   2568                                       lhsHi32, DAG.getConstant(31, MVT::i32));
   2569   SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
   2570   SDValue lhsSelect =
   2571           DAG.getNode(ISD::SELECT, dl, IntVT,
   2572                       lhsSelectMask, lhsSignMag2TC, i64lhs);
   2573 
   2574   SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
   2575                                       rhsHi32, DAG.getConstant(31, MVT::i32));
   2576   SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
   2577   SDValue rhsSelect =
   2578           DAG.getNode(ISD::SELECT, dl, IntVT,
   2579                       rhsSelectMask, rhsSignMag2TC, i64rhs);
   2580 
   2581   unsigned compareOp;
   2582 
   2583   switch (CC->get()) {
   2584   case ISD::SETOEQ:
   2585   case ISD::SETUEQ:
   2586     compareOp = ISD::SETEQ; break;
   2587   case ISD::SETOGT:
   2588   case ISD::SETUGT:
   2589     compareOp = ISD::SETGT; break;
   2590   case ISD::SETOGE:
   2591   case ISD::SETUGE:
   2592     compareOp = ISD::SETGE; break;
   2593   case ISD::SETOLT:
   2594   case ISD::SETULT:
   2595     compareOp = ISD::SETLT; break;
   2596   case ISD::SETOLE:
   2597   case ISD::SETULE:
   2598     compareOp = ISD::SETLE; break;
   2599   case ISD::SETUNE:
   2600   case ISD::SETONE:
   2601     compareOp = ISD::SETNE; break;
   2602   default:
   2603     report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
   2604   }
   2605 
   2606   SDValue result =
   2607           DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
   2608                        (ISD::CondCode) compareOp);
   2609 
   2610   if ((CC->get() & 0x8) == 0) {
   2611     // Ordered comparison:
   2612     SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
   2613                                   lhs, DAG.getConstantFP(0.0, MVT::f64),
   2614                                   ISD::SETO);
   2615     SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
   2616                                   rhs, DAG.getConstantFP(0.0, MVT::f64),
   2617                                   ISD::SETO);
   2618     SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
   2619 
   2620     result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
   2621   }
   2622 
   2623   return result;
   2624 }
   2625 
   2626 //! Lower ISD::SELECT_CC
   2627 /*!
   2628   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
   2629   SELB instruction.
   2630 
   2631   \note Need to revisit this in the future: if the code path through the true
   2632   and false value computations is longer than the latency of a branch (6
   2633   cycles), then it would be more advantageous to branch and insert a new basic
   2634   block and branch on the condition. However, this code does not make that
   2635   assumption, given the simplisitc uses so far.
   2636  */
   2637 
   2638 static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
   2639                               const TargetLowering &TLI) {
   2640   EVT VT = Op.getValueType();
   2641   SDValue lhs = Op.getOperand(0);
   2642   SDValue rhs = Op.getOperand(1);
   2643   SDValue trueval = Op.getOperand(2);
   2644   SDValue falseval = Op.getOperand(3);
   2645   SDValue condition = Op.getOperand(4);
   2646   DebugLoc dl = Op.getDebugLoc();
   2647 
   2648   // NOTE: SELB's arguments: $rA, $rB, $mask
   2649   //
   2650   // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
   2651   // where bits in $mask are 1. CCond will be inverted, having 1s where the
   2652   // condition was true and 0s where the condition was false. Hence, the
   2653   // arguments to SELB get reversed.
   2654 
   2655   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
   2656   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
   2657   // with another "cannot select select_cc" assert:
   2658 
   2659   SDValue compare = DAG.getNode(ISD::SETCC, dl,
   2660                                 TLI.getSetCCResultType(Op.getValueType()),
   2661                                 lhs, rhs, condition);
   2662   return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
   2663 }
   2664 
   2665 //! Custom lower ISD::TRUNCATE
   2666 static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
   2667 {
   2668   // Type to truncate to
   2669   EVT VT = Op.getValueType();
   2670   MVT simpleVT = VT.getSimpleVT();
   2671   EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
   2672                                VT, (128 / VT.getSizeInBits()));
   2673   DebugLoc dl = Op.getDebugLoc();
   2674 
   2675   // Type to truncate from
   2676   SDValue Op0 = Op.getOperand(0);
   2677   EVT Op0VT = Op0.getValueType();
   2678 
   2679   if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
   2680     // Create shuffle mask, least significant doubleword of quadword
   2681     unsigned maskHigh = 0x08090a0b;
   2682     unsigned maskLow = 0x0c0d0e0f;
   2683     // Use a shuffle to perform the truncation
   2684     SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2685                                    DAG.getConstant(maskHigh, MVT::i32),
   2686                                    DAG.getConstant(maskLow, MVT::i32),
   2687                                    DAG.getConstant(maskHigh, MVT::i32),
   2688                                    DAG.getConstant(maskLow, MVT::i32));
   2689 
   2690     SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
   2691                                        Op0, Op0, shufMask);
   2692 
   2693     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
   2694   }
   2695 
   2696   return SDValue();             // Leave the truncate unmolested
   2697 }
   2698 
   2699 /*!
   2700  * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
   2701  * algorithm is to duplicate the sign bit using rotmai to generate at
   2702  * least one byte full of sign bits. Then propagate the "sign-byte" into
   2703  * the leftmost words and the i64/i32 into the rightmost words using shufb.
   2704  *
   2705  * @param Op The sext operand
   2706  * @param DAG The current DAG
   2707  * @return The SDValue with the entire instruction sequence
   2708  */
   2709 static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
   2710 {
   2711   DebugLoc dl = Op.getDebugLoc();
   2712 
   2713   // Type to extend to
   2714   MVT OpVT = Op.getValueType().getSimpleVT();
   2715 
   2716   // Type to extend from
   2717   SDValue Op0 = Op.getOperand(0);
   2718   MVT Op0VT = Op0.getValueType().getSimpleVT();
   2719 
   2720   // extend i8 & i16 via i32
   2721   if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
   2722     Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
   2723     Op0VT = MVT::i32;
   2724   }
   2725 
   2726   // The type to extend to needs to be a i128 and
   2727   // the type to extend from needs to be i64 or i32.
   2728   assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
   2729           "LowerSIGN_EXTEND: input and/or output operand have wrong size");
   2730 
   2731   // Create shuffle mask
   2732   unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
   2733   unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
   2734   unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
   2735   SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2736                                  DAG.getConstant(mask1, MVT::i32),
   2737                                  DAG.getConstant(mask1, MVT::i32),
   2738                                  DAG.getConstant(mask2, MVT::i32),
   2739                                  DAG.getConstant(mask3, MVT::i32));
   2740 
   2741   // Word wise arithmetic right shift to generate at least one byte
   2742   // that contains sign bits.
   2743   MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
   2744   SDValue sraVal = DAG.getNode(ISD::SRA,
   2745                  dl,
   2746                  mvt,
   2747                  DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
   2748                  DAG.getConstant(31, MVT::i32));
   2749 
   2750   // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
   2751   SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
   2752                                         dl, Op0VT, Op0,
   2753                                         DAG.getTargetConstant(
   2754                                                   SPU::GPRCRegClass.getID(),
   2755                                                   MVT::i32)), 0);
   2756   // Shuffle bytes - Copy the sign bits into the upper 64 bits
   2757   // and the input value into the lower 64 bits.
   2758   SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
   2759         extended, sraVal, shufMask);
   2760   return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
   2761 }
   2762 
   2763 //! Custom (target-specific) lowering entry point
   2764 /*!
   2765   This is where LLVM's DAG selection process calls to do target-specific
   2766   lowering of nodes.
   2767  */
   2768 SDValue
   2769 SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
   2770 {
   2771   unsigned Opc = (unsigned) Op.getOpcode();
   2772   EVT VT = Op.getValueType();
   2773 
   2774   switch (Opc) {
   2775   default: {
   2776 #ifndef NDEBUG
   2777     errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
   2778     errs() << "Op.getOpcode() = " << Opc << "\n";
   2779     errs() << "*Op.getNode():\n";
   2780     Op.getNode()->dump();
   2781 #endif
   2782     llvm_unreachable(0);
   2783   }
   2784   case ISD::LOAD:
   2785   case ISD::EXTLOAD:
   2786   case ISD::SEXTLOAD:
   2787   case ISD::ZEXTLOAD:
   2788     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
   2789   case ISD::STORE:
   2790     return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
   2791   case ISD::ConstantPool:
   2792     return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
   2793   case ISD::GlobalAddress:
   2794     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
   2795   case ISD::JumpTable:
   2796     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
   2797   case ISD::ConstantFP:
   2798     return LowerConstantFP(Op, DAG);
   2799 
   2800   // i8, i64 math ops:
   2801   case ISD::ADD:
   2802   case ISD::SUB:
   2803   case ISD::ROTR:
   2804   case ISD::ROTL:
   2805   case ISD::SRL:
   2806   case ISD::SHL:
   2807   case ISD::SRA: {
   2808     if (VT == MVT::i8)
   2809       return LowerI8Math(Op, DAG, Opc, *this);
   2810     break;
   2811   }
   2812 
   2813   case ISD::FP_TO_SINT:
   2814   case ISD::FP_TO_UINT:
   2815     return LowerFP_TO_INT(Op, DAG, *this);
   2816 
   2817   case ISD::SINT_TO_FP:
   2818   case ISD::UINT_TO_FP:
   2819     return LowerINT_TO_FP(Op, DAG, *this);
   2820 
   2821   // Vector-related lowering.
   2822   case ISD::BUILD_VECTOR:
   2823     return LowerBUILD_VECTOR(Op, DAG);
   2824   case ISD::SCALAR_TO_VECTOR:
   2825     return LowerSCALAR_TO_VECTOR(Op, DAG);
   2826   case ISD::VECTOR_SHUFFLE:
   2827     return LowerVECTOR_SHUFFLE(Op, DAG);
   2828   case ISD::EXTRACT_VECTOR_ELT:
   2829     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   2830   case ISD::INSERT_VECTOR_ELT:
   2831     return LowerINSERT_VECTOR_ELT(Op, DAG);
   2832 
   2833   // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
   2834   case ISD::AND:
   2835   case ISD::OR:
   2836   case ISD::XOR:
   2837     return LowerByteImmed(Op, DAG);
   2838 
   2839   // Vector and i8 multiply:
   2840   case ISD::MUL:
   2841     if (VT == MVT::i8)
   2842       return LowerI8Math(Op, DAG, Opc, *this);
   2843 
   2844   case ISD::CTPOP:
   2845     return LowerCTPOP(Op, DAG);
   2846 
   2847   case ISD::SELECT_CC:
   2848     return LowerSELECT_CC(Op, DAG, *this);
   2849 
   2850   case ISD::SETCC:
   2851     return LowerSETCC(Op, DAG, *this);
   2852 
   2853   case ISD::TRUNCATE:
   2854     return LowerTRUNCATE(Op, DAG);
   2855 
   2856   case ISD::SIGN_EXTEND:
   2857     return LowerSIGN_EXTEND(Op, DAG);
   2858   }
   2859 
   2860   return SDValue();
   2861 }
   2862 
   2863 void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
   2864                                            SmallVectorImpl<SDValue>&Results,
   2865                                            SelectionDAG &DAG) const
   2866 {
   2867 #if 0
   2868   unsigned Opc = (unsigned) N->getOpcode();
   2869   EVT OpVT = N->getValueType(0);
   2870 
   2871   switch (Opc) {
   2872   default: {
   2873     errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
   2874     errs() << "Op.getOpcode() = " << Opc << "\n";
   2875     errs() << "*Op.getNode():\n";
   2876     N->dump();
   2877     abort();
   2878     /*NOTREACHED*/
   2879   }
   2880   }
   2881 #endif
   2882 
   2883   /* Otherwise, return unchanged */
   2884 }
   2885 
   2886 //===----------------------------------------------------------------------===//
   2887 // Target Optimization Hooks
   2888 //===----------------------------------------------------------------------===//
   2889 
   2890 SDValue
   2891 SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
   2892 {
   2893 #if 0
   2894   TargetMachine &TM = getTargetMachine();
   2895 #endif
   2896   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
   2897   SelectionDAG &DAG = DCI.DAG;
   2898   SDValue Op0 = N->getOperand(0);       // everything has at least one operand
   2899   EVT NodeVT = N->getValueType(0);      // The node's value type
   2900   EVT Op0VT = Op0.getValueType();       // The first operand's result
   2901   SDValue Result;                       // Initially, empty result
   2902   DebugLoc dl = N->getDebugLoc();
   2903 
   2904   switch (N->getOpcode()) {
   2905   default: break;
   2906   case ISD::ADD: {
   2907     SDValue Op1 = N->getOperand(1);
   2908 
   2909     if (Op0.getOpcode() == SPUISD::IndirectAddr
   2910         || Op1.getOpcode() == SPUISD::IndirectAddr) {
   2911       // Normalize the operands to reduce repeated code
   2912       SDValue IndirectArg = Op0, AddArg = Op1;
   2913 
   2914       if (Op1.getOpcode() == SPUISD::IndirectAddr) {
   2915         IndirectArg = Op1;
   2916         AddArg = Op0;
   2917       }
   2918 
   2919       if (isa<ConstantSDNode>(AddArg)) {
   2920         ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
   2921         SDValue IndOp1 = IndirectArg.getOperand(1);
   2922 
   2923         if (CN0->isNullValue()) {
   2924           // (add (SPUindirect <arg>, <arg>), 0) ->
   2925           // (SPUindirect <arg>, <arg>)
   2926 
   2927 #if !defined(NDEBUG)
   2928           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
   2929             errs() << "\n"
   2930                  << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
   2931                  << "With:    (SPUindirect <arg>, <arg>)\n";
   2932           }
   2933 #endif
   2934 
   2935           return IndirectArg;
   2936         } else if (isa<ConstantSDNode>(IndOp1)) {
   2937           // (add (SPUindirect <arg>, <const>), <const>) ->
   2938           // (SPUindirect <arg>, <const + const>)
   2939           ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
   2940           int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
   2941           SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
   2942 
   2943 #if !defined(NDEBUG)
   2944           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
   2945             errs() << "\n"
   2946                  << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
   2947                  << "), " << CN0->getSExtValue() << ")\n"
   2948                  << "With:    (SPUindirect <arg>, "
   2949                  << combinedConst << ")\n";
   2950           }
   2951 #endif
   2952 
   2953           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
   2954                              IndirectArg, combinedValue);
   2955         }
   2956       }
   2957     }
   2958     break;
   2959   }
   2960   case ISD::SIGN_EXTEND:
   2961   case ISD::ZERO_EXTEND:
   2962   case ISD::ANY_EXTEND: {
   2963     if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
   2964       // (any_extend (SPUextract_elt0 <arg>)) ->
   2965       // (SPUextract_elt0 <arg>)
   2966       // Types must match, however...
   2967 #if !defined(NDEBUG)
   2968       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
   2969         errs() << "\nReplace: ";
   2970         N->dump(&DAG);
   2971         errs() << "\nWith:    ";
   2972         Op0.getNode()->dump(&DAG);
   2973         errs() << "\n";
   2974       }
   2975 #endif
   2976 
   2977       return Op0;
   2978     }
   2979     break;
   2980   }
   2981   case SPUISD::IndirectAddr: {
   2982     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
   2983       ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
   2984       if (CN != 0 && CN->isNullValue()) {
   2985         // (SPUindirect (SPUaform <addr>, 0), 0) ->
   2986         // (SPUaform <addr>, 0)
   2987 
   2988         DEBUG(errs() << "Replace: ");
   2989         DEBUG(N->dump(&DAG));
   2990         DEBUG(errs() << "\nWith:    ");
   2991         DEBUG(Op0.getNode()->dump(&DAG));
   2992         DEBUG(errs() << "\n");
   2993 
   2994         return Op0;
   2995       }
   2996     } else if (Op0.getOpcode() == ISD::ADD) {
   2997       SDValue Op1 = N->getOperand(1);
   2998       if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
   2999         // (SPUindirect (add <arg>, <arg>), 0) ->
   3000         // (SPUindirect <arg>, <arg>)
   3001         if (CN1->isNullValue()) {
   3002 
   3003 #if !defined(NDEBUG)
   3004           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
   3005             errs() << "\n"
   3006                  << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
   3007                  << "With:    (SPUindirect <arg>, <arg>)\n";
   3008           }
   3009 #endif
   3010 
   3011           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
   3012                              Op0.getOperand(0), Op0.getOperand(1));
   3013         }
   3014       }
   3015     }
   3016     break;
   3017   }
   3018   case SPUISD::SHL_BITS:
   3019   case SPUISD::SHL_BYTES:
   3020   case SPUISD::ROTBYTES_LEFT: {
   3021     SDValue Op1 = N->getOperand(1);
   3022 
   3023     // Kill degenerate vector shifts:
   3024     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
   3025       if (CN->isNullValue()) {
   3026         Result = Op0;
   3027       }
   3028     }
   3029     break;
   3030   }
   3031   case SPUISD::PREFSLOT2VEC: {
   3032     switch (Op0.getOpcode()) {
   3033     default:
   3034       break;
   3035     case ISD::ANY_EXTEND:
   3036     case ISD::ZERO_EXTEND:
   3037     case ISD::SIGN_EXTEND: {
   3038       // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
   3039       // <arg>
   3040       // but only if the SPUprefslot2vec and <arg> types match.
   3041       SDValue Op00 = Op0.getOperand(0);
   3042       if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
   3043         SDValue Op000 = Op00.getOperand(0);
   3044         if (Op000.getValueType() == NodeVT) {
   3045           Result = Op000;
   3046         }
   3047       }
   3048       break;
   3049     }
   3050     case SPUISD::VEC2PREFSLOT: {
   3051       // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
   3052       // <arg>
   3053       Result = Op0.getOperand(0);
   3054       break;
   3055     }
   3056     }
   3057     break;
   3058   }
   3059   }
   3060 
   3061   // Otherwise, return unchanged.
   3062 #ifndef NDEBUG
   3063   if (Result.getNode()) {
   3064     DEBUG(errs() << "\nReplace.SPU: ");
   3065     DEBUG(N->dump(&DAG));
   3066     DEBUG(errs() << "\nWith:        ");
   3067     DEBUG(Result.getNode()->dump(&DAG));
   3068     DEBUG(errs() << "\n");
   3069   }
   3070 #endif
   3071 
   3072   return Result;
   3073 }
   3074 
   3075 //===----------------------------------------------------------------------===//
   3076 // Inline Assembly Support
   3077 //===----------------------------------------------------------------------===//
   3078 
   3079 /// getConstraintType - Given a constraint letter, return the type of
   3080 /// constraint it is for this target.
   3081 SPUTargetLowering::ConstraintType
   3082 SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
   3083   if (ConstraintLetter.size() == 1) {
   3084     switch (ConstraintLetter[0]) {
   3085     default: break;
   3086     case 'b':
   3087     case 'r':
   3088     case 'f':
   3089     case 'v':
   3090     case 'y':
   3091       return C_RegisterClass;
   3092     }
   3093   }
   3094   return TargetLowering::getConstraintType(ConstraintLetter);
   3095 }
   3096 
   3097 /// Examine constraint type and operand type and determine a weight value.
   3098 /// This object must already have been set up with the operand type
   3099 /// and the current alternative constraint selected.
   3100 TargetLowering::ConstraintWeight
   3101 SPUTargetLowering::getSingleConstraintMatchWeight(
   3102     AsmOperandInfo &info, const char *constraint) const {
   3103   ConstraintWeight weight = CW_Invalid;
   3104   Value *CallOperandVal = info.CallOperandVal;
   3105     // If we don't have a value, we can't do a match,
   3106     // but allow it at the lowest weight.
   3107   if (CallOperandVal == NULL)
   3108     return CW_Default;
   3109   // Look at the constraint type.
   3110   switch (*constraint) {
   3111   default:
   3112     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   3113     break;
   3114     //FIXME: Seems like the supported constraint letters were just copied
   3115     // from PPC, as the following doesn't correspond to the GCC docs.
   3116     // I'm leaving it so until someone adds the corresponding lowering support.
   3117   case 'b':
   3118   case 'r':
   3119   case 'f':
   3120   case 'd':
   3121   case 'v':
   3122   case 'y':
   3123     weight = CW_Register;
   3124     break;
   3125   }
   3126   return weight;
   3127 }
   3128 
   3129 std::pair<unsigned, const TargetRegisterClass*>
   3130 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   3131                                                 EVT VT) const
   3132 {
   3133   if (Constraint.size() == 1) {
   3134     // GCC RS6000 Constraint Letters
   3135     switch (Constraint[0]) {
   3136     case 'b':   // R1-R31
   3137     case 'r':   // R0-R31
   3138       if (VT == MVT::i64)
   3139         return std::make_pair(0U, SPU::R64CRegisterClass);
   3140       return std::make_pair(0U, SPU::R32CRegisterClass);
   3141     case 'f':
   3142       if (VT == MVT::f32)
   3143         return std::make_pair(0U, SPU::R32FPRegisterClass);
   3144       else if (VT == MVT::f64)
   3145         return std::make_pair(0U, SPU::R64FPRegisterClass);
   3146       break;
   3147     case 'v':
   3148       return std::make_pair(0U, SPU::GPRCRegisterClass);
   3149     }
   3150   }
   3151 
   3152   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
   3153 }
   3154 
   3155 //! Compute used/known bits for a SPU operand
   3156 void
   3157 SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   3158                                                   const APInt &Mask,
   3159                                                   APInt &KnownZero,
   3160                                                   APInt &KnownOne,
   3161                                                   const SelectionDAG &DAG,
   3162                                                   unsigned Depth ) const {
   3163 #if 0
   3164   const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
   3165 
   3166   switch (Op.getOpcode()) {
   3167   default:
   3168     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
   3169     break;
   3170   case CALL:
   3171   case SHUFB:
   3172   case SHUFFLE_MASK:
   3173   case CNTB:
   3174   case SPUISD::PREFSLOT2VEC:
   3175   case SPUISD::LDRESULT:
   3176   case SPUISD::VEC2PREFSLOT:
   3177   case SPUISD::SHLQUAD_L_BITS:
   3178   case SPUISD::SHLQUAD_L_BYTES:
   3179   case SPUISD::VEC_ROTL:
   3180   case SPUISD::VEC_ROTR:
   3181   case SPUISD::ROTBYTES_LEFT:
   3182   case SPUISD::SELECT_MASK:
   3183   case SPUISD::SELB:
   3184   }
   3185 #endif
   3186 }
   3187 
   3188 unsigned
   3189 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
   3190                                                    unsigned Depth) const {
   3191   switch (Op.getOpcode()) {
   3192   default:
   3193     return 1;
   3194 
   3195   case ISD::SETCC: {
   3196     EVT VT = Op.getValueType();
   3197 
   3198     if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
   3199       VT = MVT::i32;
   3200     }
   3201     return VT.getSizeInBits();
   3202   }
   3203   }
   3204 }
   3205 
   3206 // LowerAsmOperandForConstraint
   3207 void
   3208 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   3209                                                 std::string &Constraint,
   3210                                                 std::vector<SDValue> &Ops,
   3211                                                 SelectionDAG &DAG) const {
   3212   // Default, for the time being, to the base class handler
   3213   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   3214 }
   3215 
   3216 /// isLegalAddressImmediate - Return true if the integer value can be used
   3217 /// as the offset of the target addressing mode.
   3218 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
   3219                                                 Type *Ty) const {
   3220   // SPU's addresses are 256K:
   3221   return (V > -(1 << 18) && V < (1 << 18) - 1);
   3222 }
   3223 
   3224 bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
   3225   return false;
   3226 }
   3227 
   3228 bool
   3229 SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   3230   // The SPU target isn't yet aware of offsets.
   3231   return false;
   3232 }
   3233 
   3234 // can we compare to Imm without writing it into a register?
   3235 bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   3236   //ceqi, cgti, etc. all take s10 operand
   3237   return isInt<10>(Imm);
   3238 }
   3239 
   3240 bool
   3241 SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
   3242                                          Type * ) const{
   3243 
   3244   // A-form: 18bit absolute address.
   3245   if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
   3246     return true;
   3247 
   3248   // D-form: reg + 14bit offset
   3249   if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
   3250     return true;
   3251 
   3252   // X-form: reg+reg
   3253   if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
   3254     return true;
   3255 
   3256   return false;
   3257 }
   3258