Home | History | Annotate | Download | only in CellSPU
      1 //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
      2 //                     The LLVM Compiler Infrastructure
      3 //
      4 // This file is distributed under the University of Illinois Open Source
      5 // License. See LICENSE.TXT for details.
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the SPUTargetLowering class.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "SPUISelLowering.h"
     14 #include "SPUTargetMachine.h"
     15 #include "SPUFrameLowering.h"
     16 #include "SPUMachineFunction.h"
     17 #include "llvm/Constants.h"
     18 #include "llvm/Function.h"
     19 #include "llvm/Intrinsics.h"
     20 #include "llvm/CallingConv.h"
     21 #include "llvm/Type.h"
     22 #include "llvm/CodeGen/CallingConvLower.h"
     23 #include "llvm/CodeGen/MachineFrameInfo.h"
     24 #include "llvm/CodeGen/MachineFunction.h"
     25 #include "llvm/CodeGen/MachineInstrBuilder.h"
     26 #include "llvm/CodeGen/MachineRegisterInfo.h"
     27 #include "llvm/CodeGen/SelectionDAG.h"
     28 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
     29 #include "llvm/Target/TargetOptions.h"
     30 #include "llvm/ADT/VectorExtras.h"
     31 #include "llvm/Support/Debug.h"
     32 #include "llvm/Support/ErrorHandling.h"
     33 #include "llvm/Support/MathExtras.h"
     34 #include "llvm/Support/raw_ostream.h"
     35 #include <map>
     36 
     37 using namespace llvm;
     38 
     39 // Used in getTargetNodeName() below
     40 namespace {
     41   std::map<unsigned, const char *> node_names;
     42 
     43   // Byte offset of the preferred slot (counted from the MSB)
     44   int prefslotOffset(EVT VT) {
     45     int retval=0;
     46     if (VT==MVT::i1) retval=3;
     47     if (VT==MVT::i8) retval=3;
     48     if (VT==MVT::i16) retval=2;
     49 
     50     return retval;
     51   }
     52 
     53   //! Expand a library call into an actual call DAG node
     54   /*!
     55    \note
     56    This code is taken from SelectionDAGLegalize, since it is not exposed as
     57    part of the LLVM SelectionDAG API.
     58    */
     59 
     60   SDValue
     61   ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
     62                 bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
     63     // The input chain to this libcall is the entry node of the function.
     64     // Legalizing the call will automatically add the previous call to the
     65     // dependence.
     66     SDValue InChain = DAG.getEntryNode();
     67 
     68     TargetLowering::ArgListTy Args;
     69     TargetLowering::ArgListEntry Entry;
     70     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
     71       EVT ArgVT = Op.getOperand(i).getValueType();
     72       Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     73       Entry.Node = Op.getOperand(i);
     74       Entry.Ty = ArgTy;
     75       Entry.isSExt = isSigned;
     76       Entry.isZExt = !isSigned;
     77       Args.push_back(Entry);
     78     }
     79     SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
     80                                            TLI.getPointerTy());
     81 
     82     // Splice the libcall in wherever FindInputOutputChains tells us to.
     83     Type *RetTy =
     84                 Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
     85     std::pair<SDValue, SDValue> CallInfo =
     86             TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
     87                             0, TLI.getLibcallCallingConv(LC), false,
     88                             /*isReturnValueUsed=*/true,
     89                             Callee, Args, DAG, Op.getDebugLoc());
     90 
     91     return CallInfo.first;
     92   }
     93 }
     94 
/// Constructor: registers the SPU's register classes and tells the DAG
/// legalizer, per (operation, value-type) pair, how each operation is
/// handled (Legal / Promote / Expand / Custom).
/// NOTE: for a given (op, VT) pair, a later setOperationAction call
/// silently overrides an earlier one — several pairs below are set twice.
SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  : TargetLowering(TM, new TargetLoweringObjectFileELF()),
    SPUTM(TM) {

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // Set RTLIB libcall names as used by SPU:
  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");

  // Set up the SPU's register classes:
  addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
  addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
  addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
  addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
  addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
  addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
  addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);

  // SPU has no sign or zero extended loads for i1, i8, i16:
  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);

  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);

  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
  setTruncStoreAction(MVT::i128, MVT::i32, Expand);
  setTruncStoreAction(MVT::i128, MVT::i16, Expand);
  setTruncStoreAction(MVT::i128, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SPU constant load actions are custom lowered:
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  // SPU's loads and stores have to be custom lowered:
  // NOTE: the upper bound is exclusive, so MVT::i128 itself is NOT visited
  // by this loop (its load/store actions are left at the default).
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);
    setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, Custom);

    // Expand truncating stores from VT to every narrower integer type.
    // (For VT == i8 the loop body never executes.)
    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // NOTE: exclusive upper bound again — only f32 is visited here; f64
  // keeps its default load/store actions (its f64->f32 trunc-store was
  // already expanded above).
  for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);

    // For VT == f32 this inner loop body never executes.
    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // Expand the jumptable branches
  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);

  // Custom lower SELECT_CC for most cases, but expand by default
  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);

  // SPU has no intrinsics for these particular operations:
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);

  // SPU has no division/remainder instructions
  setOperationAction(ISD::SREM,    MVT::i8,   Expand);
  setOperationAction(ISD::UREM,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::SREM,    MVT::i16,  Expand);
  setOperationAction(ISD::UREM,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::SREM,    MVT::i32,  Expand);
  setOperationAction(ISD::UREM,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::SREM,    MVT::i64,  Expand);
  setOperationAction(ISD::UREM,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::SREM,    MVT::i128, Expand);
  setOperationAction(ISD::UREM,    MVT::i128, Expand);
  setOperationAction(ISD::SDIV,    MVT::i128, Expand);
  setOperationAction(ISD::UDIV,    MVT::i128, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i128, Expand);

  // We don't support sin/cos/sqrt/fmod
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);

  // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
  // for f32!)
  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

  // SPU can do rotate right and left, so legalize it... but customize for i8
  // because instructions don't exist.

  // FIXME: Change from "expand" to appropriate type once ROTR is supported in
  //        .td files.
  setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);

  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
  setOperationAction(ISD::ROTL, MVT::i8,     Custom);

  // SPU has no native version of shift left/right for i8
  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRA,  MVT::i8,     Custom);

  // Make these operations legal and handle them during instruction selection:
  setOperationAction(ISD::SHL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRA,  MVT::i64,    Legal);

  // Custom lower i8, i32 and i64 multiplications
  // NOTE(review): the comment above says "custom", but only i8 is Custom;
  // i32/i64 are Legal (handled during isel) — comment looks stale.
  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
  setOperationAction(ISD::MUL,  MVT::i32,    Legal);
  setOperationAction(ISD::MUL,  MVT::i64,    Legal);

  // Expand double-width multiplication
  // FIXME: It would probably be reasonable to support some of these operations
  setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
  setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::MULHU,     MVT::i16, Expand);
  setOperationAction(ISD::MULHS,     MVT::i16, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::MULHU,     MVT::i64, Expand);
  setOperationAction(ISD::MULHS,     MVT::i64, Expand);

  // Need to custom handle (some) common i8, i64 math ops
  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
  setOperationAction(ISD::ADD,  MVT::i64,    Legal);
  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
  setOperationAction(ISD::SUB,  MVT::i64,    Legal);

  // SPU does not have BSWAP. It does have i32 support CTLZ.
  // CTPOP has to be custom lowered.
  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);

  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i128,  Expand);

  setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
  setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i128,  Expand);

  setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
  setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
  setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTLZ , MVT::i128,  Expand);

  // SPU has a version of select that implements (a&~c)|(b&c), just like
  // select ought to work:
  setOperationAction(ISD::SELECT, MVT::i8,   Legal);
  setOperationAction(ISD::SELECT, MVT::i16,  Legal);
  setOperationAction(ISD::SELECT, MVT::i32,  Legal);
  setOperationAction(ISD::SELECT, MVT::i64,  Legal);

  setOperationAction(ISD::SETCC, MVT::i8,    Legal);
  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
  setOperationAction(ISD::SETCC, MVT::i64,   Legal);
  setOperationAction(ISD::SETCC, MVT::f64,   Custom);

  // Custom lower i128 -> i64 truncates
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

  // Custom lower i32/i64 -> i128 sign extend
  setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);

  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
  // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
  // to expand to a libcall, hence the custom lowering:
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  // NOTE: the i64/i32 FP<->INT actions set here are overridden again
  // further down in this constructor; the later calls win.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);

  // FDIV on SPU requires custom lowering
  setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall

  // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  setOperationAction(ISD::BITCAST, MVT::i32, Legal);
  setOperationAction(ISD::BITCAST, MVT::f32, Legal);
  setOperationAction(ISD::BITCAST, MVT::i64, Legal);
  setOperationAction(ISD::BITCAST, MVT::f64, Legal);

  // We cannot sextinreg(i1).  Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  // NOTE: exclusive upper bound — MVT::f128 itself is not included.
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::GlobalAddress,  VT, Custom);
    setOperationAction(ISD::ConstantPool,   VT, Custom);
    setOperationAction(ISD::JumpTable,      VT, Custom);
  }

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);

  // Use the default implementation.
  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);

  // Cell SPU has instructions for converting between i64 and fp.
  // NOTE: these override the earlier Expand/Custom settings for the same
  // (op, i64) pairs above — Custom is the effective action.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
  // NOTE: this overrides the earlier Custom setting for (FP_TO_UINT, i32).
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);

  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // First set operation action for all vector types to expand. Then we
  // will selectively turn on ones that can be effectively codegen'd.
  addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);

  for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)i;

    // Set operation actions to legal types only.
    if (!isTypeLegal(VT)) continue;

    // add/sub are legal for all supported vector VT's.
    setOperationAction(ISD::ADD,     VT, Legal);
    setOperationAction(ISD::SUB,     VT, Legal);
    // mul has to be custom lowered.
    // NOTE(review): comment above says "custom" but the action set is
    // Legal — the comment appears stale.
    setOperationAction(ISD::MUL,     VT, Legal);

    setOperationAction(ISD::AND,     VT, Legal);
    setOperationAction(ISD::OR,      VT, Legal);
    setOperationAction(ISD::XOR,     VT, Legal);
    setOperationAction(ISD::LOAD,    VT, Custom);
    setOperationAction(ISD::SELECT,  VT, Legal);
    setOperationAction(ISD::STORE,   VT, Custom);

    // These operations need to be expanded:
    setOperationAction(ISD::SDIV,    VT, Expand);
    setOperationAction(ISD::SREM,    VT, Expand);
    setOperationAction(ISD::UDIV,    VT, Expand);
    setOperationAction(ISD::UREM,    VT, Expand);

    // Expand all trunc stores
    for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
      MVT::SimpleValueType TargetVT = (MVT::SimpleValueType)j;
    setTruncStoreAction(VT, TargetVT, Expand);
    }

    // Custom lower build_vector, constant pool spills, insert and
    // extract vector elements:
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }

  setOperationAction(ISD::SHL, MVT::v2i64, Expand);

  // These override the per-VT Legal settings from the loop above.
  setOperationAction(ISD::AND, MVT::v16i8, Custom);
  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);

  setOperationAction(ISD::FDIV, MVT::v4f32, Legal);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct?

  setStackPointerRegisterToSaveRestore(SPU::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  // Functions are aligned to 2^3 = 8 bytes.
  setMinFunctionAlignment(3);

  computeRegisterProperties();

  // Set pre-RA register scheduler default to BURR, which produces slightly
  // better code than the default (could also be TDRR, but TargetLowering.h
  // needs a mod to support that model):
  setSchedulingPreference(Sched::RegPressure);
}
    473 
    474 const char *
    475 SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
    476 {
    477   if (node_names.empty()) {
    478     node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
    479     node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
    480     node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
    481     node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
    482     node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
    483     node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
    484     node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
    485     node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
    486     node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
    487     node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
    488     node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
    489     node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
    490     node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
    491     node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
    492     node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
    493     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
    494     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
    495     node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
    496     node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
    497             "SPUISD::ROTBYTES_LEFT_BITS";
    498     node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
    499     node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
    500     node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
    501     node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
    502     node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
    503   }
    504 
    505   std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
    506 
    507   return ((i != node_names.end()) ? i->second : 0);
    508 }
    509 
    510 //===----------------------------------------------------------------------===//
    511 // Return the Cell SPU's SETCC result type
    512 //===----------------------------------------------------------------------===//
    513 
    514 EVT SPUTargetLowering::getSetCCResultType(EVT VT) const {
    515   // i8, i16 and i32 are valid SETCC result types
    516   MVT::SimpleValueType retval;
    517 
    518   switch(VT.getSimpleVT().SimpleTy){
    519     case MVT::i1:
    520     case MVT::i8:
    521       retval = MVT::i8; break;
    522     case MVT::i16:
    523       retval = MVT::i16; break;
    524     case MVT::i32:
    525     default:
    526       retval = MVT::i32;
    527   }
    528   return retval;
    529 }
    530 
    531 //===----------------------------------------------------------------------===//
    532 // Calling convention code:
    533 //===----------------------------------------------------------------------===//
    534 
    535 #include "SPUGenCallingConv.inc"
    536 
    537 //===----------------------------------------------------------------------===//
    538 //  LowerOperation implementation
    539 //===----------------------------------------------------------------------===//
    540 
    541 /// Custom lower loads for CellSPU
    542 /*!
    543  All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
    544  within a 16-byte block, we have to rotate to extract the requested element.
    545 
    546  For extending loads, we also want to ensure that the following sequence is
    547  emitted, e.g. for MVT::f32 extending load to MVT::f64:
    548 
    549 \verbatim
    550 %1  v16i8,ch = load
    551 %2  v16i8,ch = rotate %1
    552 %3  v4f8, ch = bitconvert %2
    553 %4  f32      = vec2perfslot %3
    554 %5  f64      = fp_extend %4
    555 \endverbatim
    556 */
    557 static SDValue
    558 LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    559   LoadSDNode *LN = cast<LoadSDNode>(Op);
    560   SDValue the_chain = LN->getChain();
    561   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    562   EVT InVT = LN->getMemoryVT();
    563   EVT OutVT = Op.getValueType();
    564   ISD::LoadExtType ExtType = LN->getExtensionType();
    565   unsigned alignment = LN->getAlignment();
    566   int pso = prefslotOffset(InVT);
    567   DebugLoc dl = Op.getDebugLoc();
    568   EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
    569                                                   (128 / InVT.getSizeInBits()));
    570 
    571   // two sanity checks
    572   assert( LN->getAddressingMode() == ISD::UNINDEXED
    573           && "we should get only UNINDEXED adresses");
    574   // clean aligned loads can be selected as-is
    575   if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
    576     return SDValue();
    577 
    578   // Get pointerinfos to the memory chunk(s) that contain the data to load
    579   uint64_t mpi_offset = LN->getPointerInfo().Offset;
    580   mpi_offset -= mpi_offset%16;
    581   MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
    582   MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
    583 
    584   SDValue result;
    585   SDValue basePtr = LN->getBasePtr();
    586   SDValue rotate;
    587 
    588   if ((alignment%16) == 0) {
    589     ConstantSDNode *CN;
    590 
    591     // Special cases for a known aligned load to simplify the base pointer
    592     // and the rotation amount:
    593     if (basePtr.getOpcode() == ISD::ADD
    594         && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
    595       // Known offset into basePtr
    596       int64_t offset = CN->getSExtValue();
    597       int64_t rotamt = int64_t((offset & 0xf) - pso);
    598 
    599       if (rotamt < 0)
    600         rotamt += 16;
    601 
    602       rotate = DAG.getConstant(rotamt, MVT::i16);
    603 
    604       // Simplify the base pointer for this case:
    605       basePtr = basePtr.getOperand(0);
    606       if ((offset & ~0xf) > 0) {
    607         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    608                               basePtr,
    609                               DAG.getConstant((offset & ~0xf), PtrVT));
    610       }
    611     } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
    612                || (basePtr.getOpcode() == SPUISD::IndirectAddr
    613                    && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
    614                    && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
    615       // Plain aligned a-form address: rotate into preferred slot
    616       // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
    617       int64_t rotamt = -pso;
    618       if (rotamt < 0)
    619         rotamt += 16;
    620       rotate = DAG.getConstant(rotamt, MVT::i16);
    621     } else {
    622       // Offset the rotate amount by the basePtr and the preferred slot
    623       // byte offset
    624       int64_t rotamt = -pso;
    625       if (rotamt < 0)
    626         rotamt += 16;
    627       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
    628                            basePtr,
    629                            DAG.getConstant(rotamt, PtrVT));
    630     }
    631   } else {
    632     // Unaligned load: must be more pessimistic about addressing modes:
    633     if (basePtr.getOpcode() == ISD::ADD) {
    634       MachineFunction &MF = DAG.getMachineFunction();
    635       MachineRegisterInfo &RegInfo = MF.getRegInfo();
    636       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    637       SDValue Flag;
    638 
    639       SDValue Op0 = basePtr.getOperand(0);
    640       SDValue Op1 = basePtr.getOperand(1);
    641 
    642       if (isa<ConstantSDNode>(Op1)) {
    643         // Convert the (add <ptr>, <const>) to an indirect address contained
    644         // in a register. Note that this is done because we need to avoid
    645         // creating a 0(reg) d-form address due to the SPU's block loads.
    646         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    647         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
    648         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
    649       } else {
    650         // Convert the (add <arg1>, <arg2>) to an indirect address, which
    651         // will likely be lowered as a reg(reg) x-form address.
    652         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
    653       }
    654     } else {
    655       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
    656                             basePtr,
    657                             DAG.getConstant(0, PtrVT));
    658    }
    659 
    660     // Offset the rotate amount by the basePtr and the preferred slot
    661     // byte offset
    662     rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
    663                          basePtr,
    664                          DAG.getConstant(-pso, PtrVT));
    665   }
    666 
    667   // Do the load as a i128 to allow possible shifting
    668   SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
    669                        lowMemPtr,
    670                        LN->isVolatile(), LN->isNonTemporal(), 16);
    671 
    672   // When the size is not greater than alignment we get all data with just
    673   // one load
    674   if (alignment >= InVT.getSizeInBits()/8) {
    675     // Update the chain
    676     the_chain = low.getValue(1);
    677 
    678     // Rotate into the preferred slot:
    679     result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
    680                          low.getValue(0), rotate);
    681 
    682     // Convert the loaded v16i8 vector to the appropriate vector type
    683     // specified by the operand:
    684     EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
    685                                  InVT, (128 / InVT.getSizeInBits()));
    686     result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
    687                          DAG.getNode(ISD::BITCAST, dl, vecVT, result));
    688   }
    689   // When alignment is less than the size, we might need (known only at
    690   // run-time) two loads
    691   // TODO: if the memory address is composed only from constants, we have
  // extra knowledge, and might avoid the second load
    693   else {
    694     // storage position offset from lower 16 byte aligned memory chunk
    695     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
    696                                   basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
    697     // get a registerfull of ones. (this implementation is a workaround: LLVM
    698     // cannot handle 128 bit signed int constants)
    699     SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
    700     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
    701 
    702     SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
    703                                DAG.getNode(ISD::ADD, dl, PtrVT,
    704                                            basePtr,
    705                                            DAG.getConstant(16, PtrVT)),
    706                                highMemPtr,
    707                                LN->isVolatile(), LN->isNonTemporal(), 16);
    708 
    709     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
    710                                                               high.getValue(1));
    711 
    // Shift the (possible) high part right to compensate the misalignment.
    713     // if there is no highpart (i.e. value is i64 and offset is 4), this
    714     // will zero out the high value.
    715     high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
    716                                      DAG.getNode(ISD::SUB, dl, MVT::i32,
    717                                                  DAG.getConstant( 16, MVT::i32),
    718                                                  offset
    719                                                 ));
    720 
    721     // Shift the low similarly
    722     // TODO: add SPUISD::SHL_BYTES
    723     low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
    724 
    725     // Merge the two parts
    726     result = DAG.getNode(ISD::BITCAST, dl, vecVT,
    727                           DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
    728 
    729     if (!InVT.isVector()) {
    730       result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
    731      }
    732 
    733   }
    734     // Handle extending loads by extending the scalar result:
    735     if (ExtType == ISD::SEXTLOAD) {
    736       result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
    737     } else if (ExtType == ISD::ZEXTLOAD) {
    738       result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
    739     } else if (ExtType == ISD::EXTLOAD) {
    740       unsigned NewOpc = ISD::ANY_EXTEND;
    741 
    742       if (OutVT.isFloatingPoint())
    743         NewOpc = ISD::FP_EXTEND;
    744 
    745       result = DAG.getNode(NewOpc, dl, OutVT, result);
    746     }
    747 
    748     SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
    749     SDValue retops[2] = {
    750       result,
    751       the_chain
    752     };
    753 
    754     result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
    755                          retops, sizeof(retops) / sizeof(retops[0]));
    756     return result;
    757 }
    758 
/// Custom lower stores for CellSPU
/*!
 All CellSPU stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to generate a shuffle to insert the
 requested element into its place, then store the resulting block.
 Stores whose size is smaller than their alignment are done with a single
 read-modify-write of the containing quadword; stores that may straddle a
 16-byte boundary load, mask and rewrite both adjacent quadwords.
 */
static SDValue
LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  StoreSDNode *SN = cast<StoreSDNode>(Op);
  SDValue Value = SN->getValue();
  EVT VT = Value.getValueType();
  // For truncating stores the in-memory type (StVT) differs from the
  // in-register value type (VT).
  EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  unsigned alignment = SN->getAlignment();
  SDValue result;
  // Vector type that fills a whole 128-bit register with elements of the
  // stored type (or StVT itself if it is already a vector).
  EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
                                                 (128 / StVT.getSizeInBits()));
  // Get pointerinfos to the 16-byte memory chunk(s) that contain the data:
  // 'low' is the chunk holding the start of the store, 'high' is the one
  // immediately after it (only touched by a straddling store).
  uint64_t mpi_offset = SN->getPointerInfo().Offset;
  mpi_offset -= mpi_offset%16;  // round down to the start of the 16-byte chunk
  MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
  MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);


  // two sanity checks
  assert( SN->getAddressingMode() == ISD::UNINDEXED
          && "we should get only UNINDEXED adresses");
  // Clean, 16-byte-aligned full-quadword stores can be selected as-is; no
  // custom lowering is needed.
  if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
    return SDValue();

  // NOTE(review): alignLoadVec is declared but never used in this function.
  SDValue alignLoadVec;
  SDValue basePtr = SN->getBasePtr();
  SDValue the_chain = SN->getChain();
  // Byte offset of the insertion point, used to build the SHUFFLE_MASK below.
  SDValue insertEltOffs;

  if ((alignment%16) == 0) {
    ConstantSDNode *CN;
    // Special cases for a known aligned load to simplify the base pointer
    // and insertion byte:
    if (basePtr.getOpcode() == ISD::ADD
        && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
      // Known offset into basePtr
      int64_t offset = CN->getSExtValue();

      // Simplify the base pointer for this case:
      basePtr = basePtr.getOperand(0);
      // Insertion byte is the offset within the 16-byte chunk.
      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant((offset & 0xf), PtrVT));

      // Fold the 16-byte-aligned part of the offset back into the base.
      if ((offset & ~0xf) > 0) {
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                              basePtr,
                              DAG.getConstant((offset & ~0xf), PtrVT));
      }
    } else {
      // Otherwise, assume it's at byte 0 of basePtr
      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant(0, PtrVT));
      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant(0, PtrVT));
    }
  } else {
    // Unaligned store: must be more pessimistic about addressing modes:
    if (basePtr.getOpcode() == ISD::ADD) {
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &RegInfo = MF.getRegInfo();
      unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
      SDValue Flag;

      SDValue Op0 = basePtr.getOperand(0);
      SDValue Op1 = basePtr.getOperand(1);

      if (isa<ConstantSDNode>(Op1)) {
        // Convert the (add <ptr>, <const>) to an indirect address contained
        // in a register. Note that this is done because we need to avoid
        // creating a 0(reg) d-form address due to the SPU's block loads.
        // Pinning the address in a virtual register forces an x-form/reg
        // addressing mode at selection time.
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
        the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
        basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
      } else {
        // Convert the (add <arg1>, <arg2>) to an indirect address, which
        // will likely be lowered as a reg(reg) x-form address.
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
      }
    } else {
      // Wrap a bare pointer as a zero-offset indirect address.
      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                            basePtr,
                            DAG.getConstant(0, PtrVT));
    }

    // Insertion point is solely determined by basePtr's contents
    insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
                                basePtr,
                                DAG.getConstant(0, PtrVT));
  }

  // Load the lower part of the memory to which to store.
  SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
                          lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);

  // If we don't need to store over the 16-byte boundary, one store suffices:
  // shuffle the value into the loaded quadword and store it back.
  if (alignment >= StVT.getSizeInBits()/8) {
    // Update the chain
    the_chain = low.getValue(1);

    LoadSDNode *LN = cast<LoadSDNode>(low);
    SDValue theValue = SN->getValue();

    if (StVT != VT
        && (theValue.getOpcode() == ISD::AssertZext
            || theValue.getOpcode() == ISD::AssertSext)) {
      // Drill down and get the value for zero- and sign-extended
      // quantities
      theValue = theValue.getOperand(0);
    }

    // If the base pointer is already a D-form address, then just create
    // a new D-form address with a slot offset and the original base pointer.
    // Otherwise generate a D-form address with the slot offset relative
    // to the stack pointer, which is always aligned.
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "CellSPU LowerSTORE: basePtr = ";
        basePtr.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

    // Build the shuffle mask that routes the new element into its slot and
    // keeps the rest of the loaded quadword intact.
    SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
                                      insertEltOffs);
    SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
                                      theValue);

    result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
                         vectorizeOp, low,
                         DAG.getNode(ISD::BITCAST, dl,
                                     MVT::v4i32, insertEltOp));

    result = DAG.getStore(the_chain, dl, result, basePtr,
                          lowMemPtr,
                          LN->isVolatile(), LN->isNonTemporal(),
                          16);

  }
  // Do the store when it might cross the 16-byte memory access boundary:
  // read-modify-write both adjacent quadwords.
  else {
    // TODO issue a warning if SN->isVolatile()== true? This is likely not
    // what the user wanted.

    // Address offset from the nearest lower 16-byte aligned address.
    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
                                    SN->getBasePtr(),
                                    DAG.getConstant(0xf, MVT::i32));
    // 16 - offset
    SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                           DAG.getConstant( 16, MVT::i32),
                                           offset);
    // 16 - sizeof(Value)
    SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                     DAG.getConstant( 16, MVT::i32),
                                     DAG.getConstant( VT.getSizeInBits()/8,
                                                      MVT::i32));
    // Get a register full of ones (used below to invert the byte masks).
    SDValue ones = DAG.getConstant(-1, MVT::v4i32);
    ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);

    // Create the 128 bit masks that have ones where the data to store is
    // located.
    SDValue lowmask, himask;
    // If the value to store doesn't fill an entire 128 bits, zero
    // out the last bits of the mask so that only the value we want to store
    // is masked.
    // This happens e.g. in the case of store i32, align 2.
    if (!VT.isVector()){
      Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
      // Shift right then left by 'surplus' bytes to clear the tail of the
      // all-ones mask, leaving ones only over the stored bytes.
      lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
      lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
                                                               surplus);
      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
      Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);

    }
    else {
      lowmask = ones;
      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
    }
    // This will be all-zero if no data goes to the high quadword.
    himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
                                                            offset_compl);
    lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
                                                             offset);

    // Load in the old data and zero out the parts that will be overwritten with
    // the new data to store.
    SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
                               DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                           DAG.getConstant( 16, PtrVT)),
                               highMemPtr,
                               SN->isVolatile(), SN->isNonTemporal(), 16);
    // Join the two load chains so both loads complete before the stores.
    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
                                                              hi.getValue(1));

    // Clear the bytes that will be overwritten (mask with the complement).
    low = DAG.getNode(ISD::AND, dl, MVT::i128,
                        DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
    hi = DAG.getNode(ISD::AND, dl, MVT::i128,
                        DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));

    // Shift the Value to store into place. rlow contains the parts that go to
    // the lower memory chunk, rhi has the parts that go to the upper one.
    SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
    rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
    SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
                                                            offset_compl);

    // Merge the old data and the new data and store the results
    // Need to convert vectors here to integer as 'OR'ing floats assert
    rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
    rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
                         DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
                         DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));

    low = DAG.getStore(the_chain, dl, rlow, basePtr,
                          lowMemPtr,
                          SN->isVolatile(), SN->isNonTemporal(), 16);
    hi  = DAG.getStore(the_chain, dl, rhi,
                            DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                        DAG.getConstant( 16, PtrVT)),
                            highMemPtr,
                            SN->isVolatile(), SN->isNonTemporal(), 16);
    // The lowered store's chain is the join of the two store chains.
    result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
                                                           hi.getValue(0));
  }

  return result;
}
   1003 
   1004 //! Generate the address of a constant pool entry.
   1005 static SDValue
   1006 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1007   EVT PtrVT = Op.getValueType();
   1008   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   1009   const Constant *C = CP->getConstVal();
   1010   SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
   1011   SDValue Zero = DAG.getConstant(0, PtrVT);
   1012   const TargetMachine &TM = DAG.getTarget();
   1013   // FIXME there is no actual debug info here
   1014   DebugLoc dl = Op.getDebugLoc();
   1015 
   1016   if (TM.getRelocationModel() == Reloc::Static) {
   1017     if (!ST->usingLargeMem()) {
   1018       // Just return the SDValue with the constant pool address in it.
   1019       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
   1020     } else {
   1021       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
   1022       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
   1023       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1024     }
   1025   }
   1026 
   1027   llvm_unreachable("LowerConstantPool: Relocation model other than static"
   1028                    " not supported.");
   1029   return SDValue();
   1030 }
   1031 
   1032 //! Alternate entry point for generating the address of a constant pool entry
   1033 SDValue
   1034 SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
   1035   return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
   1036 }
   1037 
   1038 static SDValue
   1039 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1040   EVT PtrVT = Op.getValueType();
   1041   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   1042   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
   1043   SDValue Zero = DAG.getConstant(0, PtrVT);
   1044   const TargetMachine &TM = DAG.getTarget();
   1045   // FIXME there is no actual debug info here
   1046   DebugLoc dl = Op.getDebugLoc();
   1047 
   1048   if (TM.getRelocationModel() == Reloc::Static) {
   1049     if (!ST->usingLargeMem()) {
   1050       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
   1051     } else {
   1052       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
   1053       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
   1054       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1055     }
   1056   }
   1057 
   1058   llvm_unreachable("LowerJumpTable: Relocation model other than static"
   1059                    " not supported.");
   1060   return SDValue();
   1061 }
   1062 
   1063 static SDValue
   1064 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   1065   EVT PtrVT = Op.getValueType();
   1066   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
   1067   const GlobalValue *GV = GSDN->getGlobal();
   1068   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
   1069                                           PtrVT, GSDN->getOffset());
   1070   const TargetMachine &TM = DAG.getTarget();
   1071   SDValue Zero = DAG.getConstant(0, PtrVT);
   1072   // FIXME there is no actual debug info here
   1073   DebugLoc dl = Op.getDebugLoc();
   1074 
   1075   if (TM.getRelocationModel() == Reloc::Static) {
   1076     if (!ST->usingLargeMem()) {
   1077       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
   1078     } else {
   1079       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
   1080       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
   1081       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
   1082     }
   1083   } else {
   1084     report_fatal_error("LowerGlobalAddress: Relocation model other than static"
   1085                       "not supported.");
   1086     /*NOTREACHED*/
   1087   }
   1088 
   1089   return SDValue();
   1090 }
   1091 
   1092 //! Custom lower double precision floating point constants
   1093 static SDValue
   1094 LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
   1095   EVT VT = Op.getValueType();
   1096   // FIXME there is no actual debug info here
   1097   DebugLoc dl = Op.getDebugLoc();
   1098 
   1099   if (VT == MVT::f64) {
   1100     ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
   1101 
   1102     assert((FP != 0) &&
   1103            "LowerConstantFP: Node is not ConstantFPSDNode");
   1104 
   1105     uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
   1106     SDValue T = DAG.getConstant(dbits, MVT::i64);
   1107     SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
   1108     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   1109                        DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
   1110   }
   1111 
   1112   return SDValue();
   1113 }
   1114 
   1115 SDValue
   1116 SPUTargetLowering::LowerFormalArguments(SDValue Chain,
   1117                                         CallingConv::ID CallConv, bool isVarArg,
   1118                                         const SmallVectorImpl<ISD::InputArg>
   1119                                           &Ins,
   1120                                         DebugLoc dl, SelectionDAG &DAG,
   1121                                         SmallVectorImpl<SDValue> &InVals)
   1122                                           const {
   1123 
   1124   MachineFunction &MF = DAG.getMachineFunction();
   1125   MachineFrameInfo *MFI = MF.getFrameInfo();
   1126   MachineRegisterInfo &RegInfo = MF.getRegInfo();
   1127   SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
   1128 
   1129   unsigned ArgOffset = SPUFrameLowering::minStackSize();
   1130   unsigned ArgRegIdx = 0;
   1131   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
   1132 
   1133   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   1134 
   1135   SmallVector<CCValAssign, 16> ArgLocs;
   1136   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1137 		 getTargetMachine(), ArgLocs, *DAG.getContext());
   1138   // FIXME: allow for other calling conventions
   1139   CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
   1140 
   1141   // Add DAG nodes to load the arguments or copy them out of registers.
   1142   for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
   1143     EVT ObjectVT = Ins[ArgNo].VT;
   1144     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
   1145     SDValue ArgVal;
   1146     CCValAssign &VA = ArgLocs[ArgNo];
   1147 
   1148     if (VA.isRegLoc()) {
   1149       const TargetRegisterClass *ArgRegClass;
   1150 
   1151       switch (ObjectVT.getSimpleVT().SimpleTy) {
   1152       default:
   1153         report_fatal_error("LowerFormalArguments Unhandled argument type: " +
   1154                            Twine(ObjectVT.getEVTString()));
   1155       case MVT::i8:
   1156         ArgRegClass = &SPU::R8CRegClass;
   1157         break;
   1158       case MVT::i16:
   1159         ArgRegClass = &SPU::R16CRegClass;
   1160         break;
   1161       case MVT::i32:
   1162         ArgRegClass = &SPU::R32CRegClass;
   1163         break;
   1164       case MVT::i64:
   1165         ArgRegClass = &SPU::R64CRegClass;
   1166         break;
   1167       case MVT::i128:
   1168         ArgRegClass = &SPU::GPRCRegClass;
   1169         break;
   1170       case MVT::f32:
   1171         ArgRegClass = &SPU::R32FPRegClass;
   1172         break;
   1173       case MVT::f64:
   1174         ArgRegClass = &SPU::R64FPRegClass;
   1175         break;
   1176       case MVT::v2f64:
   1177       case MVT::v4f32:
   1178       case MVT::v2i64:
   1179       case MVT::v4i32:
   1180       case MVT::v8i16:
   1181       case MVT::v16i8:
   1182         ArgRegClass = &SPU::VECREGRegClass;
   1183         break;
   1184       }
   1185 
   1186       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
   1187       RegInfo.addLiveIn(VA.getLocReg(), VReg);
   1188       ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
   1189       ++ArgRegIdx;
   1190     } else {
   1191       // We need to load the argument to a virtual register if we determined
   1192       // above that we ran out of physical registers of the appropriate type
   1193       // or we're forced to do vararg
   1194       int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
   1195       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
   1196       ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
   1197                            false, false, 0);
   1198       ArgOffset += StackSlotSize;
   1199     }
   1200 
   1201     InVals.push_back(ArgVal);
   1202     // Update the chain
   1203     Chain = ArgVal.getOperand(0);
   1204   }
   1205 
   1206   // vararg handling:
   1207   if (isVarArg) {
   1208     // FIXME: we should be able to query the argument registers from
   1209     //        tablegen generated code.
   1210     static const unsigned ArgRegs[] = {
   1211       SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
   1212       SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
   1213       SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
   1214       SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
   1215       SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
   1216       SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
   1217       SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
   1218       SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
   1219       SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
   1220       SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
   1221       SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
   1222     };
   1223     // size of ArgRegs array
   1224     unsigned NumArgRegs = 77;
   1225 
   1226     // We will spill (79-3)+1 registers to the stack
   1227     SmallVector<SDValue, 79-3+1> MemOps;
   1228 
   1229     // Create the frame slot
   1230     for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
   1231       FuncInfo->setVarArgsFrameIndex(
   1232         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
   1233       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   1234       unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
   1235       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
   1236       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
   1237                                    false, false, 0);
   1238       Chain = Store.getOperand(0);
   1239       MemOps.push_back(Store);
   1240 
   1241       // Increment address by stack slot size for the next stored argument
   1242       ArgOffset += StackSlotSize;
   1243     }
   1244     if (!MemOps.empty())
   1245       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   1246                           &MemOps[0], MemOps.size());
   1247   }
   1248 
   1249   return Chain;
   1250 }
   1251 
   1252 /// isLSAAddress - Return the immediate to use if the specified
   1253 /// value is representable as a LSA address.
   1254 static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
   1255   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
   1256   if (!C) return 0;
   1257 
   1258   int Addr = C->getZExtValue();
   1259   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
   1260       (Addr << 14 >> 14) != Addr)
   1261     return 0;  // Top 14 bits have to be sext of immediate.
   1262 
   1263   return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
   1264 }
   1265 
/// LowerCall - Lower an outgoing call for CellSPU.  Arguments are assigned
/// by the CCC_SPU calling convention; arguments beyond the available
/// registers are stored to stack slots just below the linkage area.  Returns
/// the updated chain and fills InVals with values copied out of the
/// return-value registers.  Tail calls are not supported and isTailCall is
/// forced to false.
SDValue
SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<SDValue> &OutVals,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const {
  // CellSPU target does not yet support tail call optimization.
  isTailCall = false;

  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  unsigned NumOps     = Outs.size();
  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();

  // Run the calling convention to decide where each outgoing argument goes.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
		 getTargetMachine(), ArgLocs, *DAG.getContext());
  // FIXME: allow for other calling conventions
  CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);

  // Arguments with index < NumArgRegs were assigned registers by CCC_SPU;
  // the remainder spill to the stack (see the test inside the loop below).
  const unsigned NumArgRegs = ArgLocs.size();


  // Handy pointer type
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory.
  unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
  unsigned ArgRegIdx = 0;

  // Keep track of registers passing arguments
  std::vector<std::pair<unsigned, SDValue> > RegsToPass;
  // And the arguments passed on the stack
  SmallVector<SDValue, 8> MemOpChains;

  for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
    SDValue Arg = OutVals[ArgRegIdx];
    CCValAssign &VA = ArgLocs[ArgRegIdx];

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i8:
    case MVT::i16:
    case MVT::i32:
    case MVT::i64:
    case MVT::i128:
    case MVT::f32:
    case MVT::f64:
    case MVT::v2i64:
    case MVT::v2f64:
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      if (ArgRegIdx != NumArgRegs) {
        // This argument was assigned a register; pass it in that register.
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      } else {
        // Out of argument registers: store to the next stack slot.
        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
                                           MachinePointerInfo(),
                                           false, false, 0));
        ArgOffset += StackSlotSize;
      }
      break;
    }
  }

  // Accumulate how many bytes are to be pushed on the stack, including the
  // linkage area, and parameter passing area.  According to the SPU ABI,
  // we minimally need space for [LR] and [SP].
  unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();

  // Insert a call sequence start
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
                                                            true));

  if (!MemOpChains.empty()) {
    // Adjust the stack pointer for the stack arguments.
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  SmallVector<SDValue, 8> Ops;
  unsigned CallOpc = SPUISD::CALL;

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);

    if (!ST->usingLargeMem()) {
      // Turn calls to targets that are defined (i.e., have bodies) into BRSL
      // style calls, otherwise, external symbols are BRASL calls. This assumes
      // that declared/defined symbols are in the same compilation unit and can
      // be reached through PC-relative jumps.
      //
      // NOTE:
      // This may be an unsafe assumption for JIT and really large compilation
      // units.
      if (GV->isDeclaration()) {
        // Declaration only: absolute (A-form) branch target.
        Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
      } else {
        // Has a body in this unit: PC-relative branch target.
        Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
      }
    } else {
      // "Large memory" mode: Turn all calls into indirect calls with a X-form
      // address pairs:
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
        Callee.getValueType());

    // External symbols: absolute in small-memory mode, indirect otherwise.
    if (!ST->usingLargeMem()) {
      Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
    } else {
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
    }
  } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
    // If this is an absolute destination address that appears to be a legal
    // local store address, use the munged value.
    Callee = SDValue(Dest, 0);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.getNode())
    Ops.push_back(InFlag);
  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
                      &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
                             DAG.getIntPtrConstant(0, true), InFlag);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // If the function returns void, just return the chain.
  if (Ins.empty())
    return Chain;

  // Now handle the return value(s)
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
		    getTargetMachine(), RVLocs, *DAG.getContext());
  CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);


  // If the call has results, copy the values out of the ret val registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                                     InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);
    InVals.push_back(Val);
   }

  return Chain;
}
   1462 
   1463 SDValue
   1464 SPUTargetLowering::LowerReturn(SDValue Chain,
   1465                                CallingConv::ID CallConv, bool isVarArg,
   1466                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1467                                const SmallVectorImpl<SDValue> &OutVals,
   1468                                DebugLoc dl, SelectionDAG &DAG) const {
   1469 
   1470   SmallVector<CCValAssign, 16> RVLocs;
   1471   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1472 		 getTargetMachine(), RVLocs, *DAG.getContext());
   1473   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
   1474 
   1475   // If this is the first return lowered for this function, add the regs to the
   1476   // liveout set for the function.
   1477   if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
   1478     for (unsigned i = 0; i != RVLocs.size(); ++i)
   1479       DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
   1480   }
   1481 
   1482   SDValue Flag;
   1483 
   1484   // Copy the result values into the output registers.
   1485   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1486     CCValAssign &VA = RVLocs[i];
   1487     assert(VA.isRegLoc() && "Can only return in registers!");
   1488     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
   1489                              OutVals[i], Flag);
   1490     Flag = Chain.getValue(1);
   1491   }
   1492 
   1493   if (Flag.getNode())
   1494     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
   1495   else
   1496     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
   1497 }
   1498 
   1499 
   1500 //===----------------------------------------------------------------------===//
   1501 // Vector related lowering:
   1502 //===----------------------------------------------------------------------===//
   1503 
   1504 static ConstantSDNode *
   1505 getVecImm(SDNode *N) {
   1506   SDValue OpVal(0, 0);
   1507 
   1508   // Check to see if this buildvec has a single non-undef value in its elements.
   1509   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
   1510     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
   1511     if (OpVal.getNode() == 0)
   1512       OpVal = N->getOperand(i);
   1513     else if (OpVal != N->getOperand(i))
   1514       return 0;
   1515   }
   1516 
   1517   if (OpVal.getNode() != 0) {
   1518     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
   1519       return CN;
   1520     }
   1521   }
   1522 
   1523   return 0;
   1524 }
   1525 
   1526 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
   1527 /// and the value fits into an unsigned 18-bit constant, and if so, return the
   1528 /// constant
   1529 SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
   1530                               EVT ValueType) {
   1531   if (ConstantSDNode *CN = getVecImm(N)) {
   1532     uint64_t Value = CN->getZExtValue();
   1533     if (ValueType == MVT::i64) {
   1534       uint64_t UValue = CN->getZExtValue();
   1535       uint32_t upper = uint32_t(UValue >> 32);
   1536       uint32_t lower = uint32_t(UValue);
   1537       if (upper != lower)
   1538         return SDValue();
   1539       Value = Value >> 32;
   1540     }
   1541     if (Value <= 0x3ffff)
   1542       return DAG.getTargetConstant(Value, ValueType);
   1543   }
   1544 
   1545   return SDValue();
   1546 }
   1547 
   1548 /// get_vec_i16imm - Test if this vector is a vector filled with the same value
   1549 /// and the value fits into a signed 16-bit constant, and if so, return the
   1550 /// constant
   1551 SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
   1552                               EVT ValueType) {
   1553   if (ConstantSDNode *CN = getVecImm(N)) {
   1554     int64_t Value = CN->getSExtValue();
   1555     if (ValueType == MVT::i64) {
   1556       uint64_t UValue = CN->getZExtValue();
   1557       uint32_t upper = uint32_t(UValue >> 32);
   1558       uint32_t lower = uint32_t(UValue);
   1559       if (upper != lower)
   1560         return SDValue();
   1561       Value = Value >> 32;
   1562     }
   1563     if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
   1564       return DAG.getTargetConstant(Value, ValueType);
   1565     }
   1566   }
   1567 
   1568   return SDValue();
   1569 }
   1570 
   1571 /// get_vec_i10imm - Test if this vector is a vector filled with the same value
   1572 /// and the value fits into a signed 10-bit constant, and if so, return the
   1573 /// constant
   1574 SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
   1575                               EVT ValueType) {
   1576   if (ConstantSDNode *CN = getVecImm(N)) {
   1577     int64_t Value = CN->getSExtValue();
   1578     if (ValueType == MVT::i64) {
   1579       uint64_t UValue = CN->getZExtValue();
   1580       uint32_t upper = uint32_t(UValue >> 32);
   1581       uint32_t lower = uint32_t(UValue);
   1582       if (upper != lower)
   1583         return SDValue();
   1584       Value = Value >> 32;
   1585     }
   1586     if (isInt<10>(Value))
   1587       return DAG.getTargetConstant(Value, ValueType);
   1588   }
   1589 
   1590   return SDValue();
   1591 }
   1592 
/// get_vec_i8imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 8-bit constant, and if so, return the
/// constant.
///
/// @note: The incoming vector is v16i8 because that's the only way we can load
/// constant vectors. Thus, we test to see if the upper and lower bytes are the
/// same value.
SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
                             EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    int Value = (int) CN->getZExtValue();
    // i16 case: both bytes of the halfword must match; the single byte is
    // then emitted as the immediate.
    // NOTE(review): "(short) Value >> 8" sign-extends the upper byte, so a
    // splat like 0xffff (bytes 0xff/0xff) compares -1 != 0xff and is
    // rejected.  This is conservative (a missed match, not a miscompile);
    // confirm whether the sign extension is intentional.
    if (ValueType == MVT::i16
        && Value <= 0xffff                 /* truncated from uint64_t */
        && ((short) Value >> 8) == ((short) Value & 0xff))
      return DAG.getTargetConstant(Value & 0xff, ValueType);
    else if (ValueType == MVT::i8
             && (Value & 0xff) == Value)
      // i8 case: the value must already fit in the low byte.
      return DAG.getTargetConstant(Value, ValueType);
  }

  return SDValue();
}
   1615 
   1616 /// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
   1617 /// and the value fits into a signed 16-bit constant, and if so, return the
   1618 /// constant
   1619 SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
   1620                                EVT ValueType) {
   1621   if (ConstantSDNode *CN = getVecImm(N)) {
   1622     uint64_t Value = CN->getZExtValue();
   1623     if ((ValueType == MVT::i32
   1624           && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
   1625         || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
   1626       return DAG.getTargetConstant(Value >> 16, ValueType);
   1627   }
   1628 
   1629   return SDValue();
   1630 }
   1631 
   1632 /// get_v4i32_imm - Catch-all for general 32-bit constant vectors
   1633 SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
   1634   if (ConstantSDNode *CN = getVecImm(N)) {
   1635     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
   1636   }
   1637 
   1638   return SDValue();
   1639 }
   1640 
   1641 /// get_v4i32_imm - Catch-all for general 64-bit constant vectors
   1642 SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
   1643   if (ConstantSDNode *CN = getVecImm(N)) {
   1644     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
   1645   }
   1646 
   1647   return SDValue();
   1648 }
   1649 
//! Lower a BUILD_VECTOR instruction creatively:
//! Only constant-splat build_vectors are handled here; per element type the
//! splat is re-expressed in a form the SPU instruction selector can match
//! (integer splats for FP vectors, v8i16 expansion for v16i8, and a shuffle
//! synthesis for v2i64 via SPU::LowerV2I64Splat).
static SDValue
LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();
  BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
  unsigned minSplatBits = EltVT.getSizeInBits();

  // Never ask isConstantSplat for anything narrower than 16 bits.
  if (minSplatBits < 16)
    minSplatBits = 16;

  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;

  if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                            HasAnyUndefs, minSplatBits)
      || minSplatBits < SplatBitSize)
    return SDValue();   // Wasn't a constant vector or splat exceeded min

  uint64_t SplatBits = APSplatBits.getZExtValue();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
                       Twine(VT.getEVTString()));
    /*NOTREACHED*/
  case MVT::v4f32: {
    uint32_t Value32 = uint32_t(SplatBits);
    assert(SplatBitSize == 32
           && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(Value32, MVT::i32);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
    break;
  }
  case MVT::v2f64: {
    uint64_t f64val = uint64_t(SplatBits);
    assert(SplatBitSize == 64
           && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(f64val, MVT::i64);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
    break;
  }
  case MVT::v16i8: {
   // 8-bit constants have to be expanded to 16-bits
   unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
   SmallVector<SDValue, 8> Ops;

   // Build the equivalent v8i16 splat and bitcast it back to v16i8.
   Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
  }
  case MVT::v8i16: {
    unsigned short Value16 = SplatBits;
    SDValue T = DAG.getConstant(Value16, EltVT);
    SmallVector<SDValue, 8> Ops;

    Ops.assign(8, T);
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
  }
  case MVT::v4i32: {
    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
  }
  case MVT::v2i64: {
    // 64-bit splats may require shuffle-based synthesis; handled separately.
    return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
  }
  }

  return SDValue();
}
   1727 
   1728 /*!
   1729  */
   1730 SDValue
   1731 SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
   1732                      DebugLoc dl) {
   1733   uint32_t upper = uint32_t(SplatVal >> 32);
   1734   uint32_t lower = uint32_t(SplatVal);
   1735 
   1736   if (upper == lower) {
   1737     // Magic constant that can be matched by IL, ILA, et. al.
   1738     SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
   1739     return DAG.getNode(ISD::BITCAST, dl, OpVT,
   1740                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1741                                    Val, Val, Val, Val));
   1742   } else {
   1743     bool upper_special, lower_special;
   1744 
   1745     // NOTE: This code creates common-case shuffle masks that can be easily
   1746     // detected as common expressions. It is not attempting to create highly
   1747     // specialized masks to replace any and all 0's, 0xff's and 0x80's.
   1748 
   1749     // Detect if the upper or lower half is a special shuffle mask pattern:
   1750     upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
   1751     lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
   1752 
   1753     // Both upper and lower are special, lower to a constant pool load:
   1754     if (lower_special && upper_special) {
   1755       SDValue UpperVal = DAG.getConstant(upper, MVT::i32);
   1756       SDValue LowerVal = DAG.getConstant(lower, MVT::i32);
   1757       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1758                          UpperVal, LowerVal, UpperVal, LowerVal);
   1759       return DAG.getNode(ISD::BITCAST, dl, OpVT, BV);
   1760     }
   1761 
   1762     SDValue LO32;
   1763     SDValue HI32;
   1764     SmallVector<SDValue, 16> ShufBytes;
   1765     SDValue Result;
   1766 
   1767     // Create lower vector if not a special pattern
   1768     if (!lower_special) {
   1769       SDValue LO32C = DAG.getConstant(lower, MVT::i32);
   1770       LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
   1771                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1772                                      LO32C, LO32C, LO32C, LO32C));
   1773     }
   1774 
   1775     // Create upper vector if not a special pattern
   1776     if (!upper_special) {
   1777       SDValue HI32C = DAG.getConstant(upper, MVT::i32);
   1778       HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
   1779                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1780                                      HI32C, HI32C, HI32C, HI32C));
   1781     }
   1782 
   1783     // If either upper or lower are special, then the two input operands are
   1784     // the same (basically, one of them is a "don't care")
   1785     if (lower_special)
   1786       LO32 = HI32;
   1787     if (upper_special)
   1788       HI32 = LO32;
   1789 
   1790     for (int i = 0; i < 4; ++i) {
   1791       uint64_t val = 0;
   1792       for (int j = 0; j < 4; ++j) {
   1793         SDValue V;
   1794         bool process_upper, process_lower;
   1795         val <<= 8;
   1796         process_upper = (upper_special && (i & 1) == 0);
   1797         process_lower = (lower_special && (i & 1) == 1);
   1798 
   1799         if (process_upper || process_lower) {
   1800           if ((process_upper && upper == 0)
   1801                   || (process_lower && lower == 0))
   1802             val |= 0x80;
   1803           else if ((process_upper && upper == 0xffffffff)
   1804                   || (process_lower && lower == 0xffffffff))
   1805             val |= 0xc0;
   1806           else if ((process_upper && upper == 0x80000000)
   1807                   || (process_lower && lower == 0x80000000))
   1808             val |= (j == 0 ? 0xe0 : 0x80);
   1809         } else
   1810           val |= i * 4 + j + ((i & 1) * 16);
   1811       }
   1812 
   1813       ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
   1814     }
   1815 
   1816     return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
   1817                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   1818                                    &ShufBytes[0], ShufBytes.size()));
   1819   }
   1820 }
   1821 
/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
/// which the Cell can operate. The code inspects V3 to ascertain whether the
/// permutation vector, V3, is monotonically increasing with one "exception"
/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
/// In either case, the net result is going to eventually invoke SHUFB to
/// permute/shuffle the bytes from V1 and V2.
/// \note
/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
/// control word for byte/halfword/word insertion. This takes care of a single
/// element move from V2 into V1.
/// \note
/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();

  // An undef second operand behaves as a duplicate of the first.
  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;

  // If we have a single element being moved from V1 to V2, this can be handled
  // using the C*[DX] compute mask instructions, but the vector elements have
  // to be monotonically increasing with one exception element, and the source
  // slot of the element to move must be the same as the destination.
  EVT VecVT = V1.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned EltsFromV2 = 0;        // how many mask slots reference V2
  unsigned V2EltOffset = 0;       // byte offset of the single V2 element
  unsigned V2EltIdx0 = 0;         // first mask index that refers to V2
  unsigned CurrElt = 0;           // expected next index for monotonicity
  unsigned MaxElts = VecVT.getVectorNumElements();
  unsigned PrevElt = 0;
  bool monotonic = true;          // candidate for C*D insert-mask lowering
  bool rotate = true;             // candidate for ROTBYTES_LEFT lowering
  int rotamt=0;
  EVT maskVT;             // which of the c?d instructions to use

  if (EltVT == MVT::i8) {
    V2EltIdx0 = 16;
    maskVT = MVT::v16i8;
  } else if (EltVT == MVT::i16) {
    V2EltIdx0 = 8;
    maskVT = MVT::v8i16;
  } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
    V2EltIdx0 = 4;
    maskVT = MVT::v4i32;
  } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
    V2EltIdx0 = 2;
    maskVT = MVT::v2i64;
  } else
    llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");

  // Classify the mask: either monotonic-with-one-V2-element or a rotation.
  for (unsigned i = 0; i != MaxElts; ++i) {
    if (SVN->getMaskElt(i) < 0)
      continue;                   // undef mask slots are ignored

    unsigned SrcElt = SVN->getMaskElt(i);

    if (monotonic) {
      if (SrcElt >= V2EltIdx0) {
        // TODO: optimize for the monotonic case when several consecutive
        // elements are taken form V2. Do we ever get such a case?
        if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
          V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
        else
          monotonic = false;
        ++EltsFromV2;
      } else if (CurrElt != SrcElt) {
        monotonic = false;
      }

      ++CurrElt;
    }

    if (rotate) {
      if (PrevElt > 0 && SrcElt < MaxElts) {
        // Each index must follow its predecessor by one (with wrap-around).
        if ((PrevElt == SrcElt - 1)
            || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
          PrevElt = SrcElt;
        } else {
          rotate = false;
        }
      } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
        // First time or after a "wrap around"
        // NOTE(review): SrcElt and i are unsigned, so SrcElt-i wraps before
        // the conversion to int; the later "rotamt < 0" test appears to rely
        // on that wrap producing a negative int -- confirm intended.
        rotamt = SrcElt-i;
        PrevElt = SrcElt;
      } else {
        // This isn't a rotation, takes elements from vector 2
        rotate = false;
      }
    }
  }

  if (EltsFromV2 == 1 && monotonic) {
    // Compute mask and shuffle
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

    // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
    // R1 ($sp) is used here only as it is guaranteed to have last bits zero
    SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                DAG.getRegister(SPU::R1, PtrVT),
                                DAG.getConstant(V2EltOffset, MVT::i32));
    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
                                     maskVT, Pointer);

    // Use shuffle mask in SHUFB synthetic instruction:
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
                       ShufMaskOp);
  } else if (rotate) {
    // Normalize the rotate amount and convert from elements to bytes.
    if (rotamt < 0)
      rotamt +=MaxElts;
    rotamt *= EltVT.getSizeInBits()/8;
    return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
                       V1, DAG.getConstant(rotamt, MVT::i16));
  } else {
   // Convert the SHUFFLE_VECTOR mask's input element units to the
   // actual bytes.
    unsigned BytesPerElement = EltVT.getSizeInBits()/8;

    SmallVector<SDValue, 16> ResultMask;
    for (unsigned i = 0, e = MaxElts; i != e; ++i) {
      // Undef mask slots default to element 0.
      unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);

      for (unsigned j = 0; j < BytesPerElement; ++j)
        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
    }
    SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
                                    &ResultMask[0], ResultMask.size());
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
  }
}
   1954 
   1955 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   1956   SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
   1957   DebugLoc dl = Op.getDebugLoc();
   1958 
   1959   if (Op0.getNode()->getOpcode() == ISD::Constant) {
   1960     // For a constant, build the appropriate constant vector, which will
   1961     // eventually simplify to a vector register load.
   1962 
   1963     ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
   1964     SmallVector<SDValue, 16> ConstVecValues;
   1965     EVT VT;
   1966     size_t n_copies;
   1967 
   1968     // Create a constant vector:
   1969     switch (Op.getValueType().getSimpleVT().SimpleTy) {
   1970     default: llvm_unreachable("Unexpected constant value type in "
   1971                               "LowerSCALAR_TO_VECTOR");
   1972     case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
   1973     case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
   1974     case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
   1975     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
   1976     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
   1977     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
   1978     }
   1979 
   1980     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
   1981     for (size_t j = 0; j < n_copies; ++j)
   1982       ConstVecValues.push_back(CValue);
   1983 
   1984     return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
   1985                        &ConstVecValues[0], ConstVecValues.size());
   1986   } else {
   1987     // Otherwise, copy the value from one register to another:
   1988     switch (Op0.getValueType().getSimpleVT().SimpleTy) {
   1989     default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
   1990     case MVT::i8:
   1991     case MVT::i16:
   1992     case MVT::i32:
   1993     case MVT::i64:
   1994     case MVT::f32:
   1995     case MVT::f64:
   1996       return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
   1997     }
   1998   }
   1999 
   2000   return SDValue();
   2001 }
   2002 
   2003 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   2004   EVT VT = Op.getValueType();
   2005   SDValue N = Op.getOperand(0);
   2006   SDValue Elt = Op.getOperand(1);
   2007   DebugLoc dl = Op.getDebugLoc();
   2008   SDValue retval;
   2009 
   2010   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
   2011     // Constant argument:
   2012     int EltNo = (int) C->getZExtValue();
   2013 
   2014     // sanity checks:
   2015     if (VT == MVT::i8 && EltNo >= 16)
   2016       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
   2017     else if (VT == MVT::i16 && EltNo >= 8)
   2018       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
   2019     else if (VT == MVT::i32 && EltNo >= 4)
   2020       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
   2021     else if (VT == MVT::i64 && EltNo >= 2)
   2022       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
   2023 
   2024     if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
   2025       // i32 and i64: Element 0 is the preferred slot
   2026       return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
   2027     }
   2028 
   2029     // Need to generate shuffle mask and extract:
   2030     int prefslot_begin = -1, prefslot_end = -1;
   2031     int elt_byte = EltNo * VT.getSizeInBits() / 8;
   2032 
   2033     switch (VT.getSimpleVT().SimpleTy) {
   2034     default:
   2035       assert(false && "Invalid value type!");
   2036     case MVT::i8: {
   2037       prefslot_begin = prefslot_end = 3;
   2038       break;
   2039     }
   2040     case MVT::i16: {
   2041       prefslot_begin = 2; prefslot_end = 3;
   2042       break;
   2043     }
   2044     case MVT::i32:
   2045     case MVT::f32: {
   2046       prefslot_begin = 0; prefslot_end = 3;
   2047       break;
   2048     }
   2049     case MVT::i64:
   2050     case MVT::f64: {
   2051       prefslot_begin = 0; prefslot_end = 7;
   2052       break;
   2053     }
   2054     }
   2055 
   2056     assert(prefslot_begin != -1 && prefslot_end != -1 &&
   2057            "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
   2058 
   2059     unsigned int ShufBytes[16] = {
   2060       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
   2061     };
   2062     for (int i = 0; i < 16; ++i) {
   2063       // zero fill uppper part of preferred slot, don't care about the
   2064       // other slots:
   2065       unsigned int mask_val;
   2066       if (i <= prefslot_end) {
   2067         mask_val =
   2068           ((i < prefslot_begin)
   2069            ? 0x80
   2070            : elt_byte + (i - prefslot_begin));
   2071 
   2072         ShufBytes[i] = mask_val;
   2073       } else
   2074         ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
   2075     }
   2076 
   2077     SDValue ShufMask[4];
   2078     for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
   2079       unsigned bidx = i * 4;
   2080       unsigned int bits = ((ShufBytes[bidx] << 24) |
   2081                            (ShufBytes[bidx+1] << 16) |
   2082                            (ShufBytes[bidx+2] << 8) |
   2083                            ShufBytes[bidx+3]);
   2084       ShufMask[i] = DAG.getConstant(bits, MVT::i32);
   2085     }
   2086 
   2087     SDValue ShufMaskVec =
   2088       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2089                   &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
   2090 
   2091     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   2092                          DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
   2093                                      N, N, ShufMaskVec));
   2094   } else {
   2095     // Variable index: Rotate the requested element into slot 0, then replicate
   2096     // slot 0 across the vector
   2097     EVT VecVT = N.getValueType();
   2098     if (!VecVT.isSimple() || !VecVT.isVector()) {
   2099       report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
   2100                         "vector type!");
   2101     }
   2102 
   2103     // Make life easier by making sure the index is zero-extended to i32
   2104     if (Elt.getValueType() != MVT::i32)
   2105       Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
   2106 
   2107     // Scale the index to a bit/byte shift quantity
   2108     APInt scaleFactor =
   2109             APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
   2110     unsigned scaleShift = scaleFactor.logBase2();
   2111     SDValue vecShift;
   2112 
   2113     if (scaleShift > 0) {
   2114       // Scale the shift factor:
   2115       Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
   2116                         DAG.getConstant(scaleShift, MVT::i32));
   2117     }
   2118 
   2119     vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
   2120 
   2121     // Replicate the bytes starting at byte 0 across the entire vector (for
   2122     // consistency with the notion of a unified register set)
   2123     SDValue replicate;
   2124 
   2125     switch (VT.getSimpleVT().SimpleTy) {
   2126     default:
   2127       report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
   2128                         "type");
   2129       /*NOTREACHED*/
   2130     case MVT::i8: {
   2131       SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
   2132       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2133                               factor, factor, factor, factor);
   2134       break;
   2135     }
   2136     case MVT::i16: {
   2137       SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
   2138       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2139                               factor, factor, factor, factor);
   2140       break;
   2141     }
   2142     case MVT::i32:
   2143     case MVT::f32: {
   2144       SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
   2145       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2146                               factor, factor, factor, factor);
   2147       break;
   2148     }
   2149     case MVT::i64:
   2150     case MVT::f64: {
   2151       SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
   2152       SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
   2153       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2154                               loFactor, hiFactor, loFactor, hiFactor);
   2155       break;
   2156     }
   2157     }
   2158 
   2159     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
   2160                          DAG.getNode(SPUISD::SHUFB, dl, VecVT,
   2161                                      vecShift, vecShift, replicate));
   2162   }
   2163 
   2164   return retval;
   2165 }
   2166 
//! Custom lower ISD::INSERT_VECTOR_ELT.
/*!
  Generates a SHUFFLE_MASK keyed off an address (the stack pointer plus the
  insertion byte offset) and uses SHUFB to merge the scalar into the vector.
  The address itself is never dereferenced; it only parameterizes the mask.
 */
static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  SDValue VecOp = Op.getOperand(0);   // vector being inserted into
  SDValue ValOp = Op.getOperand(1);   // scalar value to insert
  SDValue IdxOp = Op.getOperand(2);   // lane index (constant or undef)
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT eltVT = ValOp.getValueType();

  // use 0 when the lane to insert to is 'undef'
  int64_t Offset=0;
  if (IdxOp.getOpcode() != ISD::UNDEF) {
    ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
    assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
    // Byte offset of the target lane within the 16-byte register.
    Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
  }

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  // Use $sp ($1) because it's always 16-byte aligned and it's available:
  SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                DAG.getRegister(SPU::R1, PtrVT),
                                DAG.getConstant(Offset, PtrVT));
  // widen the mask when dealing with half vectors
  EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
                                128/ VT.getVectorElementType().getSizeInBits());
  SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);

  // SHUFB arguments: the scalar (splatted into a vector), the original
  // vector, and the mask selecting which bytes come from which operand.
  SDValue result =
    DAG.getNode(SPUISD::SHUFB, dl, VT,
                DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
                VecOp,
                DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));

  return result;
}
   2201 
   2202 static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
   2203                            const TargetLowering &TLI)
   2204 {
   2205   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
   2206   DebugLoc dl = Op.getDebugLoc();
   2207   EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
   2208 
   2209   assert(Op.getValueType() == MVT::i8);
   2210   switch (Opc) {
   2211   default:
   2212     llvm_unreachable("Unhandled i8 math operator");
   2213     /*NOTREACHED*/
   2214     break;
   2215   case ISD::ADD: {
   2216     // 8-bit addition: Promote the arguments up to 16-bits and truncate
   2217     // the result:
   2218     SDValue N1 = Op.getOperand(1);
   2219     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2220     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2221     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2222                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2223 
   2224   }
   2225 
   2226   case ISD::SUB: {
   2227     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
   2228     // the result:
   2229     SDValue N1 = Op.getOperand(1);
   2230     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2231     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2232     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2233                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2234   }
   2235   case ISD::ROTR:
   2236   case ISD::ROTL: {
   2237     SDValue N1 = Op.getOperand(1);
   2238     EVT N1VT = N1.getValueType();
   2239 
   2240     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
   2241     if (!N1VT.bitsEq(ShiftVT)) {
   2242       unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
   2243                        ? ISD::ZERO_EXTEND
   2244                        : ISD::TRUNCATE;
   2245       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2246     }
   2247 
   2248     // Replicate lower 8-bits into upper 8:
   2249     SDValue ExpandArg =
   2250       DAG.getNode(ISD::OR, dl, MVT::i16, N0,
   2251                   DAG.getNode(ISD::SHL, dl, MVT::i16,
   2252                               N0, DAG.getConstant(8, MVT::i32)));
   2253 
   2254     // Truncate back down to i8
   2255     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2256                        DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
   2257   }
   2258   case ISD::SRL:
   2259   case ISD::SHL: {
   2260     SDValue N1 = Op.getOperand(1);
   2261     EVT N1VT = N1.getValueType();
   2262 
   2263     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
   2264     if (!N1VT.bitsEq(ShiftVT)) {
   2265       unsigned N1Opc = ISD::ZERO_EXTEND;
   2266 
   2267       if (N1.getValueType().bitsGT(ShiftVT))
   2268         N1Opc = ISD::TRUNCATE;
   2269 
   2270       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2271     }
   2272 
   2273     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2274                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2275   }
   2276   case ISD::SRA: {
   2277     SDValue N1 = Op.getOperand(1);
   2278     EVT N1VT = N1.getValueType();
   2279 
   2280     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2281     if (!N1VT.bitsEq(ShiftVT)) {
   2282       unsigned N1Opc = ISD::SIGN_EXTEND;
   2283 
   2284       if (N1VT.bitsGT(ShiftVT))
   2285         N1Opc = ISD::TRUNCATE;
   2286       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
   2287     }
   2288 
   2289     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2290                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2291   }
   2292   case ISD::MUL: {
   2293     SDValue N1 = Op.getOperand(1);
   2294 
   2295     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
   2296     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
   2297     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
   2298                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
   2299     break;
   2300   }
   2301   }
   2302 
   2303   return SDValue();
   2304 }
   2305 
   2306 //! Lower byte immediate operations for v16i8 vectors:
   2307 static SDValue
   2308 LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
   2309   SDValue ConstVec;
   2310   SDValue Arg;
   2311   EVT VT = Op.getValueType();
   2312   DebugLoc dl = Op.getDebugLoc();
   2313 
   2314   ConstVec = Op.getOperand(0);
   2315   Arg = Op.getOperand(1);
   2316   if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
   2317     if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
   2318       ConstVec = ConstVec.getOperand(0);
   2319     } else {
   2320       ConstVec = Op.getOperand(1);
   2321       Arg = Op.getOperand(0);
   2322       if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
   2323         ConstVec = ConstVec.getOperand(0);
   2324       }
   2325     }
   2326   }
   2327 
   2328   if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
   2329     BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
   2330     assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");
   2331 
   2332     APInt APSplatBits, APSplatUndef;
   2333     unsigned SplatBitSize;
   2334     bool HasAnyUndefs;
   2335     unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
   2336 
   2337     if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
   2338                               HasAnyUndefs, minSplatBits)
   2339         && minSplatBits <= SplatBitSize) {
   2340       uint64_t SplatBits = APSplatBits.getZExtValue();
   2341       SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
   2342 
   2343       SmallVector<SDValue, 16> tcVec;
   2344       tcVec.assign(16, tc);
   2345       return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
   2346                          DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
   2347     }
   2348   }
   2349 
   2350   // These operations (AND, OR, XOR) are legal, they just couldn't be custom
   2351   // lowered.  Return the operation, rather than a null SDValue.
   2352   return Op;
   2353 }
   2354 
   2355 //! Custom lowering for CTPOP (count population)
   2356 /*!
   2357   Custom lowering code that counts the number ones in the input
   2358   operand. SPU has such an instruction, but it counts the number of
   2359   ones per byte, which then have to be accumulated.
   2360 */
   2361 static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
   2362   EVT VT = Op.getValueType();
   2363   EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
   2364                                VT, (128 / VT.getSizeInBits()));
   2365   DebugLoc dl = Op.getDebugLoc();
   2366 
   2367   switch (VT.getSimpleVT().SimpleTy) {
   2368   default:
   2369     assert(false && "Invalid value type!");
   2370   case MVT::i8: {
   2371     SDValue N = Op.getOperand(0);
   2372     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
   2373 
   2374     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2375     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2376 
   2377     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
   2378   }
   2379 
   2380   case MVT::i16: {
   2381     MachineFunction &MF = DAG.getMachineFunction();
   2382     MachineRegisterInfo &RegInfo = MF.getRegInfo();
   2383 
   2384     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
   2385 
   2386     SDValue N = Op.getOperand(0);
   2387     SDValue Elt0 = DAG.getConstant(0, MVT::i16);
   2388     SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
   2389     SDValue Shift1 = DAG.getConstant(8, MVT::i32);
   2390 
   2391     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2392     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2393 
   2394     // CNTB_result becomes the chain to which all of the virtual registers
   2395     // CNTB_reg, SUM1_reg become associated:
   2396     SDValue CNTB_result =
   2397       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
   2398 
   2399     SDValue CNTB_rescopy =
   2400       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
   2401 
   2402     SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
   2403 
   2404     return DAG.getNode(ISD::AND, dl, MVT::i16,
   2405                        DAG.getNode(ISD::ADD, dl, MVT::i16,
   2406                                    DAG.getNode(ISD::SRL, dl, MVT::i16,
   2407                                                Tmp1, Shift1),
   2408                                    Tmp1),
   2409                        Mask0);
   2410   }
   2411 
   2412   case MVT::i32: {
   2413     MachineFunction &MF = DAG.getMachineFunction();
   2414     MachineRegisterInfo &RegInfo = MF.getRegInfo();
   2415 
   2416     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
   2417     unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
   2418 
   2419     SDValue N = Op.getOperand(0);
   2420     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
   2421     SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
   2422     SDValue Shift1 = DAG.getConstant(16, MVT::i32);
   2423     SDValue Shift2 = DAG.getConstant(8, MVT::i32);
   2424 
   2425     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
   2426     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
   2427 
   2428     // CNTB_result becomes the chain to which all of the virtual registers
   2429     // CNTB_reg, SUM1_reg become associated:
   2430     SDValue CNTB_result =
   2431       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
   2432 
   2433     SDValue CNTB_rescopy =
   2434       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
   2435 
   2436     SDValue Comp1 =
   2437       DAG.getNode(ISD::SRL, dl, MVT::i32,
   2438                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
   2439                   Shift1);
   2440 
   2441     SDValue Sum1 =
   2442       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
   2443                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
   2444 
   2445     SDValue Sum1_rescopy =
   2446       DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
   2447 
   2448     SDValue Comp2 =
   2449       DAG.getNode(ISD::SRL, dl, MVT::i32,
   2450                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
   2451                   Shift2);
   2452     SDValue Sum2 =
   2453       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
   2454                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
   2455 
   2456     return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
   2457   }
   2458 
   2459   case MVT::i64:
   2460     break;
   2461   }
   2462 
   2463   return SDValue();
   2464 }
   2465 
   2466 //! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
   2467 /*!
   2468  f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
   2469  All conversions to i64 are expanded to a libcall.
   2470  */
   2471 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   2472                               const SPUTargetLowering &TLI) {
   2473   EVT OpVT = Op.getValueType();
   2474   SDValue Op0 = Op.getOperand(0);
   2475   EVT Op0VT = Op0.getValueType();
   2476 
   2477   if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
   2478       || OpVT == MVT::i64) {
   2479     // Convert f32 / f64 to i32 / i64 via libcall.
   2480     RTLIB::Libcall LC =
   2481             (Op.getOpcode() == ISD::FP_TO_SINT)
   2482              ? RTLIB::getFPTOSINT(Op0VT, OpVT)
   2483              : RTLIB::getFPTOUINT(Op0VT, OpVT);
   2484     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
   2485     SDValue Dummy;
   2486     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
   2487   }
   2488 
   2489   return Op;
   2490 }
   2491 
   2492 //! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
   2493 /*!
   2494  i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
   2495  All conversions from i64 are expanded to a libcall.
   2496  */
   2497 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
   2498                               const SPUTargetLowering &TLI) {
   2499   EVT OpVT = Op.getValueType();
   2500   SDValue Op0 = Op.getOperand(0);
   2501   EVT Op0VT = Op0.getValueType();
   2502 
   2503   if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
   2504       || Op0VT == MVT::i64) {
   2505     // Convert i32, i64 to f64 via libcall:
   2506     RTLIB::Libcall LC =
   2507             (Op.getOpcode() == ISD::SINT_TO_FP)
   2508              ? RTLIB::getSINTTOFP(Op0VT, OpVT)
   2509              : RTLIB::getUINTTOFP(Op0VT, OpVT);
   2510     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
   2511     SDValue Dummy;
   2512     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
   2513   }
   2514 
   2515   return Op;
   2516 }
   2517 
   2518 //! Lower ISD::SETCC
   2519 /*!
   2520  This handles MVT::f64 (double floating point) condition lowering
   2521  */
   2522 static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
   2523                           const TargetLowering &TLI) {
   2524   CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
   2525   DebugLoc dl = Op.getDebugLoc();
   2526   assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
   2527 
   2528   SDValue lhs = Op.getOperand(0);
   2529   SDValue rhs = Op.getOperand(1);
   2530   EVT lhsVT = lhs.getValueType();
   2531   assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
   2532 
   2533   EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
   2534   APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
   2535   EVT IntVT(MVT::i64);
   2536 
   2537   // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
   2538   // selected to a NOP:
   2539   SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
   2540   SDValue lhsHi32 =
   2541           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   2542                       DAG.getNode(ISD::SRL, dl, IntVT,
   2543                                   i64lhs, DAG.getConstant(32, MVT::i32)));
   2544   SDValue lhsHi32abs =
   2545           DAG.getNode(ISD::AND, dl, MVT::i32,
   2546                       lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
   2547   SDValue lhsLo32 =
   2548           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
   2549 
   2550   // SETO and SETUO only use the lhs operand:
   2551   if (CC->get() == ISD::SETO) {
   2552     // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
   2553     // SETUO
   2554     APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
   2555     return DAG.getNode(ISD::XOR, dl, ccResultVT,
   2556                        DAG.getSetCC(dl, ccResultVT,
   2557                                     lhs, DAG.getConstantFP(0.0, lhsVT),
   2558                                     ISD::SETUO),
   2559                        DAG.getConstant(ccResultAllOnes, ccResultVT));
   2560   } else if (CC->get() == ISD::SETUO) {
   2561     // Evaluates to true if Op0 is [SQ]NaN
   2562     return DAG.getNode(ISD::AND, dl, ccResultVT,
   2563                        DAG.getSetCC(dl, ccResultVT,
   2564                                     lhsHi32abs,
   2565                                     DAG.getConstant(0x7ff00000, MVT::i32),
   2566                                     ISD::SETGE),
   2567                        DAG.getSetCC(dl, ccResultVT,
   2568                                     lhsLo32,
   2569                                     DAG.getConstant(0, MVT::i32),
   2570                                     ISD::SETGT));
   2571   }
   2572 
   2573   SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
   2574   SDValue rhsHi32 =
   2575           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   2576                       DAG.getNode(ISD::SRL, dl, IntVT,
   2577                                   i64rhs, DAG.getConstant(32, MVT::i32)));
   2578 
   2579   // If a value is negative, subtract from the sign magnitude constant:
   2580   SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
   2581 
   2582   // Convert the sign-magnitude representation into 2's complement:
   2583   SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
   2584                                       lhsHi32, DAG.getConstant(31, MVT::i32));
   2585   SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
   2586   SDValue lhsSelect =
   2587           DAG.getNode(ISD::SELECT, dl, IntVT,
   2588                       lhsSelectMask, lhsSignMag2TC, i64lhs);
   2589 
   2590   SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
   2591                                       rhsHi32, DAG.getConstant(31, MVT::i32));
   2592   SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
   2593   SDValue rhsSelect =
   2594           DAG.getNode(ISD::SELECT, dl, IntVT,
   2595                       rhsSelectMask, rhsSignMag2TC, i64rhs);
   2596 
   2597   unsigned compareOp;
   2598 
   2599   switch (CC->get()) {
   2600   case ISD::SETOEQ:
   2601   case ISD::SETUEQ:
   2602     compareOp = ISD::SETEQ; break;
   2603   case ISD::SETOGT:
   2604   case ISD::SETUGT:
   2605     compareOp = ISD::SETGT; break;
   2606   case ISD::SETOGE:
   2607   case ISD::SETUGE:
   2608     compareOp = ISD::SETGE; break;
   2609   case ISD::SETOLT:
   2610   case ISD::SETULT:
   2611     compareOp = ISD::SETLT; break;
   2612   case ISD::SETOLE:
   2613   case ISD::SETULE:
   2614     compareOp = ISD::SETLE; break;
   2615   case ISD::SETUNE:
   2616   case ISD::SETONE:
   2617     compareOp = ISD::SETNE; break;
   2618   default:
   2619     report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
   2620   }
   2621 
   2622   SDValue result =
   2623           DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
   2624                        (ISD::CondCode) compareOp);
   2625 
   2626   if ((CC->get() & 0x8) == 0) {
   2627     // Ordered comparison:
   2628     SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
   2629                                   lhs, DAG.getConstantFP(0.0, MVT::f64),
   2630                                   ISD::SETO);
   2631     SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
   2632                                   rhs, DAG.getConstantFP(0.0, MVT::f64),
   2633                                   ISD::SETO);
   2634     SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
   2635 
   2636     result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
   2637   }
   2638 
   2639   return result;
   2640 }
   2641 
   2642 //! Lower ISD::SELECT_CC
   2643 /*!
   2644   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
   2645   SELB instruction.
   2646 
   2647   \note Need to revisit this in the future: if the code path through the true
   2648   and false value computations is longer than the latency of a branch (6
   2649   cycles), then it would be more advantageous to branch and insert a new basic
   2650   block and branch on the condition. However, this code does not make that
   2651   assumption, given the simplisitc uses so far.
   2652  */
   2653 
   2654 static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
   2655                               const TargetLowering &TLI) {
   2656   EVT VT = Op.getValueType();
   2657   SDValue lhs = Op.getOperand(0);
   2658   SDValue rhs = Op.getOperand(1);
   2659   SDValue trueval = Op.getOperand(2);
   2660   SDValue falseval = Op.getOperand(3);
   2661   SDValue condition = Op.getOperand(4);
   2662   DebugLoc dl = Op.getDebugLoc();
   2663 
   2664   // NOTE: SELB's arguments: $rA, $rB, $mask
   2665   //
   2666   // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
   2667   // where bits in $mask are 1. CCond will be inverted, having 1s where the
   2668   // condition was true and 0s where the condition was false. Hence, the
   2669   // arguments to SELB get reversed.
   2670 
   2671   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
   2672   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
   2673   // with another "cannot select select_cc" assert:
   2674 
   2675   SDValue compare = DAG.getNode(ISD::SETCC, dl,
   2676                                 TLI.getSetCCResultType(Op.getValueType()),
   2677                                 lhs, rhs, condition);
   2678   return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
   2679 }
   2680 
   2681 //! Custom lower ISD::TRUNCATE
   2682 static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
   2683 {
   2684   // Type to truncate to
   2685   EVT VT = Op.getValueType();
   2686   MVT simpleVT = VT.getSimpleVT();
   2687   EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
   2688                                VT, (128 / VT.getSizeInBits()));
   2689   DebugLoc dl = Op.getDebugLoc();
   2690 
   2691   // Type to truncate from
   2692   SDValue Op0 = Op.getOperand(0);
   2693   EVT Op0VT = Op0.getValueType();
   2694 
   2695   if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
   2696     // Create shuffle mask, least significant doubleword of quadword
   2697     unsigned maskHigh = 0x08090a0b;
   2698     unsigned maskLow = 0x0c0d0e0f;
   2699     // Use a shuffle to perform the truncation
   2700     SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   2701                                    DAG.getConstant(maskHigh, MVT::i32),
   2702                                    DAG.getConstant(maskLow, MVT::i32),
   2703                                    DAG.getConstant(maskHigh, MVT::i32),
   2704                                    DAG.getConstant(maskLow, MVT::i32));
   2705 
   2706     SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
   2707                                        Op0, Op0, shufMask);
   2708 
   2709     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
   2710   }
   2711 
   2712   return SDValue();             // Leave the truncate unmolested
   2713 }
   2714 
   2715 /*!
   2716  * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
   2717  * algorithm is to duplicate the sign bit using rotmai to generate at
   2718  * least one byte full of sign bits. Then propagate the "sign-byte" into
   2719  * the leftmost words and the i64/i32 into the rightmost words using shufb.
   2720  *
   2721  * @param Op The sext operand
   2722  * @param DAG The current DAG
   2723  * @return The SDValue with the entire instruction sequence
   2724  */
static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
{
  DebugLoc dl = Op.getDebugLoc();

  // Type to extend to
  MVT OpVT = Op.getValueType().getSimpleVT();

  // Type to extend from
  SDValue Op0 = Op.getOperand(0);
  MVT Op0VT = Op0.getValueType().getSimpleVT();

  // extend i8 & i16 via i32
  if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
    Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
    Op0VT = MVT::i32;
  }

  // The type to extend to needs to be a i128 and
  // the type to extend from needs to be i64 or i32.
  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
          "LowerSIGN_EXTEND: input and/or output operand have wrong size");
  (void)OpVT;

  // Create shuffle mask: 0x10101010 selects the sign byte produced by the
  // SRA below; 0x00010203/0x04050607 select the source value's own bytes.
  unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask2, MVT::i32),
                                 DAG.getConstant(mask3, MVT::i32));

  // Word wise arithmetic right shift to generate at least one byte
  // that contains sign bits.
  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
  SDValue sraVal = DAG.getNode(ISD::SRA,
                 dl,
                 mvt,
                 DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
                 DAG.getConstant(31, MVT::i32));

  // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
  // COPY_TO_REGCLASS into GPRC reinterprets the value without moving it.
  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                        dl, Op0VT, Op0,
                                        DAG.getTargetConstant(
                                                  SPU::GPRCRegClass.getID(),
                                                  MVT::i32)), 0);
  // Shuffle bytes - Copy the sign bits into the upper 64 bits
  // and the input value into the lower 64 bits.
  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
        extended, sraVal, shufMask);
  return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
}
   2779 
   2780 //! Custom (target-specific) lowering entry point
   2781 /*!
   2782   This is where LLVM's DAG selection process calls to do target-specific
   2783   lowering of nodes.
   2784  */
   2785 SDValue
   2786 SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
   2787 {
   2788   unsigned Opc = (unsigned) Op.getOpcode();
   2789   EVT VT = Op.getValueType();
   2790 
   2791   switch (Opc) {
   2792   default: {
   2793 #ifndef NDEBUG
   2794     errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
   2795     errs() << "Op.getOpcode() = " << Opc << "\n";
   2796     errs() << "*Op.getNode():\n";
   2797     Op.getNode()->dump();
   2798 #endif
   2799     llvm_unreachable(0);
   2800   }
   2801   case ISD::LOAD:
   2802   case ISD::EXTLOAD:
   2803   case ISD::SEXTLOAD:
   2804   case ISD::ZEXTLOAD:
   2805     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
   2806   case ISD::STORE:
   2807     return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
   2808   case ISD::ConstantPool:
   2809     return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
   2810   case ISD::GlobalAddress:
   2811     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
   2812   case ISD::JumpTable:
   2813     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
   2814   case ISD::ConstantFP:
   2815     return LowerConstantFP(Op, DAG);
   2816 
   2817   // i8, i64 math ops:
   2818   case ISD::ADD:
   2819   case ISD::SUB:
   2820   case ISD::ROTR:
   2821   case ISD::ROTL:
   2822   case ISD::SRL:
   2823   case ISD::SHL:
   2824   case ISD::SRA: {
   2825     if (VT == MVT::i8)
   2826       return LowerI8Math(Op, DAG, Opc, *this);
   2827     break;
   2828   }
   2829 
   2830   case ISD::FP_TO_SINT:
   2831   case ISD::FP_TO_UINT:
   2832     return LowerFP_TO_INT(Op, DAG, *this);
   2833 
   2834   case ISD::SINT_TO_FP:
   2835   case ISD::UINT_TO_FP:
   2836     return LowerINT_TO_FP(Op, DAG, *this);
   2837 
   2838   // Vector-related lowering.
   2839   case ISD::BUILD_VECTOR:
   2840     return LowerBUILD_VECTOR(Op, DAG);
   2841   case ISD::SCALAR_TO_VECTOR:
   2842     return LowerSCALAR_TO_VECTOR(Op, DAG);
   2843   case ISD::VECTOR_SHUFFLE:
   2844     return LowerVECTOR_SHUFFLE(Op, DAG);
   2845   case ISD::EXTRACT_VECTOR_ELT:
   2846     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   2847   case ISD::INSERT_VECTOR_ELT:
   2848     return LowerINSERT_VECTOR_ELT(Op, DAG);
   2849 
   2850   // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
   2851   case ISD::AND:
   2852   case ISD::OR:
   2853   case ISD::XOR:
   2854     return LowerByteImmed(Op, DAG);
   2855 
   2856   // Vector and i8 multiply:
   2857   case ISD::MUL:
   2858     if (VT == MVT::i8)
   2859       return LowerI8Math(Op, DAG, Opc, *this);
   2860 
   2861   case ISD::CTPOP:
   2862     return LowerCTPOP(Op, DAG);
   2863 
   2864   case ISD::SELECT_CC:
   2865     return LowerSELECT_CC(Op, DAG, *this);
   2866 
   2867   case ISD::SETCC:
   2868     return LowerSETCC(Op, DAG, *this);
   2869 
   2870   case ISD::TRUNCATE:
   2871     return LowerTRUNCATE(Op, DAG);
   2872 
   2873   case ISD::SIGN_EXTEND:
   2874     return LowerSIGN_EXTEND(Op, DAG);
   2875   }
   2876 
   2877   return SDValue();
   2878 }
   2879 
   2880 void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
   2881                                            SmallVectorImpl<SDValue>&Results,
   2882                                            SelectionDAG &DAG) const
   2883 {
   2884 #if 0
   2885   unsigned Opc = (unsigned) N->getOpcode();
   2886   EVT OpVT = N->getValueType(0);
   2887 
   2888   switch (Opc) {
   2889   default: {
   2890     errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
   2891     errs() << "Op.getOpcode() = " << Opc << "\n";
   2892     errs() << "*Op.getNode():\n";
   2893     N->dump();
   2894     abort();
   2895     /*NOTREACHED*/
   2896   }
   2897   }
   2898 #endif
   2899 
   2900   /* Otherwise, return unchanged */
   2901 }
   2902 
   2903 //===----------------------------------------------------------------------===//
   2904 // Target Optimization Hooks
   2905 //===----------------------------------------------------------------------===//
   2906 
//! Perform SPU-specific DAG combines
/*!
  Value-preserving folds applied here:
    - (add (SPUindirect x, y), 0)         -> (SPUindirect x, y)
    - (add (SPUindirect x, c1), c2)       -> (SPUindirect x, c1 + c2)
    - ({any,zero,sign}_extend (SPUvec2prefslot v)) -> the extract itself,
      when the extended type equals the extract's result type
    - (SPUindirect (SPUaform addr, 0), 0) -> (SPUaform addr, 0), only when
      not compiling for large memory
    - (SPUindirect (add x, y), 0)         -> (SPUindirect x, y)
    - SPU shift/rotate of a vector by 0   -> the unshifted operand
    - (SPUprefslot2vec (SPUvec2prefslot v))            -> v
    - (SPUprefslot2vec (*_extend (SPUvec2prefslot v))) -> v, if types match

  Returns the replacement value, or an empty SDValue to leave N unchanged.
 */
SDValue
SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
{
#if 0
  TargetMachine &TM = getTargetMachine();
#endif
  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
  EVT NodeVT = N->getValueType(0);      // The node's value type
  EVT Op0VT = Op0.getValueType();       // The first operand's result
  SDValue Result;                       // Initially, empty result
  DebugLoc dl = N->getDebugLoc();

  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD: {
    SDValue Op1 = N->getOperand(1);

    if (Op0.getOpcode() == SPUISD::IndirectAddr
        || Op1.getOpcode() == SPUISD::IndirectAddr) {
      // Normalize the operands to reduce repeated code
      SDValue IndirectArg = Op0, AddArg = Op1;

      if (Op1.getOpcode() == SPUISD::IndirectAddr) {
        IndirectArg = Op1;
        AddArg = Op0;
      }

      if (isa<ConstantSDNode>(AddArg)) {
        ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
        SDValue IndOp1 = IndirectArg.getOperand(1);

        if (CN0->isNullValue()) {
          // (add (SPUindirect <arg>, <arg>), 0) ->
          // (SPUindirect <arg>, <arg>)

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return IndirectArg;
        } else if (isa<ConstantSDNode>(IndOp1)) {
          // (add (SPUindirect <arg>, <const>), <const>) ->
          // (SPUindirect <arg>, <const + const>)
          ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
          int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
          SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
                 << "), " << CN0->getSExtValue() << ")\n"
                 << "With:    (SPUindirect <arg>, "
                 << combinedConst << ")\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             IndirectArg, combinedValue);
        }
      }
    }
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: {
    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
      // (any_extend (SPUextract_elt0 <arg>)) ->
      // (SPUextract_elt0 <arg>)
      // Types must match, however...
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "\nReplace: ";
        N->dump(&DAG);
        errs() << "\nWith:    ";
        Op0.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

      return Op0;
    }
    break;
  }
  case SPUISD::IndirectAddr: {
    if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
      if (CN != 0 && CN->isNullValue()) {
        // (SPUindirect (SPUaform <addr>, 0), 0) ->
        // (SPUaform <addr>, 0)

        DEBUG(errs() << "Replace: ");
        DEBUG(N->dump(&DAG));
        DEBUG(errs() << "\nWith:    ");
        DEBUG(Op0.getNode()->dump(&DAG));
        DEBUG(errs() << "\n");

        return Op0;
      }
    } else if (Op0.getOpcode() == ISD::ADD) {
      SDValue Op1 = N->getOperand(1);
      if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
        // (SPUindirect (add <arg>, <arg>), 0) ->
        // (SPUindirect <arg>, <arg>)
        if (CN1->isNullValue()) {

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             Op0.getOperand(0), Op0.getOperand(1));
        }
      }
    }
    break;
  }
  case SPUISD::SHL_BITS:
  case SPUISD::SHL_BYTES:
  case SPUISD::ROTBYTES_LEFT: {
    SDValue Op1 = N->getOperand(1);

    // Kill degenerate vector shifts:
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
      if (CN->isNullValue()) {
        Result = Op0;
      }
    }
    break;
  }
  case SPUISD::PREFSLOT2VEC: {
    switch (Op0.getOpcode()) {
    default:
      break;
    case ISD::ANY_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND: {
      // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
      // <arg>
      // but only if the SPUprefslot2vec and <arg> types match.
      SDValue Op00 = Op0.getOperand(0);
      if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
        SDValue Op000 = Op00.getOperand(0);
        if (Op000.getValueType() == NodeVT) {
          Result = Op000;
        }
      }
      break;
    }
    case SPUISD::VEC2PREFSLOT: {
      // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
      // <arg>
      Result = Op0.getOperand(0);
      break;
    }
    }
    break;
  }
  }

  // Otherwise, return unchanged.
#ifndef NDEBUG
  if (Result.getNode()) {
    DEBUG(errs() << "\nReplace.SPU: ");
    DEBUG(N->dump(&DAG));
    DEBUG(errs() << "\nWith:        ");
    DEBUG(Result.getNode()->dump(&DAG));
    DEBUG(errs() << "\n");
  }
#endif

  return Result;
}
   3091 
   3092 //===----------------------------------------------------------------------===//
   3093 // Inline Assembly Support
   3094 //===----------------------------------------------------------------------===//
   3095 
   3096 /// getConstraintType - Given a constraint letter, return the type of
   3097 /// constraint it is for this target.
   3098 SPUTargetLowering::ConstraintType
   3099 SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
   3100   if (ConstraintLetter.size() == 1) {
   3101     switch (ConstraintLetter[0]) {
   3102     default: break;
   3103     case 'b':
   3104     case 'r':
   3105     case 'f':
   3106     case 'v':
   3107     case 'y':
   3108       return C_RegisterClass;
   3109     }
   3110   }
   3111   return TargetLowering::getConstraintType(ConstraintLetter);
   3112 }
   3113 
   3114 /// Examine constraint type and operand type and determine a weight value.
   3115 /// This object must already have been set up with the operand type
   3116 /// and the current alternative constraint selected.
   3117 TargetLowering::ConstraintWeight
   3118 SPUTargetLowering::getSingleConstraintMatchWeight(
   3119     AsmOperandInfo &info, const char *constraint) const {
   3120   ConstraintWeight weight = CW_Invalid;
   3121   Value *CallOperandVal = info.CallOperandVal;
   3122     // If we don't have a value, we can't do a match,
   3123     // but allow it at the lowest weight.
   3124   if (CallOperandVal == NULL)
   3125     return CW_Default;
   3126   // Look at the constraint type.
   3127   switch (*constraint) {
   3128   default:
   3129     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   3130     break;
   3131     //FIXME: Seems like the supported constraint letters were just copied
   3132     // from PPC, as the following doesn't correspond to the GCC docs.
   3133     // I'm leaving it so until someone adds the corresponding lowering support.
   3134   case 'b':
   3135   case 'r':
   3136   case 'f':
   3137   case 'd':
   3138   case 'v':
   3139   case 'y':
   3140     weight = CW_Register;
   3141     break;
   3142   }
   3143   return weight;
   3144 }
   3145 
   3146 std::pair<unsigned, const TargetRegisterClass*>
   3147 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   3148                                                 EVT VT) const
   3149 {
   3150   if (Constraint.size() == 1) {
   3151     // GCC RS6000 Constraint Letters
   3152     switch (Constraint[0]) {
   3153     case 'b':   // R1-R31
   3154     case 'r':   // R0-R31
   3155       if (VT == MVT::i64)
   3156         return std::make_pair(0U, SPU::R64CRegisterClass);
   3157       return std::make_pair(0U, SPU::R32CRegisterClass);
   3158     case 'f':
   3159       if (VT == MVT::f32)
   3160         return std::make_pair(0U, SPU::R32FPRegisterClass);
   3161       else if (VT == MVT::f64)
   3162         return std::make_pair(0U, SPU::R64FPRegisterClass);
   3163       break;
   3164     case 'v':
   3165       return std::make_pair(0U, SPU::GPRCRegisterClass);
   3166     }
   3167   }
   3168 
   3169   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
   3170 }
   3171 
   3172 //! Compute used/known bits for a SPU operand
   3173 void
   3174 SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   3175                                                   const APInt &Mask,
   3176                                                   APInt &KnownZero,
   3177                                                   APInt &KnownOne,
   3178                                                   const SelectionDAG &DAG,
   3179                                                   unsigned Depth ) const {
   3180 #if 0
   3181   const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
   3182 
   3183   switch (Op.getOpcode()) {
   3184   default:
   3185     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
   3186     break;
   3187   case CALL:
   3188   case SHUFB:
   3189   case SHUFFLE_MASK:
   3190   case CNTB:
   3191   case SPUISD::PREFSLOT2VEC:
   3192   case SPUISD::LDRESULT:
   3193   case SPUISD::VEC2PREFSLOT:
   3194   case SPUISD::SHLQUAD_L_BITS:
   3195   case SPUISD::SHLQUAD_L_BYTES:
   3196   case SPUISD::VEC_ROTL:
   3197   case SPUISD::VEC_ROTR:
   3198   case SPUISD::ROTBYTES_LEFT:
   3199   case SPUISD::SELECT_MASK:
   3200   case SPUISD::SELB:
   3201   }
   3202 #endif
   3203 }
   3204 
   3205 unsigned
   3206 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
   3207                                                    unsigned Depth) const {
   3208   switch (Op.getOpcode()) {
   3209   default:
   3210     return 1;
   3211 
   3212   case ISD::SETCC: {
   3213     EVT VT = Op.getValueType();
   3214 
   3215     if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
   3216       VT = MVT::i32;
   3217     }
   3218     return VT.getSizeInBits();
   3219   }
   3220   }
   3221 }
   3222 
   3223 // LowerAsmOperandForConstraint
   3224 void
   3225 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   3226                                                 std::string &Constraint,
   3227                                                 std::vector<SDValue> &Ops,
   3228                                                 SelectionDAG &DAG) const {
   3229   // Default, for the time being, to the base class handler
   3230   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   3231 }
   3232 
   3233 /// isLegalAddressImmediate - Return true if the integer value can be used
   3234 /// as the offset of the target addressing mode.
   3235 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
   3236                                                 Type *Ty) const {
   3237   // SPU's addresses are 256K:
   3238   return (V > -(1 << 18) && V < (1 << 18) - 1);
   3239 }
   3240 
   3241 bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
   3242   return false;
   3243 }
   3244 
   3245 bool
   3246 SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   3247   // The SPU target isn't yet aware of offsets.
   3248   return false;
   3249 }
   3250 
   3251 // can we compare to Imm without writing it into a register?
   3252 bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   3253   //ceqi, cgti, etc. all take s10 operand
   3254   return isInt<10>(Imm);
   3255 }
   3256 
   3257 bool
   3258 SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
   3259                                          Type * ) const{
   3260 
   3261   // A-form: 18bit absolute address.
   3262   if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
   3263     return true;
   3264 
   3265   // D-form: reg + 14bit offset
   3266   if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
   3267     return true;
   3268 
   3269   // X-form: reg+reg
   3270   if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
   3271     return true;
   3272 
   3273   return false;
   3274 }
   3275