      1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file implements the AArch64TargetLowering class.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "AArch64ISelLowering.h"
     15 #include "AArch64PerfectShuffle.h"
     16 #include "AArch64Subtarget.h"
     17 #include "AArch64MachineFunctionInfo.h"
     18 #include "AArch64TargetMachine.h"
     19 #include "AArch64TargetObjectFile.h"
     20 #include "MCTargetDesc/AArch64AddressingModes.h"
     21 #include "llvm/ADT/Statistic.h"
     22 #include "llvm/CodeGen/CallingConvLower.h"
     23 #include "llvm/CodeGen/MachineFrameInfo.h"
     24 #include "llvm/CodeGen/MachineInstrBuilder.h"
     25 #include "llvm/CodeGen/MachineRegisterInfo.h"
     26 #include "llvm/IR/Function.h"
     27 #include "llvm/IR/Intrinsics.h"
     28 #include "llvm/IR/Type.h"
     29 #include "llvm/Support/CommandLine.h"
     30 #include "llvm/Support/Debug.h"
     31 #include "llvm/Support/ErrorHandling.h"
     32 #include "llvm/Support/raw_ostream.h"
     33 #include "llvm/Target/TargetOptions.h"
     34 using namespace llvm;
     35 
     36 #define DEBUG_TYPE "aarch64-lower"
     37 
     38 STATISTIC(NumTailCalls, "Number of tail calls");
     39 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
     40 
     41 enum AlignMode {
     42   StrictAlign,
     43   NoStrictAlign
     44 };
     45 
     46 static cl::opt<AlignMode>
     47 Align(cl::desc("Load/store alignment support"),
     48       cl::Hidden, cl::init(NoStrictAlign),
     49       cl::values(
     50           clEnumValN(StrictAlign,   "aarch64-strict-align",
     51                      "Disallow all unaligned memory accesses"),
     52           clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
     53                      "Allow unaligned memory accesses"),
     54           clEnumValEnd));
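        // Usage sketch (the flag names come from the clEnumValN entries above), e.g.
        // with the llc driver:
        //   llc -mtriple=aarch64 -aarch64-strict-align     -- forbid all unaligned accesses
        //   llc -mtriple=aarch64 -aarch64-no-strict-align  -- allow them (the default here)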
     55 
     56 // Placeholder until EXTR generation is fully tested.
     57 static cl::opt<bool>
     58 EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
     59                           cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
     60                           cl::init(true));
     61 
     62 static cl::opt<bool>
     63 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
     64                          cl::desc("Allow AArch64 SLI/SRI formation"),
     65                          cl::init(false));
     66 
     67 //===----------------------------------------------------------------------===//
     68 // AArch64 Lowering public interface.
     69 //===----------------------------------------------------------------------===//
     70 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
     71   if (TT.isOSBinFormatMachO())
     72     return new AArch64_MachoTargetObjectFile();
     73 
     74   return new AArch64_ELFTargetObjectFile();
     75 }
     76 
     77 AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
     78     : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
     79   Subtarget = &TM.getSubtarget<AArch64Subtarget>();
     80 
     81   // AArch64 has no instructions that set a GPR from a comparison (i.e. no
     82   // setcc), so we have to make something up. Arbitrarily, choose ZeroOrOne.
     83   setBooleanContents(ZeroOrOneBooleanContent);
     84   // When comparing vectors the result sets the different elements in the
     85   // vector to all-one or all-zero.
     86   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
     87 
     88   // Set up the register classes.
     89   addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
     90   addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
     91 
     92   if (Subtarget->hasFPARMv8()) {
     93     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
     94     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
     95     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
     96     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
     97   }
     98 
     99   if (Subtarget->hasNEON()) {
    100     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    101     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    102     // Someone set us up the NEON.
    103     addDRTypeForNEON(MVT::v2f32);
    104     addDRTypeForNEON(MVT::v8i8);
    105     addDRTypeForNEON(MVT::v4i16);
    106     addDRTypeForNEON(MVT::v2i32);
    107     addDRTypeForNEON(MVT::v1i64);
    108     addDRTypeForNEON(MVT::v1f64);
    109 
    110     addQRTypeForNEON(MVT::v4f32);
    111     addQRTypeForNEON(MVT::v2f64);
    112     addQRTypeForNEON(MVT::v16i8);
    113     addQRTypeForNEON(MVT::v8i16);
    114     addQRTypeForNEON(MVT::v4i32);
    115     addQRTypeForNEON(MVT::v2i64);
    116   }
    117 
    118   // Compute derived properties from the register classes
    119   computeRegisterProperties();
    120 
    121   // Provide all sorts of operation actions
    122   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    123   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
    124   setOperationAction(ISD::SETCC, MVT::i32, Custom);
    125   setOperationAction(ISD::SETCC, MVT::i64, Custom);
    126   setOperationAction(ISD::SETCC, MVT::f32, Custom);
    127   setOperationAction(ISD::SETCC, MVT::f64, Custom);
    128   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
    129   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
    130   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
    131   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
    132   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
    133   setOperationAction(ISD::SELECT, MVT::i32, Custom);
    134   setOperationAction(ISD::SELECT, MVT::i64, Custom);
    135   setOperationAction(ISD::SELECT, MVT::f32, Custom);
    136   setOperationAction(ISD::SELECT, MVT::f64, Custom);
    137   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
    138   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
    139   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
    140   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
    141   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
    142   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    143 
    144   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    145   setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    146   setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
    147 
    148   setOperationAction(ISD::FREM, MVT::f32, Expand);
    149   setOperationAction(ISD::FREM, MVT::f64, Expand);
    150   setOperationAction(ISD::FREM, MVT::f80, Expand);
    151 
    152   // Custom lowering hooks are needed for XOR
    153   // to fold it into CSINC/CSINV.
    154   setOperationAction(ISD::XOR, MVT::i32, Custom);
    155   setOperationAction(ISD::XOR, MVT::i64, Custom);
    156 
    157   // Virtually no operation on f128 is legal, but LLVM can't expand them when
    158   // there's a valid register class, so we need custom operations in most cases.
    159   setOperationAction(ISD::FABS, MVT::f128, Expand);
    160   setOperationAction(ISD::FADD, MVT::f128, Custom);
    161   setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
    162   setOperationAction(ISD::FCOS, MVT::f128, Expand);
    163   setOperationAction(ISD::FDIV, MVT::f128, Custom);
    164   setOperationAction(ISD::FMA, MVT::f128, Expand);
    165   setOperationAction(ISD::FMUL, MVT::f128, Custom);
    166   setOperationAction(ISD::FNEG, MVT::f128, Expand);
    167   setOperationAction(ISD::FPOW, MVT::f128, Expand);
    168   setOperationAction(ISD::FREM, MVT::f128, Expand);
    169   setOperationAction(ISD::FRINT, MVT::f128, Expand);
    170   setOperationAction(ISD::FSIN, MVT::f128, Expand);
    171   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
    172   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
    173   setOperationAction(ISD::FSUB, MVT::f128, Custom);
    174   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
    175   setOperationAction(ISD::SETCC, MVT::f128, Custom);
    176   setOperationAction(ISD::BR_CC, MVT::f128, Custom);
    177   setOperationAction(ISD::SELECT, MVT::f128, Custom);
    178   setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
    179   setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
    180 
    181   // Lowering for many of the conversions is actually specified by the non-f128
    182   // type. The LowerXXX function will be trivial when f128 isn't involved.
    183   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    184   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    185   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
    186   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    187   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    188   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
    189   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    190   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    191   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
    192   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    193   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    194   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
    195   setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    196   setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
    197 
    198   // Variable arguments.
    199   setOperationAction(ISD::VASTART, MVT::Other, Custom);
    200   setOperationAction(ISD::VAARG, MVT::Other, Custom);
    201   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
    202   setOperationAction(ISD::VAEND, MVT::Other, Expand);
    203 
    204   // Variable-sized objects.
    205   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
    206   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
    207   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
    208 
    209   // Exception handling.
    210   // FIXME: These are guesses. Has this been defined yet?
    211   setExceptionPointerRegister(AArch64::X0);
    212   setExceptionSelectorRegister(AArch64::X1);
    213 
    214   // Constant pool entries
    215   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    216 
    217   // BlockAddress
    218   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
    219 
    220   // Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependences.
    221   setOperationAction(ISD::ADDC, MVT::i32, Custom);
    222   setOperationAction(ISD::ADDE, MVT::i32, Custom);
    223   setOperationAction(ISD::SUBC, MVT::i32, Custom);
    224   setOperationAction(ISD::SUBE, MVT::i32, Custom);
    225   setOperationAction(ISD::ADDC, MVT::i64, Custom);
    226   setOperationAction(ISD::ADDE, MVT::i64, Custom);
    227   setOperationAction(ISD::SUBC, MVT::i64, Custom);
    228   setOperationAction(ISD::SUBE, MVT::i64, Custom);
    229 
    230   // AArch64 lacks both left-rotate and popcount instructions.
    231   setOperationAction(ISD::ROTL, MVT::i32, Expand);
    232   setOperationAction(ISD::ROTL, MVT::i64, Expand);
    233 
    234   // AArch64 doesn't have {U|S}MUL_LOHI.
    235   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
    236   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
    237 
    238 
    239   // Expand the undefined-at-zero variants of cttz/ctlz to their defined-at-zero
    240   // counterparts, which AArch64 supports directly.
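          // (ctlz maps onto CLZ directly; cttz is typically selected as RBIT + CLZ.)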
    241   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    242   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    243   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
    244   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
    245 
    246   setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    247   setOperationAction(ISD::CTPOP, MVT::i64, Custom);
    248 
    249   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    250   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
    251   setOperationAction(ISD::SREM, MVT::i32, Expand);
    252   setOperationAction(ISD::SREM, MVT::i64, Expand);
    253   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
    254   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
    255   setOperationAction(ISD::UREM, MVT::i32, Expand);
    256   setOperationAction(ISD::UREM, MVT::i64, Expand);
    257 
    258   // Custom lower Add/Sub/Mul with overflow.
    259   setOperationAction(ISD::SADDO, MVT::i32, Custom);
    260   setOperationAction(ISD::SADDO, MVT::i64, Custom);
    261   setOperationAction(ISD::UADDO, MVT::i32, Custom);
    262   setOperationAction(ISD::UADDO, MVT::i64, Custom);
    263   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
    264   setOperationAction(ISD::SSUBO, MVT::i64, Custom);
    265   setOperationAction(ISD::USUBO, MVT::i32, Custom);
    266   setOperationAction(ISD::USUBO, MVT::i64, Custom);
    267   setOperationAction(ISD::SMULO, MVT::i32, Custom);
    268   setOperationAction(ISD::SMULO, MVT::i64, Custom);
    269   setOperationAction(ISD::UMULO, MVT::i32, Custom);
    270   setOperationAction(ISD::UMULO, MVT::i64, Custom);
    271 
    272   setOperationAction(ISD::FSIN, MVT::f32, Expand);
    273   setOperationAction(ISD::FSIN, MVT::f64, Expand);
    274   setOperationAction(ISD::FCOS, MVT::f32, Expand);
    275   setOperationAction(ISD::FCOS, MVT::f64, Expand);
    276   setOperationAction(ISD::FPOW, MVT::f32, Expand);
    277   setOperationAction(ISD::FPOW, MVT::f64, Expand);
    278   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    279   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    280 
    281   // AArch64 has implementations of a lot of rounding-like FP operations.
    282   static MVT RoundingTypes[] = { MVT::f32, MVT::f64 };
    283   for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
    284     MVT Ty = RoundingTypes[I];
    285     setOperationAction(ISD::FFLOOR, Ty, Legal);
    286     setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    287     setOperationAction(ISD::FCEIL, Ty, Legal);
    288     setOperationAction(ISD::FRINT, Ty, Legal);
    289     setOperationAction(ISD::FTRUNC, Ty, Legal);
    290     setOperationAction(ISD::FROUND, Ty, Legal);
    291   }
    292 
    293   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
    294 
    295   if (Subtarget->isTargetMachO()) {
    296     // For iOS, we don't want the normal expansion of a libcall to
    297     // sincos. We want to issue a libcall to __sincos_stret to avoid memory
    298     // traffic.
    299     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    300     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    301   } else {
    302     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    303     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    304   }
    305 
    306   // AArch64 does not have floating-point extending loads, i1 sign-extending
    307   // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
    308   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
    309   setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
    310   setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
    311   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
    312   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
    313   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    314   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
    315   setTruncStoreAction(MVT::f128, MVT::f80, Expand);
    316   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
    317   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
    318   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
    319   // Indexed loads and stores are supported.
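          // For example, the pre- and post-indexed forms
          //   ldr x0, [x1, #8]!   (pre-indexed)
          //   ldr x0, [x1], #8    (post-indexed)
          // can then be selected instead of a separate address update.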
    320   for (unsigned im = (unsigned)ISD::PRE_INC;
    321        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    322     setIndexedLoadAction(im, MVT::i8, Legal);
    323     setIndexedLoadAction(im, MVT::i16, Legal);
    324     setIndexedLoadAction(im, MVT::i32, Legal);
    325     setIndexedLoadAction(im, MVT::i64, Legal);
    326     setIndexedLoadAction(im, MVT::f64, Legal);
    327     setIndexedLoadAction(im, MVT::f32, Legal);
    328     setIndexedStoreAction(im, MVT::i8, Legal);
    329     setIndexedStoreAction(im, MVT::i16, Legal);
    330     setIndexedStoreAction(im, MVT::i32, Legal);
    331     setIndexedStoreAction(im, MVT::i64, Legal);
    332     setIndexedStoreAction(im, MVT::f64, Legal);
    333     setIndexedStoreAction(im, MVT::f32, Legal);
    334   }
    335 
    336   // Trap.
    337   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    338 
    339   // We combine OR nodes for bitfield operations.
    340   setTargetDAGCombine(ISD::OR);
    341 
    342   // Vector add and sub nodes may conceal a high-half opportunity.
    343   // Also, try to fold ADD into CSINC/CSINV.
    344   setTargetDAGCombine(ISD::ADD);
    345   setTargetDAGCombine(ISD::SUB);
    346 
    347   setTargetDAGCombine(ISD::XOR);
    348   setTargetDAGCombine(ISD::SINT_TO_FP);
    349   setTargetDAGCombine(ISD::UINT_TO_FP);
    350 
    351   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    352 
    353   setTargetDAGCombine(ISD::ANY_EXTEND);
    354   setTargetDAGCombine(ISD::ZERO_EXTEND);
    355   setTargetDAGCombine(ISD::SIGN_EXTEND);
    356   setTargetDAGCombine(ISD::BITCAST);
    357   setTargetDAGCombine(ISD::CONCAT_VECTORS);
    358   setTargetDAGCombine(ISD::STORE);
    359 
    360   setTargetDAGCombine(ISD::MUL);
    361 
    362   setTargetDAGCombine(ISD::SELECT);
    363   setTargetDAGCombine(ISD::VSELECT);
    364 
    365   setTargetDAGCombine(ISD::INTRINSIC_VOID);
    366   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    367   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    368 
    369   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
    370   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
    371   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
    372 
    373   setStackPointerRegisterToSaveRestore(AArch64::SP);
    374 
    375   setSchedulingPreference(Sched::Hybrid);
    376 
    377   // Enable TBZ/TBNZ
    378   MaskAndBranchFoldingIsLegal = true;
    379 
    380   setMinFunctionAlignment(2);
    381 
    382   RequireStrictAlign = (Align == StrictAlign);
    383 
    384   setHasExtractBitsInsn(true);
    385 
    386   if (Subtarget->hasNEON()) {
    387     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    388     // silliness like this:
    389     setOperationAction(ISD::FABS, MVT::v1f64, Expand);
    390     setOperationAction(ISD::FADD, MVT::v1f64, Expand);
    391     setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
    392     setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
    393     setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
    394     setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
    395     setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
    396     setOperationAction(ISD::FMA, MVT::v1f64, Expand);
    397     setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
    398     setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
    399     setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
    400     setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
    401     setOperationAction(ISD::FREM, MVT::v1f64, Expand);
    402     setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
    403     setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
    404     setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
    405     setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
    406     setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
    407     setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
    408     setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
    409     setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
    410     setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
    411     setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
    412     setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
    413     setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
    414 
    415     setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
    416     setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
    417     setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
    418     setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    419     setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
    420 
    421     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    422 
    423     // AArch64 doesn't have direct vector->f32 conversion instructions for
    424     // elements smaller than i32, so promote the input to i32 first.
    425     setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
    426     setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
    427     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
    428     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
    429     // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    430     setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    431     setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    432     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    433     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
    434 
    435     // AArch64 doesn't have MUL.2d:
    436     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    437     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    438     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    439     // Likewise, narrowing and extending vector loads/stores aren't handled
    440     // directly.
    441     for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
    442          VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    443 
    444       setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
    445                          Expand);
    446 
    447       setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
    448       setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    449       setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
    450       setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    451 
    452       setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    453 
    454       for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
    455            InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
    456         setTruncStoreAction((MVT::SimpleValueType)VT,
    457                             (MVT::SimpleValueType)InnerVT, Expand);
    458       setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    459       setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    460       setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
    461     }
    462 
    463     // AArch64 has implementations of a lot of rounding-like FP operations.
    464     static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 };
    465     for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
    466       MVT Ty = RoundingVecTypes[I];
    467       setOperationAction(ISD::FFLOOR, Ty, Legal);
    468       setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    469       setOperationAction(ISD::FCEIL, Ty, Legal);
    470       setOperationAction(ISD::FRINT, Ty, Legal);
    471       setOperationAction(ISD::FTRUNC, Ty, Legal);
    472       setOperationAction(ISD::FROUND, Ty, Legal);
    473     }
    474   }
    475 
    476   // Prefer likely predicted branches to selects on out-of-order cores.
    477   if (Subtarget->isCortexA57())
    478     PredictableSelectIsExpensive = true;
    479 }
    480 
    481 void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
    482   if (VT == MVT::v2f32) {
    483     setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    484     AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
    485 
    486     setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    487     AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
    488   } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    489     setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    490     AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
    491 
    492     setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    493     AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
    494   }
    495 
    496   // Mark vector float intrinsics as expand.
    497   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    498     setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
    499     setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
    500     setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
    501     setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
    502     setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
    503     setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
    504     setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
    505     setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
    506     setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
    507   }
    508 
    509   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
    510   setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
    511   setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
    512   setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
    513   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
    514   setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
    515   setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
    516   setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
    517   setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
    518   setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
    519   setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
    520   setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
    521 
    522   setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
    523   setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
    524   setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
    525   setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
    526 
    527   // CNT supports only B element sizes.
    528   if (VT != MVT::v8i8 && VT != MVT::v16i8)
    529     setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
    530 
    531   setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
    532   setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
    533   setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
    534   setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
    535   setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
    536 
    537   setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
    538   setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
    539 
    540   if (Subtarget->isLittleEndian()) {
    541     for (unsigned im = (unsigned)ISD::PRE_INC;
    542          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    543       setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
    544       setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
    545     }
    546   }
    547 }
    548 
    549 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
    550   addRegisterClass(VT, &AArch64::FPR64RegClass);
    551   addTypeForNEON(VT, MVT::v2i32);
    552 }
    553 
    554 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
    555   addRegisterClass(VT, &AArch64::FPR128RegClass);
    556   addTypeForNEON(VT, MVT::v4i32);
    557 }
    558 
    559 EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
    560   if (!VT.isVector())
    561     return MVT::i32;
    562   return VT.changeVectorElementTypeToInteger();
    563 }
    564 
    565 /// computeKnownBitsForTargetNode - Determine which bits of Op are known to
    566 /// be either zero or one and return them in the
    567 /// KnownZero/KnownOne bitsets.
    568 void AArch64TargetLowering::computeKnownBitsForTargetNode(
    569     const SDValue Op, APInt &KnownZero, APInt &KnownOne,
    570     const SelectionDAG &DAG, unsigned Depth) const {
    571   switch (Op.getOpcode()) {
    572   default:
    573     break;
    574   case AArch64ISD::CSEL: {
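            // CSEL produces one of its two operands, so only the bits known in both
            // operands (the intersection computed below) are known in the result.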
    575     APInt KnownZero2, KnownOne2;
    576     DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
    577     DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
    578     KnownZero &= KnownZero2;
    579     KnownOne &= KnownOne2;
    580     break;
    581   }
    582   case ISD::INTRINSIC_W_CHAIN: {
    583     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    584     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    585     switch (IntID) {
    586     default: return;
    587     case Intrinsic::aarch64_ldaxr:
    588     case Intrinsic::aarch64_ldxr: {
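              // Only MemBits bits of the result come from memory; the load
              // zero-extends, so e.g. an i8 ldxr returning an i64 leaves the top 56
              // bits known zero.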
    589       unsigned BitWidth = KnownOne.getBitWidth();
    590       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
    591       unsigned MemBits = VT.getScalarType().getSizeInBits();
    592       KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
    593       return;
    594     }
    595     }
    596     break;
    597   }
    598   case ISD::INTRINSIC_WO_CHAIN:
    599   case ISD::INTRINSIC_VOID: {
    600     unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    601     switch (IntNo) {
    602     default:
    603       break;
    604     case Intrinsic::aarch64_neon_umaxv:
    605     case Intrinsic::aarch64_neon_uminv: {
    606       // Figure out the datatype of the vector operand. The UMINV instruction
    607       // will zero extend the result, so we can mark as known zero all the
    608       // bits larger than the element datatype. 32-bit or larget doesn't need
    609       // bits larger than the element datatype. 32-bit or larger doesn't need
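              // For example, umaxv on a v8i8 operand produces an i32 result whose top
              // 24 bits are known zero; for v4i16/v8i16 the top 16 bits are known zero.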
    610       MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
    611       unsigned BitWidth = KnownZero.getBitWidth();
    612       if (VT == MVT::v8i8 || VT == MVT::v16i8) {
    613         assert(BitWidth >= 8 && "Unexpected width!");
    614         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
    615         KnownZero |= Mask;
    616       } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
    617         assert(BitWidth >= 16 && "Unexpected width!");
    618         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
    619         KnownZero |= Mask;
    620       }
    621       break;
    622     }
    623     }
    624   }
    625   }
    626 }
    627 
    628 MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
    629   return MVT::i64;
    630 }
    631 
    632 unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
    633   // FIXME: On AArch64, this depends on the type.
    634   // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(),
    635   // and the offset has to be a multiple of the related size in bytes.
    636   return 4095;
    637 }
    638 
    639 FastISel *
    640 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
    641                                       const TargetLibraryInfo *libInfo) const {
    642   return AArch64::createFastISel(funcInfo, libInfo);
    643 }
    644 
    645 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    646   switch (Opcode) {
    647   default:
    648     return nullptr;
    649   case AArch64ISD::CALL:              return "AArch64ISD::CALL";
    650   case AArch64ISD::ADRP:              return "AArch64ISD::ADRP";
    651   case AArch64ISD::ADDlow:            return "AArch64ISD::ADDlow";
    652   case AArch64ISD::LOADgot:           return "AArch64ISD::LOADgot";
    653   case AArch64ISD::RET_FLAG:          return "AArch64ISD::RET_FLAG";
    654   case AArch64ISD::BRCOND:            return "AArch64ISD::BRCOND";
    655   case AArch64ISD::CSEL:              return "AArch64ISD::CSEL";
    656   case AArch64ISD::FCSEL:             return "AArch64ISD::FCSEL";
    657   case AArch64ISD::CSINV:             return "AArch64ISD::CSINV";
    658   case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
    659   case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
    660   case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
    661   case AArch64ISD::TLSDESC_CALL:      return "AArch64ISD::TLSDESC_CALL";
    662   case AArch64ISD::ADC:               return "AArch64ISD::ADC";
    663   case AArch64ISD::SBC:               return "AArch64ISD::SBC";
    664   case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
    665   case AArch64ISD::SUBS:              return "AArch64ISD::SUBS";
    666   case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
    667   case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
    668   case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
    669   case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
    670   case AArch64ISD::FMIN:              return "AArch64ISD::FMIN";
    671   case AArch64ISD::FMAX:              return "AArch64ISD::FMAX";
    672   case AArch64ISD::DUP:               return "AArch64ISD::DUP";
    673   case AArch64ISD::DUPLANE8:          return "AArch64ISD::DUPLANE8";
    674   case AArch64ISD::DUPLANE16:         return "AArch64ISD::DUPLANE16";
    675   case AArch64ISD::DUPLANE32:         return "AArch64ISD::DUPLANE32";
    676   case AArch64ISD::DUPLANE64:         return "AArch64ISD::DUPLANE64";
    677   case AArch64ISD::MOVI:              return "AArch64ISD::MOVI";
    678   case AArch64ISD::MOVIshift:         return "AArch64ISD::MOVIshift";
    679   case AArch64ISD::MOVIedit:          return "AArch64ISD::MOVIedit";
    680   case AArch64ISD::MOVImsl:           return "AArch64ISD::MOVImsl";
    681   case AArch64ISD::FMOV:              return "AArch64ISD::FMOV";
    682   case AArch64ISD::MVNIshift:         return "AArch64ISD::MVNIshift";
    683   case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
    684   case AArch64ISD::BICi:              return "AArch64ISD::BICi";
    685   case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
    686   case AArch64ISD::BSL:               return "AArch64ISD::BSL";
    687   case AArch64ISD::NEG:               return "AArch64ISD::NEG";
    688   case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
    689   case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
    690   case AArch64ISD::ZIP2:              return "AArch64ISD::ZIP2";
    691   case AArch64ISD::UZP1:              return "AArch64ISD::UZP1";
    692   case AArch64ISD::UZP2:              return "AArch64ISD::UZP2";
    693   case AArch64ISD::TRN1:              return "AArch64ISD::TRN1";
    694   case AArch64ISD::TRN2:              return "AArch64ISD::TRN2";
    695   case AArch64ISD::REV16:             return "AArch64ISD::REV16";
    696   case AArch64ISD::REV32:             return "AArch64ISD::REV32";
    697   case AArch64ISD::REV64:             return "AArch64ISD::REV64";
    698   case AArch64ISD::EXT:               return "AArch64ISD::EXT";
    699   case AArch64ISD::VSHL:              return "AArch64ISD::VSHL";
    700   case AArch64ISD::VLSHR:             return "AArch64ISD::VLSHR";
    701   case AArch64ISD::VASHR:             return "AArch64ISD::VASHR";
    702   case AArch64ISD::CMEQ:              return "AArch64ISD::CMEQ";
    703   case AArch64ISD::CMGE:              return "AArch64ISD::CMGE";
    704   case AArch64ISD::CMGT:              return "AArch64ISD::CMGT";
    705   case AArch64ISD::CMHI:              return "AArch64ISD::CMHI";
    706   case AArch64ISD::CMHS:              return "AArch64ISD::CMHS";
    707   case AArch64ISD::FCMEQ:             return "AArch64ISD::FCMEQ";
    708   case AArch64ISD::FCMGE:             return "AArch64ISD::FCMGE";
    709   case AArch64ISD::FCMGT:             return "AArch64ISD::FCMGT";
    710   case AArch64ISD::CMEQz:             return "AArch64ISD::CMEQz";
    711   case AArch64ISD::CMGEz:             return "AArch64ISD::CMGEz";
    712   case AArch64ISD::CMGTz:             return "AArch64ISD::CMGTz";
    713   case AArch64ISD::CMLEz:             return "AArch64ISD::CMLEz";
    714   case AArch64ISD::CMLTz:             return "AArch64ISD::CMLTz";
    715   case AArch64ISD::FCMEQz:            return "AArch64ISD::FCMEQz";
    716   case AArch64ISD::FCMGEz:            return "AArch64ISD::FCMGEz";
    717   case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
    718   case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
    719   case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
    720   case AArch64ISD::NOT:               return "AArch64ISD::NOT";
    721   case AArch64ISD::BIT:               return "AArch64ISD::BIT";
    722   case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
    723   case AArch64ISD::CBNZ:              return "AArch64ISD::CBNZ";
    724   case AArch64ISD::TBZ:               return "AArch64ISD::TBZ";
    725   case AArch64ISD::TBNZ:              return "AArch64ISD::TBNZ";
    726   case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
    727   case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
    728   case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
    729   case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
    730   case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
    731   case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
    732   case AArch64ISD::URSHR_I:           return "AArch64ISD::URSHR_I";
    733   case AArch64ISD::SQSHLU_I:          return "AArch64ISD::SQSHLU_I";
    734   case AArch64ISD::WrapperLarge:      return "AArch64ISD::WrapperLarge";
    735   case AArch64ISD::LD2post:           return "AArch64ISD::LD2post";
    736   case AArch64ISD::LD3post:           return "AArch64ISD::LD3post";
    737   case AArch64ISD::LD4post:           return "AArch64ISD::LD4post";
    738   case AArch64ISD::ST2post:           return "AArch64ISD::ST2post";
    739   case AArch64ISD::ST3post:           return "AArch64ISD::ST3post";
    740   case AArch64ISD::ST4post:           return "AArch64ISD::ST4post";
    741   case AArch64ISD::LD1x2post:         return "AArch64ISD::LD1x2post";
    742   case AArch64ISD::LD1x3post:         return "AArch64ISD::LD1x3post";
    743   case AArch64ISD::LD1x4post:         return "AArch64ISD::LD1x4post";
    744   case AArch64ISD::ST1x2post:         return "AArch64ISD::ST1x2post";
    745   case AArch64ISD::ST1x3post:         return "AArch64ISD::ST1x3post";
    746   case AArch64ISD::ST1x4post:         return "AArch64ISD::ST1x4post";
    747   case AArch64ISD::LD1DUPpost:        return "AArch64ISD::LD1DUPpost";
    748   case AArch64ISD::LD2DUPpost:        return "AArch64ISD::LD2DUPpost";
    749   case AArch64ISD::LD3DUPpost:        return "AArch64ISD::LD3DUPpost";
    750   case AArch64ISD::LD4DUPpost:        return "AArch64ISD::LD4DUPpost";
    751   case AArch64ISD::LD1LANEpost:       return "AArch64ISD::LD1LANEpost";
    752   case AArch64ISD::LD2LANEpost:       return "AArch64ISD::LD2LANEpost";
    753   case AArch64ISD::LD3LANEpost:       return "AArch64ISD::LD3LANEpost";
    754   case AArch64ISD::LD4LANEpost:       return "AArch64ISD::LD4LANEpost";
    755   case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
    756   case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
    757   case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
    758   }
    759 }
    760 
    761 MachineBasicBlock *
    762 AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
    763                                     MachineBasicBlock *MBB) const {
    764   // We materialise the F128CSEL pseudo-instruction as some control flow and a
    765   // phi node:
    766 
    767   // OrigBB:
    768   //     [... previous instrs leading to comparison ...]
    769   //     b.ne TrueBB
    770   //     b EndBB
    771   // TrueBB:
    772   //     ; Fallthrough
    773   // EndBB:
    774   //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
    775 
    776   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    777   MachineFunction *MF = MBB->getParent();
    778   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    779   DebugLoc DL = MI->getDebugLoc();
    780   MachineFunction::iterator It = MBB;
    781   ++It;
    782 
    783   unsigned DestReg = MI->getOperand(0).getReg();
    784   unsigned IfTrueReg = MI->getOperand(1).getReg();
    785   unsigned IfFalseReg = MI->getOperand(2).getReg();
    786   unsigned CondCode = MI->getOperand(3).getImm();
    787   bool NZCVKilled = MI->getOperand(4).isKill();
    788 
    789   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
    790   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
    791   MF->insert(It, TrueBB);
    792   MF->insert(It, EndBB);
    793 
    794   // Transfer rest of current basic-block to EndBB
    795   EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
    796                 MBB->end());
    797   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
    798 
    799   BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
    800   BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
    801   MBB->addSuccessor(TrueBB);
    802   MBB->addSuccessor(EndBB);
    803 
    804   // TrueBB falls through to the end.
    805   TrueBB->addSuccessor(EndBB);
    806 
    807   if (!NZCVKilled) {
    808     TrueBB->addLiveIn(AArch64::NZCV);
    809     EndBB->addLiveIn(AArch64::NZCV);
    810   }
    811 
    812   BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
    813       .addReg(IfTrueReg)
    814       .addMBB(TrueBB)
    815       .addReg(IfFalseReg)
    816       .addMBB(MBB);
    817 
    818   MI->eraseFromParent();
    819   return EndBB;
    820 }
    821 
    822 MachineBasicBlock *
    823 AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    824                                                  MachineBasicBlock *BB) const {
    825   switch (MI->getOpcode()) {
    826   default:
    827 #ifndef NDEBUG
    828     MI->dump();
    829 #endif
    830     llvm_unreachable("Unexpected instruction for custom inserter!");
    831 
    832   case AArch64::F128CSEL:
    833     return EmitF128CSEL(MI, BB);
    834 
    835   case TargetOpcode::STACKMAP:
    836   case TargetOpcode::PATCHPOINT:
    837     return emitPatchPoint(MI, BB);
    838   }
    839 }
    840 
    841 //===----------------------------------------------------------------------===//
    842 // AArch64 Lowering private implementation.
    843 //===----------------------------------------------------------------------===//
    844 
    845 //===----------------------------------------------------------------------===//
    846 // Lowering Code
    847 //===----------------------------------------------------------------------===//
    848 
    849 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
    850 /// CC
    851 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
    852   switch (CC) {
    853   default:
    854     llvm_unreachable("Unknown condition code!");
    855   case ISD::SETNE:
    856     return AArch64CC::NE;
    857   case ISD::SETEQ:
    858     return AArch64CC::EQ;
    859   case ISD::SETGT:
    860     return AArch64CC::GT;
    861   case ISD::SETGE:
    862     return AArch64CC::GE;
    863   case ISD::SETLT:
    864     return AArch64CC::LT;
    865   case ISD::SETLE:
    866     return AArch64CC::LE;
    867   case ISD::SETUGT:
    868     return AArch64CC::HI;
    869   case ISD::SETUGE:
    870     return AArch64CC::HS;
    871   case ISD::SETULT:
    872     return AArch64CC::LO;
    873   case ISD::SETULE:
    874     return AArch64CC::LS;
    875   }
    876 }
    877 
    878 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
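        /// Some FP conditions need two AArch64 conditions (e.g. SETONE maps to MI or
        /// GT); CondCode2 is left as AL when a single condition suffices, and callers
        /// are expected to combine both conditions otherwise.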
    879 static void changeFPCCToAArch64CC(ISD::CondCode CC,
    880                                   AArch64CC::CondCode &CondCode,
    881                                   AArch64CC::CondCode &CondCode2) {
    882   CondCode2 = AArch64CC::AL;
    883   switch (CC) {
    884   default:
    885     llvm_unreachable("Unknown FP condition!");
    886   case ISD::SETEQ:
    887   case ISD::SETOEQ:
    888     CondCode = AArch64CC::EQ;
    889     break;
    890   case ISD::SETGT:
    891   case ISD::SETOGT:
    892     CondCode = AArch64CC::GT;
    893     break;
    894   case ISD::SETGE:
    895   case ISD::SETOGE:
    896     CondCode = AArch64CC::GE;
    897     break;
    898   case ISD::SETOLT:
    899     CondCode = AArch64CC::MI;
    900     break;
    901   case ISD::SETOLE:
    902     CondCode = AArch64CC::LS;
    903     break;
    904   case ISD::SETONE:
    905     CondCode = AArch64CC::MI;
    906     CondCode2 = AArch64CC::GT;
    907     break;
    908   case ISD::SETO:
    909     CondCode = AArch64CC::VC;
    910     break;
    911   case ISD::SETUO:
    912     CondCode = AArch64CC::VS;
    913     break;
    914   case ISD::SETUEQ:
    915     CondCode = AArch64CC::EQ;
    916     CondCode2 = AArch64CC::VS;
    917     break;
    918   case ISD::SETUGT:
    919     CondCode = AArch64CC::HI;
    920     break;
    921   case ISD::SETUGE:
    922     CondCode = AArch64CC::PL;
    923     break;
    924   case ISD::SETLT:
    925   case ISD::SETULT:
    926     CondCode = AArch64CC::LT;
    927     break;
    928   case ISD::SETLE:
    929   case ISD::SETULE:
    930     CondCode = AArch64CC::LE;
    931     break;
    932   case ISD::SETNE:
    933   case ISD::SETUNE:
    934     CondCode = AArch64CC::NE;
    935     break;
    936   }
    937 }
    938 
    939 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
    940 /// CC usable with the vector instructions. Fewer operations are available
    941 /// without a real NZCV register, so we have to use less efficient combinations
    942 /// to get the same effect.
    943 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
    944                                         AArch64CC::CondCode &CondCode,
    945                                         AArch64CC::CondCode &CondCode2,
    946                                         bool &Invert) {
    947   Invert = false;
    948   switch (CC) {
    949   default:
    950     // Mostly the scalar mappings work fine.
    951     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    952     break;
    953   case ISD::SETUO:
    954     Invert = true; // Fallthrough
    955   case ISD::SETO:
    956     CondCode = AArch64CC::MI;
    957     CondCode2 = AArch64CC::GE;
    958     break;
    959   case ISD::SETUEQ:
    960   case ISD::SETULT:
    961   case ISD::SETULE:
    962   case ISD::SETUGT:
    963   case ISD::SETUGE:
    964     // All of the compare-mask comparisons are ordered, but we can switch
    965     // between the two by a double inversion. E.g. ULE == !OGT.
    966     Invert = true;
    967     changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
    968     break;
    969   }
    970 }
    971 
    972 static bool isLegalArithImmed(uint64_t C) {
    973   // Matches AArch64DAGToDAGISel::SelectArithImmed().
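          // That is, a 12-bit unsigned immediate, optionally shifted left by 12 bits:
          // e.g. 0xFFF and 0xFFF000 are legal, 0x1001 is not.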
    974   return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
    975 }
    976 
    977 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
    978                               SDLoc dl, SelectionDAG &DAG) {
    979   EVT VT = LHS.getValueType();
    980 
    981   if (VT.isFloatingPoint())
    982     return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
    983 
    984   // The CMP instruction is just an alias for SUBS, and representing it as
    985   // SUBS means that it's possible to get CSE with subtract operations.
    986   // A later phase can perform the optimization of setting the destination
    987   // register to WZR/XZR if it ends up being unused.
    988   unsigned Opcode = AArch64ISD::SUBS;
    989 
    990   if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
    991       cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
    992       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    993     // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
    994     // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
    995     // can be set differently by this operation. It comes down to whether
    996     // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
    997     // everything is fine. If not then the optimization is wrong. Thus general
    998     // comparisons are only valid if op2 != 0.
    999 
   1000     // So, finally, the only LLVM-native comparisons that don't mention C and V
   1001     // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
   1002     // the absence of information about op2.
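            // For instance, with op2 == 0, CMP op1, #0 sets the carry flag (a subtract
            // of zero never borrows) whereas CMN op1, #0 clears it, so any carry-based
            // condition would observe different flags.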
   1003     Opcode = AArch64ISD::ADDS;
   1004     RHS = RHS.getOperand(1);
   1005   } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
   1006              cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
   1007              !isUnsignedIntSetCC(CC)) {
   1008     // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
   1009     // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
   1010     // of the signed comparisons.
   1011     Opcode = AArch64ISD::ANDS;
   1012     RHS = LHS.getOperand(1);
   1013     LHS = LHS.getOperand(0);
   1014   }
   1015 
   1016   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
   1017       .getValue(1);
   1018 }
   1019 
   1020 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   1021                              SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
   1022   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
   1023     EVT VT = RHS.getValueType();
   1024     uint64_t C = RHSC->getZExtValue();
   1025     if (!isLegalArithImmed(C)) {
   1026       // Constant does not fit, try adjusting it by one?
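              // For example, (x < 0x1001) has no encodable immediate, but it is
              // equivalent to (x <= 0x1000), whose constant is a legal shifted
              // immediate.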
   1027       switch (CC) {
   1028       default:
   1029         break;
   1030       case ISD::SETLT:
   1031       case ISD::SETGE:
   1032         if ((VT == MVT::i32 && C != 0x80000000 &&
   1033              isLegalArithImmed((uint32_t)(C - 1))) ||
   1034             (VT == MVT::i64 && C != 0x8000000000000000ULL &&
   1035              isLegalArithImmed(C - 1ULL))) {
   1036           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
   1037           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
   1038           RHS = DAG.getConstant(C, VT);
   1039         }
   1040         break;
   1041       case ISD::SETULT:
   1042       case ISD::SETUGE:
   1043         if ((VT == MVT::i32 && C != 0 &&
   1044              isLegalArithImmed((uint32_t)(C - 1))) ||
   1045             (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
   1046           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
   1047           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
   1048           RHS = DAG.getConstant(C, VT);
   1049         }
   1050         break;
   1051       case ISD::SETLE:
   1052       case ISD::SETGT:
   1053         if ((VT == MVT::i32 && C != 0x7fffffff &&
   1054              isLegalArithImmed((uint32_t)(C + 1))) ||
   1055             (VT == MVT::i64 && C != 0x7fffffffffffffffULL &&
   1056              isLegalArithImmed(C + 1ULL))) {
   1057           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
   1058           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
   1059           RHS = DAG.getConstant(C, VT);
   1060         }
   1061         break;
   1062       case ISD::SETULE:
   1063       case ISD::SETUGT:
   1064         if ((VT == MVT::i32 && C != 0xffffffff &&
   1065              isLegalArithImmed((uint32_t)(C + 1))) ||
   1066             (VT == MVT::i64 && C != 0xffffffffffffffffULL &&
   1067              isLegalArithImmed(C + 1ULL))) {
   1068           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
   1069           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
   1070           RHS = DAG.getConstant(C, VT);
   1071         }
   1072         break;
   1073       }
   1074     }
   1075   }
   1076 
   1077   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
   1078   AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
   1079   AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
   1080   return Cmp;
   1081 }
   1082 
   1083 static std::pair<SDValue, SDValue>
   1084 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
   1085   assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
   1086          "Unsupported value type");
   1087   SDValue Value, Overflow;
   1088   SDLoc DL(Op);
   1089   SDValue LHS = Op.getOperand(0);
   1090   SDValue RHS = Op.getOperand(1);
   1091   unsigned Opc = 0;
   1092   switch (Op.getOpcode()) {
   1093   default:
   1094     llvm_unreachable("Unknown overflow instruction!");
   1095   case ISD::SADDO:
   1096     Opc = AArch64ISD::ADDS;
   1097     CC = AArch64CC::VS;
   1098     break;
   1099   case ISD::UADDO:
   1100     Opc = AArch64ISD::ADDS;
   1101     CC = AArch64CC::HS;
   1102     break;
   1103   case ISD::SSUBO:
   1104     Opc = AArch64ISD::SUBS;
   1105     CC = AArch64CC::VS;
   1106     break;
   1107   case ISD::USUBO:
   1108     Opc = AArch64ISD::SUBS;
   1109     CC = AArch64CC::LO;
   1110     break;
   1111   // Multiply needs a little bit extra work.
   1112   case ISD::SMULO:
   1113   case ISD::UMULO: {
   1114     CC = AArch64CC::NE;
   1115     bool IsSigned = Op.getOpcode() == ISD::SMULO;
   1116     if (Op.getValueType() == MVT::i32) {
   1117       unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
   1118       // For a 32 bit multiply with overflow check we want the instruction
   1119       // selector to generate a widening multiply (SMADDL/UMADDL). For that we
   1120       // need to generate the following pattern:
   1121       // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
   1122       LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
   1123       RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
   1124       SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
   1125       SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
   1126                                 DAG.getConstant(0, MVT::i64));
   1127       // On AArch64 the upper 32 bits are always zero extended for a 32 bit
   1128       // operation. We need to clear out the upper 32 bits, because we used a
   1129       // widening multiply that wrote all 64 bits. In the end this should be a
   1130       // noop.
   1131       Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
   1132       if (IsSigned) {
   1133         // The signed overflow check requires more than just a simple check for
   1134         // any bit set in the upper 32 bits of the result. These bits could be
   1135         // just the sign bits of a negative number. To perform the overflow
   1136         // check, we arithmetically shift the lower 32 bits of the result right
   1137         // by 31 (replicating the sign bit) and compare that with the upper 32 bits.
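                // For example, if the product is -5, the widened result is
                // 0xFFFFFFFFFFFFFFFB: its upper 32 bits (0xFFFFFFFF) match the lower 32
                // bits shifted right arithmetically by 31, so no overflow is reported.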
   1138         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
   1139                                         DAG.getConstant(32, MVT::i64));
   1140         UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
   1141         SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
   1142                                         DAG.getConstant(31, MVT::i64));
   1143         // It is important that LowerBits is last, otherwise the arithmetic
   1144         // shift will not be folded into the compare (SUBS).
   1145         SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
   1146         Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
   1147                        .getValue(1);
   1148       } else {
   1149         // The overflow check for unsigned multiply is easy. We only need to
   1150         // check if any of the upper 32 bits are set. This can be done with a
   1151         // CMP (shifted register). For that we need to generate the following
   1152         // pattern:
   1153         // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
   1154         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
   1155                                         DAG.getConstant(32, MVT::i64));
   1156         SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
   1157         Overflow =
   1158             DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
   1159                         UpperBits).getValue(1);
   1160       }
   1161       break;
   1162     }
   1163     assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
   1164     // For the 64-bit multiply, overflow is checked via the high half of the product.
   1165     Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
   1166     if (IsSigned) {
   1167       SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
   1168       SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
   1169                                       DAG.getConstant(63, MVT::i64));
   1170       // It is important that LowerBits is last, otherwise the arithmetic
   1171       // shift will not be folded into the compare (SUBS).
   1172       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
   1173       Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
   1174                      .getValue(1);
   1175     } else {
   1176       SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
   1177       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
   1178       Overflow =
   1179           DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
   1180                       UpperBits).getValue(1);
   1181     }
   1182     break;
   1183   }
   1184   } // switch (...)
   1185 
   1186   if (Opc) {
   1187     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
   1188 
   1189     // Emit the AArch64 operation with overflow check.
   1190     Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
   1191     Overflow = Value.getValue(1);
   1192   }
   1193   return std::make_pair(Value, Overflow);
   1194 }
   1195 
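        /// LowerF128Call - Lower an f128 operation by forwarding all of its
        /// operands to the given RTLIB libcall and returning the call's result.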
   1196 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
   1197                                              RTLIB::Libcall Call) const {
   1198   SmallVector<SDValue, 2> Ops;
   1199   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
   1200     Ops.push_back(Op.getOperand(i));
   1201 
   1202   return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
   1203                      SDLoc(Op)).first;
   1204 }
   1205 
   1206 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
   1207   SDValue Sel = Op.getOperand(0);
   1208   SDValue Other = Op.getOperand(1);
   1209 
   1210   // If neither operand is a SELECT_CC, give up.
   1211   if (Sel.getOpcode() != ISD::SELECT_CC)
   1212     std::swap(Sel, Other);
   1213   if (Sel.getOpcode() != ISD::SELECT_CC)
   1214     return Op;
   1215 
   1216   // The folding we want to perform is:
   1217   // (xor x, (select_cc a, b, cc, 0, -1) )
   1218   //   -->
   1219   // (csel x, (xor x, -1), cc ...)
   1220   //
   1221   // The latter will get matched to a CSINV instruction.
   1222 
   1223   ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
   1224   SDValue LHS = Sel.getOperand(0);
   1225   SDValue RHS = Sel.getOperand(1);
   1226   SDValue TVal = Sel.getOperand(2);
   1227   SDValue FVal = Sel.getOperand(3);
   1228   SDLoc dl(Sel);
   1229 
   1230   // FIXME: This could be generalized to non-integer comparisons.
   1231   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
   1232     return Op;
   1233 
   1234   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
   1235   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
   1236 
   1237   // If the values aren't constants, this isn't the pattern we're looking for.
   1238   if (!CFVal || !CTVal)
   1239     return Op;
   1240 
   1241   // We can commute the SELECT_CC by inverting the condition.  This
   1242   // might be needed to make this fit into a CSINV pattern.
   1243   if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
   1244     std::swap(TVal, FVal);
   1245     std::swap(CTVal, CFVal);
   1246     CC = ISD::getSetCCInverse(CC, true);
   1247   }
   1248 
   1249   // If the constants line up, perform the transform!
   1250   if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
   1251     SDValue CCVal;
   1252     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
   1253 
   1254     FVal = Other;
   1255     TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
   1256                        DAG.getConstant(-1ULL, Other.getValueType()));
   1257 
   1258     return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
   1259                        CCVal, Cmp);
   1260   }
   1261 
   1262   return Op;
   1263 }
   1264 
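        // Lower the carry-setting/carry-using forms (ADDC/ADDE/SUBC/SUBE) onto the
        // flag-based AArch64 nodes ADDS/ADCS/SUBS/SBCS.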
   1265 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   1266   EVT VT = Op.getValueType();
   1267 
   1268   // Let legalize expand this if it isn't a legal type yet.
   1269   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   1270     return SDValue();
   1271 
   1272   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   1273 
   1274   unsigned Opc;
   1275   bool ExtraOp = false;
   1276   switch (Op.getOpcode()) {
   1277   default:
   1278     llvm_unreachable("Invalid code");
   1279   case ISD::ADDC:
   1280     Opc = AArch64ISD::ADDS;
   1281     break;
   1282   case ISD::SUBC:
   1283     Opc = AArch64ISD::SUBS;
   1284     break;
   1285   case ISD::ADDE:
   1286     Opc = AArch64ISD::ADCS;
   1287     ExtraOp = true;
   1288     break;
   1289   case ISD::SUBE:
   1290     Opc = AArch64ISD::SBCS;
   1291     ExtraOp = true;
   1292     break;
   1293   }
   1294 
   1295   if (!ExtraOp)
   1296     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
   1297   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
   1298                      Op.getOperand(2));
   1299 }
   1300 
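        // Lower an overflow-checking arithmetic node to its flag-setting AArch64
        // form plus a CSEL that materializes the overflow bit. For example
        // (illustrative, not the only possible selection), an i32 saddo becomes an
        // ADDS whose flag result feeds a CSEL of 0/1 on the inverted VS condition,
        // which the instruction selector can match as CSINC Wd, WZR, WZR, vc.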
   1301 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   1302   // Let legalize expand this if it isn't a legal type yet.
   1303   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
   1304     return SDValue();
   1305 
   1306   AArch64CC::CondCode CC;
   1307   // The actual operation that sets the overflow or carry flag.
   1308   SDValue Value, Overflow;
   1309   std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
   1310 
   1311   // We use 0 and 1 as false and true values.
   1312   SDValue TVal = DAG.getConstant(1, MVT::i32);
   1313   SDValue FVal = DAG.getConstant(0, MVT::i32);
   1314 
   1315   // We use an inverted condition, because the conditional select is inverted
   1316   // too. This will allow it to be selected to a single instruction:
   1317   // CSINC Wd, WZR, WZR, invert(cond).
   1318   SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
   1319   Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
   1320                          CCVal, Overflow);
   1321 
   1322   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   1323   return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
   1324 }
   1325 
   1326 // Prefetch operands are:
   1327 // 1: Address to prefetch
   1328 // 2: bool isWrite
   1329 // 3: int locality (0 = no locality ... 3 = extreme locality)
   1330 // 4: bool isDataCache
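        //
        // Illustrative resulting PRFM operands: a data read with locality 3
        // encodes as PrfOp 0b00000 (PLDL1KEEP), locality 0 selects the streaming
        // variant (PLDL1STRM), and isWrite selects the PST (prepare-for-store)
        // forms.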
   1331 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
   1332   SDLoc DL(Op);
   1333   unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   1334   unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
   1335   // The isData operand is not used.
   1336   // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
   1337 
   1338   bool IsStream = !Locality;
   1339   // When the locality number is set
   1340   if (Locality) {
   1341     // The front-end should have filtered out the out-of-range values
   1342     assert(Locality <= 3 && "Prefetch locality out-of-range");
   1343     // The locality argument counts up with expected reuse, but the PRFM target
   1344     // encoding counts cache levels down, starting at 0 for L1, so invert the
   1345     // value.
   1346     Locality = 3 - Locality;
   1347   }
   1348 
   1349   // Build the mask value encoding the expected behavior.
   1350   unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
   1351                    (Locality << 1) |    // Cache level bits
   1352                    (unsigned)IsStream;  // Stream bit
   1353   return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
   1354                      DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
   1355 }
   1356 
   1357 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
   1358                                               SelectionDAG &DAG) const {
   1359   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
   1360 
   1361   RTLIB::Libcall LC;
   1362   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
   1363 
   1364   return LowerF128Call(Op, DAG, LC);
   1365 }
   1366 
   1367 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
   1368                                              SelectionDAG &DAG) const {
   1369   if (Op.getOperand(0).getValueType() != MVT::f128) {
   1370     // It's legal except when f128 is involved
   1371     return Op;
   1372   }
   1373 
   1374   RTLIB::Libcall LC;
   1375   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
   1376 
   1377   // FP_ROUND node has a second operand indicating whether it is known to be
   1378   // precise. That doesn't take part in the LibCall so we can't directly use
   1379   // LowerF128Call.
   1380   SDValue SrcVal = Op.getOperand(0);
   1381   return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
   1382                      /*isSigned*/ false, SDLoc(Op)).first;
   1383 }
   1384 
   1385 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
   1386   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
   1387   // Any additional optimization in this function should be recorded
   1388   // in the cost tables.
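          // Illustrative cases: a narrowing fptosi such as v2f64 -> v2i32 is done
          // as a v2f64 -> v2i64 conversion followed by a truncate, while a widening
          // one such as v2f32 -> v2i64 first extends the source to v2f64.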
   1389   EVT InVT = Op.getOperand(0).getValueType();
   1390   EVT VT = Op.getValueType();
   1391 
   1392   if (VT.getSizeInBits() < InVT.getSizeInBits()) {
   1393     SDLoc dl(Op);
   1394     SDValue Cv =
   1395         DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
   1396                     Op.getOperand(0));
   1397     return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
   1398   }
   1399 
   1400   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
   1401     SDLoc dl(Op);
   1402     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
   1403     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
   1404   }
   1405 
   1406   // Type changing conversions are illegal.
   1407   return Op;
   1408 }
   1409 
   1410 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
   1411                                               SelectionDAG &DAG) const {
   1412   if (Op.getOperand(0).getValueType().isVector())
   1413     return LowerVectorFP_TO_INT(Op, DAG);
   1414 
   1415   if (Op.getOperand(0).getValueType() != MVT::f128) {
   1416     // It's legal except when f128 is involved
   1417     return Op;
   1418   }
   1419 
   1420   RTLIB::Libcall LC;
   1421   if (Op.getOpcode() == ISD::FP_TO_SINT)
   1422     LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
   1423   else
   1424     LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
   1425 
   1426   SmallVector<SDValue, 2> Ops;
   1427   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
   1428     Ops.push_back(Op.getOperand(i));
   1429 
   1430   return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
   1431                      SDLoc(Op)).first;
   1432 }
   1433 
   1434 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
   1435   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
   1436   // Any additional optimization in this function should be recorded
   1437   // in the cost tables.
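          // Illustrative cases: a narrowing sitofp such as v2i64 -> v2f32 converts
          // at v2f64 and then rounds, while a widening one such as v2i32 -> v2f64
          // first sign- or zero-extends the source to v2i64.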
   1438   EVT VT = Op.getValueType();
   1439   SDLoc dl(Op);
   1440   SDValue In = Op.getOperand(0);
   1441   EVT InVT = In.getValueType();
   1442 
   1443   if (VT.getSizeInBits() < InVT.getSizeInBits()) {
   1444     MVT CastVT =
   1445         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
   1446                          InVT.getVectorNumElements());
   1447     In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
   1448     return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0));
   1449   }
   1450 
   1451   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
   1452     unsigned CastOpc =
   1453         Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
   1454     EVT CastVT = VT.changeVectorElementTypeToInteger();
   1455     In = DAG.getNode(CastOpc, dl, CastVT, In);
   1456     return DAG.getNode(Op.getOpcode(), dl, VT, In);
   1457   }
   1458 
   1459   return Op;
   1460 }
   1461 
   1462 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
   1463                                             SelectionDAG &DAG) const {
   1464   if (Op.getValueType().isVector())
   1465     return LowerVectorINT_TO_FP(Op, DAG);
   1466 
   1467   // i128 conversions are libcalls.
   1468   if (Op.getOperand(0).getValueType() == MVT::i128)
   1469     return SDValue();
   1470 
   1471   // Other conversions are legal, unless it's to the completely software-based
   1472   // fp128.
   1473   if (Op.getValueType() != MVT::f128)
   1474     return Op;
   1475 
   1476   RTLIB::Libcall LC;
   1477   if (Op.getOpcode() == ISD::SINT_TO_FP)
   1478     LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
   1479   else
   1480     LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
   1481 
   1482   return LowerF128Call(Op, DAG, LC);
   1483 }
   1484 
   1485 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
   1486                                             SelectionDAG &DAG) const {
   1487   // For iOS, we want to call an alternative entry point: __sincos_stret,
   1488   // which returns the values in two S / D registers.
   1489   SDLoc dl(Op);
   1490   SDValue Arg = Op.getOperand(0);
   1491   EVT ArgVT = Arg.getValueType();
   1492   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   1493 
   1494   ArgListTy Args;
   1495   ArgListEntry Entry;
   1496 
   1497   Entry.Node = Arg;
   1498   Entry.Ty = ArgTy;
   1499   Entry.isSExt = false;
   1500   Entry.isZExt = false;
   1501   Args.push_back(Entry);
   1502 
   1503   const char *LibcallName =
   1504       (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
   1505   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
   1506 
   1507   StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
   1508   TargetLowering::CallLoweringInfo CLI(DAG);
   1509   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
   1510     .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
   1511 
   1512   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   1513   return CallResult.first;
   1514 }
   1515 
   1516 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   1517                                               SelectionDAG &DAG) const {
   1518   switch (Op.getOpcode()) {
   1519   default:
   1520     llvm_unreachable("unimplemented operand");
   1521     return SDValue();
   1522   case ISD::GlobalAddress:
   1523     return LowerGlobalAddress(Op, DAG);
   1524   case ISD::GlobalTLSAddress:
   1525     return LowerGlobalTLSAddress(Op, DAG);
   1526   case ISD::SETCC:
   1527     return LowerSETCC(Op, DAG);
   1528   case ISD::BR_CC:
   1529     return LowerBR_CC(Op, DAG);
   1530   case ISD::SELECT:
   1531     return LowerSELECT(Op, DAG);
   1532   case ISD::SELECT_CC:
   1533     return LowerSELECT_CC(Op, DAG);
   1534   case ISD::JumpTable:
   1535     return LowerJumpTable(Op, DAG);
   1536   case ISD::ConstantPool:
   1537     return LowerConstantPool(Op, DAG);
   1538   case ISD::BlockAddress:
   1539     return LowerBlockAddress(Op, DAG);
   1540   case ISD::VASTART:
   1541     return LowerVASTART(Op, DAG);
   1542   case ISD::VACOPY:
   1543     return LowerVACOPY(Op, DAG);
   1544   case ISD::VAARG:
   1545     return LowerVAARG(Op, DAG);
   1546   case ISD::ADDC:
   1547   case ISD::ADDE:
   1548   case ISD::SUBC:
   1549   case ISD::SUBE:
   1550     return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   1551   case ISD::SADDO:
   1552   case ISD::UADDO:
   1553   case ISD::SSUBO:
   1554   case ISD::USUBO:
   1555   case ISD::SMULO:
   1556   case ISD::UMULO:
   1557     return LowerXALUO(Op, DAG);
   1558   case ISD::FADD:
   1559     return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
   1560   case ISD::FSUB:
   1561     return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
   1562   case ISD::FMUL:
   1563     return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
   1564   case ISD::FDIV:
   1565     return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
   1566   case ISD::FP_ROUND:
   1567     return LowerFP_ROUND(Op, DAG);
   1568   case ISD::FP_EXTEND:
   1569     return LowerFP_EXTEND(Op, DAG);
   1570   case ISD::FRAMEADDR:
   1571     return LowerFRAMEADDR(Op, DAG);
   1572   case ISD::RETURNADDR:
   1573     return LowerRETURNADDR(Op, DAG);
   1574   case ISD::INSERT_VECTOR_ELT:
   1575     return LowerINSERT_VECTOR_ELT(Op, DAG);
   1576   case ISD::EXTRACT_VECTOR_ELT:
   1577     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   1578   case ISD::BUILD_VECTOR:
   1579     return LowerBUILD_VECTOR(Op, DAG);
   1580   case ISD::VECTOR_SHUFFLE:
   1581     return LowerVECTOR_SHUFFLE(Op, DAG);
   1582   case ISD::EXTRACT_SUBVECTOR:
   1583     return LowerEXTRACT_SUBVECTOR(Op, DAG);
   1584   case ISD::SRA:
   1585   case ISD::SRL:
   1586   case ISD::SHL:
   1587     return LowerVectorSRA_SRL_SHL(Op, DAG);
   1588   case ISD::SHL_PARTS:
   1589     return LowerShiftLeftParts(Op, DAG);
   1590   case ISD::SRL_PARTS:
   1591   case ISD::SRA_PARTS:
   1592     return LowerShiftRightParts(Op, DAG);
   1593   case ISD::CTPOP:
   1594     return LowerCTPOP(Op, DAG);
   1595   case ISD::FCOPYSIGN:
   1596     return LowerFCOPYSIGN(Op, DAG);
   1597   case ISD::AND:
   1598     return LowerVectorAND(Op, DAG);
   1599   case ISD::OR:
   1600     return LowerVectorOR(Op, DAG);
   1601   case ISD::XOR:
   1602     return LowerXOR(Op, DAG);
   1603   case ISD::PREFETCH:
   1604     return LowerPREFETCH(Op, DAG);
   1605   case ISD::SINT_TO_FP:
   1606   case ISD::UINT_TO_FP:
   1607     return LowerINT_TO_FP(Op, DAG);
   1608   case ISD::FP_TO_SINT:
   1609   case ISD::FP_TO_UINT:
   1610     return LowerFP_TO_INT(Op, DAG);
   1611   case ISD::FSINCOS:
   1612     return LowerFSINCOS(Op, DAG);
   1613   }
   1614 }
   1615 
   1616 /// getFunctionAlignment - Return the Log2 alignment of this function.
   1617 unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
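          // log2(4): functions are aligned to at least 4 bytes, the AArch64
          // instruction size.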
   1618   return 2;
   1619 }
   1620 
   1621 //===----------------------------------------------------------------------===//
   1622 //                      Calling Convention Implementation
   1623 //===----------------------------------------------------------------------===//
   1624 
   1625 #include "AArch64GenCallingConv.inc"
   1626 
   1627 /// Selects the correct CCAssignFn for the given CallingConvention
   1628 /// value.
   1629 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   1630                                                      bool IsVarArg) const {
   1631   switch (CC) {
   1632   default:
   1633     llvm_unreachable("Unsupported calling convention.");
   1634   case CallingConv::WebKit_JS:
   1635     return CC_AArch64_WebKit_JS;
   1636   case CallingConv::C:
   1637   case CallingConv::Fast:
   1638     if (!Subtarget->isTargetDarwin())
   1639       return CC_AArch64_AAPCS;
   1640     return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
   1641   }
   1642 }
   1643 
   1644 SDValue AArch64TargetLowering::LowerFormalArguments(
   1645     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
   1646     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
   1647     SmallVectorImpl<SDValue> &InVals) const {
   1648   MachineFunction &MF = DAG.getMachineFunction();
   1649   MachineFrameInfo *MFI = MF.getFrameInfo();
   1650 
   1651   // Assign locations to all of the incoming arguments.
   1652   SmallVector<CCValAssign, 16> ArgLocs;
   1653   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1654                  getTargetMachine(), ArgLocs, *DAG.getContext());
   1655 
   1656   // At this point, Ins[].VT may already be promoted to i32. To correctly
   1657   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
   1658   // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
   1659   // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
   1660   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
   1661   // LocVT.
   1662   unsigned NumArgs = Ins.size();
   1663   Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
   1664   unsigned CurArgIdx = 0;
   1665   for (unsigned i = 0; i != NumArgs; ++i) {
   1666     MVT ValVT = Ins[i].VT;
   1667     std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
   1668     CurArgIdx = Ins[i].OrigArgIndex;
   1669 
   1670     // Get type of the original argument.
   1671     EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
   1672     MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
   1673     // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
   1674     if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
   1675       ValVT = MVT::i8;
   1676     else if (ActualMVT == MVT::i16)
   1677       ValVT = MVT::i16;
   1678 
   1679     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
   1680     bool Res =
   1681         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
   1682     assert(!Res && "Call operand has unhandled type");
   1683     (void)Res;
   1684   }
   1685   assert(ArgLocs.size() == Ins.size());
   1686   SmallVector<SDValue, 16> ArgValues;
   1687   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   1688     CCValAssign &VA = ArgLocs[i];
   1689 
   1690     if (Ins[i].Flags.isByVal()) {
   1691       // Byval is used for HFAs in the PCS, but the system should work in a
   1692       // non-compliant manner for larger structs.
   1693       EVT PtrTy = getPointerTy();
   1694       int Size = Ins[i].Flags.getByValSize();
   1695       unsigned NumRegs = (Size + 7) / 8;
   1696 
   1697       // FIXME: This works on big-endian for composite byvals, which are the
   1698       // common case. It should also work for fundamental types.
   1699       unsigned FrameIdx =
   1700         MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
   1701       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
   1702       InVals.push_back(FrameIdxN);
   1703 
   1704       continue;
   1705     }
   1706 
   1707     if (VA.isRegLoc()) {
   1708       // Arguments stored in registers.
   1709       EVT RegVT = VA.getLocVT();
   1710 
   1711       SDValue ArgValue;
   1712       const TargetRegisterClass *RC;
   1713 
   1714       if (RegVT == MVT::i32)
   1715         RC = &AArch64::GPR32RegClass;
   1716       else if (RegVT == MVT::i64)
   1717         RC = &AArch64::GPR64RegClass;
   1718       else if (RegVT == MVT::f32)
   1719         RC = &AArch64::FPR32RegClass;
   1720       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
   1721         RC = &AArch64::FPR64RegClass;
   1722       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
   1723         RC = &AArch64::FPR128RegClass;
   1724       else
   1725         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
   1726 
   1727       // Transform the arguments in physical registers into virtual ones.
   1728       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   1729       ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
   1730 
   1731       // If this is an 8, 16 or 32-bit value, it is really passed promoted
   1732       // to 64 bits.  Insert an assert[sz]ext to capture this, then
   1733       // truncate to the right size.
   1734       switch (VA.getLocInfo()) {
   1735       default:
   1736         llvm_unreachable("Unknown loc info!");
   1737       case CCValAssign::Full:
   1738         break;
   1739       case CCValAssign::BCvt:
   1740         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
   1741         break;
   1742       case CCValAssign::AExt:
   1743       case CCValAssign::SExt:
   1744       case CCValAssign::ZExt:
   1745         // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
   1746         // nodes after our lowering.
   1747         assert(RegVT == Ins[i].VT && "incorrect register location selected");
   1748         break;
   1749       }
   1750 
   1751       InVals.push_back(ArgValue);
   1752 
   1753     } else { // VA.isRegLoc()
   1754       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
   1755       unsigned ArgOffset = VA.getLocMemOffset();
   1756       unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
   1757 
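              // On big-endian targets a sub-8-byte value occupies the high-address
              // part of its 8-byte stack slot, so the load is offset by 8 - size
              // bytes (e.g. +4 for an i32).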
   1758       uint32_t BEAlign = 0;
   1759       if (ArgSize < 8 && !Subtarget->isLittleEndian())
   1760         BEAlign = 8 - ArgSize;
   1761 
   1762       int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
   1763 
   1764       // Create load nodes to retrieve arguments from the stack.
   1765       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
   1766       SDValue ArgValue;
   1767 
   1768       // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
   1769       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
   1770       MVT MemVT = VA.getValVT();
   1771 
   1772       switch (VA.getLocInfo()) {
   1773       default:
   1774         break;
   1775       case CCValAssign::BCvt:
   1776         MemVT = VA.getLocVT();
   1777         break;
   1778       case CCValAssign::SExt:
   1779         ExtType = ISD::SEXTLOAD;
   1780         break;
   1781       case CCValAssign::ZExt:
   1782         ExtType = ISD::ZEXTLOAD;
   1783         break;
   1784       case CCValAssign::AExt:
   1785         ExtType = ISD::EXTLOAD;
   1786         break;
   1787       }
   1788 
   1789       ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
   1790                                 MachinePointerInfo::getFixedStack(FI),
   1791                                 MemVT, false, false, false, nullptr);
   1792 
   1793       InVals.push_back(ArgValue);
   1794     }
   1795   }
   1796 
   1797   // varargs
   1798   if (isVarArg) {
   1799     if (!Subtarget->isTargetDarwin()) {
   1800       // The AAPCS variadic function ABI is identical to the non-variadic
   1801       // one. As a result there may be more arguments in registers and we should
   1802       // save them for future reference.
   1803       saveVarArgRegisters(CCInfo, DAG, DL, Chain);
   1804     }
   1805 
   1806     AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   1807     // This will point to the next argument passed via stack.
   1808     unsigned StackOffset = CCInfo.getNextStackOffset();
   1809     // We currently pass all varargs at 8-byte alignment.
   1810     StackOffset = ((StackOffset + 7) & ~7);
   1811     AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
   1812   }
   1813 
   1814   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   1815   unsigned StackArgSize = CCInfo.getNextStackOffset();
   1816   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
   1817   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
   1818     // This is a non-standard ABI so by fiat I say we're allowed to make full
   1819     // use of the stack area to be popped, which must be aligned to 16 bytes in
   1820     // any case:
   1821     StackArgSize = RoundUpToAlignment(StackArgSize, 16);
   1822 
   1823     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
   1824     // a multiple of 16.
   1825     FuncInfo->setArgumentStackToRestore(StackArgSize);
   1826 
   1827     // This realignment carries over to the available bytes below. Our own
   1828     // callers will guarantee the space is free by giving an aligned value to
   1829     // CALLSEQ_START.
   1830   }
   1831   // Even if we're not expected to free up the space, it's useful to know how
   1832   // much is there while considering tail calls (because we can reuse it).
   1833   FuncInfo->setBytesInStackArgArea(StackArgSize);
   1834 
   1835   return Chain;
   1836 }
   1837 
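        /// saveVarArgRegisters - Spill the variadic GPR (X0-X7) and, when FP/SIMD
        /// is available, FPR (Q0-Q7) argument registers that were not used by the
        /// named arguments into stack save areas, and record those areas in
        /// AArch64FunctionInfo for use when lowering va_start/va_arg.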
   1838 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
   1839                                                 SelectionDAG &DAG, SDLoc DL,
   1840                                                 SDValue &Chain) const {
   1841   MachineFunction &MF = DAG.getMachineFunction();
   1842   MachineFrameInfo *MFI = MF.getFrameInfo();
   1843   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   1844 
   1845   SmallVector<SDValue, 8> MemOps;
   1846 
   1847   static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
   1848                                           AArch64::X3, AArch64::X4, AArch64::X5,
   1849                                           AArch64::X6, AArch64::X7 };
   1850   static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
   1851   unsigned FirstVariadicGPR =
   1852       CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
   1853 
   1854   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
   1855   int GPRIdx = 0;
   1856   if (GPRSaveSize != 0) {
   1857     GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
   1858 
   1859     SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
   1860 
   1861     for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
   1862       unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
   1863       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
   1864       SDValue Store =
   1865           DAG.getStore(Val.getValue(1), DL, Val, FIN,
   1866                        MachinePointerInfo::getStack(i * 8), false, false, 0);
   1867       MemOps.push_back(Store);
   1868       FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
   1869                         DAG.getConstant(8, getPointerTy()));
   1870     }
   1871   }
   1872   FuncInfo->setVarArgsGPRIndex(GPRIdx);
   1873   FuncInfo->setVarArgsGPRSize(GPRSaveSize);
   1874 
   1875   if (Subtarget->hasFPARMv8()) {
   1876     static const MCPhysReg FPRArgRegs[] = {
   1877         AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
   1878         AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
   1879     static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
   1880     unsigned FirstVariadicFPR =
   1881         CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
   1882 
   1883     unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
   1884     int FPRIdx = 0;
   1885     if (FPRSaveSize != 0) {
   1886       FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
   1887 
   1888       SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
   1889 
   1890       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
   1891         unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
   1892         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
   1893 
   1894         SDValue Store =
   1895             DAG.getStore(Val.getValue(1), DL, Val, FIN,
   1896                          MachinePointerInfo::getStack(i * 16), false, false, 0);
   1897         MemOps.push_back(Store);
   1898         FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
   1899                           DAG.getConstant(16, getPointerTy()));
   1900       }
   1901     }
   1902     FuncInfo->setVarArgsFPRIndex(FPRIdx);
   1903     FuncInfo->setVarArgsFPRSize(FPRSaveSize);
   1904   }
   1905 
   1906   if (!MemOps.empty()) {
   1907     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
   1908   }
   1909 }
   1910 
   1911 /// LowerCallResult - Lower the result values of a call into the
   1912 /// appropriate copies out of appropriate physical registers.
   1913 SDValue AArch64TargetLowering::LowerCallResult(
   1914     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
   1915     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
   1916     SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
   1917     SDValue ThisVal) const {
   1918   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
   1919                           ? RetCC_AArch64_WebKit_JS
   1920                           : RetCC_AArch64_AAPCS;
   1921   // Assign locations to each value returned by this call.
   1922   SmallVector<CCValAssign, 16> RVLocs;
   1923   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1924                  getTargetMachine(), RVLocs, *DAG.getContext());
   1925   CCInfo.AnalyzeCallResult(Ins, RetCC);
   1926 
   1927   // Copy all of the result registers out of their specified physreg.
   1928   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1929     CCValAssign VA = RVLocs[i];
   1930 
   1931     // Pass 'this' value directly from the argument to return value, to avoid
   1932     // reg unit interference
   1933     if (i == 0 && isThisReturn) {
   1934       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
   1935              "unexpected return calling convention register assignment");
   1936       InVals.push_back(ThisVal);
   1937       continue;
   1938     }
   1939 
   1940     SDValue Val =
   1941         DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
   1942     Chain = Val.getValue(1);
   1943     InFlag = Val.getValue(2);
   1944 
   1945     switch (VA.getLocInfo()) {
   1946     default:
   1947       llvm_unreachable("Unknown loc info!");
   1948     case CCValAssign::Full:
   1949       break;
   1950     case CCValAssign::BCvt:
   1951       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
   1952       break;
   1953     }
   1954 
   1955     InVals.push_back(Val);
   1956   }
   1957 
   1958   return Chain;
   1959 }
   1960 
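        /// Decide whether a call can be lowered as a tail call: the calling
        /// conventions, byval arguments, variadic memory operands, return-value
        /// locations and stack-argument space are all checked below.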
   1961 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   1962     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
   1963     bool isCalleeStructRet, bool isCallerStructRet,
   1964     const SmallVectorImpl<ISD::OutputArg> &Outs,
   1965     const SmallVectorImpl<SDValue> &OutVals,
   1966     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
   1967   // For CallingConv::C this function knows whether the ABI needs
   1968   // changing. That's not true for other conventions so they will have to opt in
   1969   // manually.
   1970   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
   1971     return false;
   1972 
   1973   const MachineFunction &MF = DAG.getMachineFunction();
   1974   const Function *CallerF = MF.getFunction();
   1975   CallingConv::ID CallerCC = CallerF->getCallingConv();
   1976   bool CCMatch = CallerCC == CalleeCC;
   1977 
   1978   // Byval parameters hand the function a pointer directly into the stack area
   1979   // we want to reuse during a tail call. Working around this *is* possible (see
   1980   // X86) but less efficient and uglier in LowerCall.
   1981   for (Function::const_arg_iterator i = CallerF->arg_begin(),
   1982                                     e = CallerF->arg_end();
   1983        i != e; ++i)
   1984     if (i->hasByValAttr())
   1985       return false;
   1986 
   1987   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
   1988     if (IsTailCallConvention(CalleeCC) && CCMatch)
   1989       return true;
   1990     return false;
   1991   }
   1992 
   1993   // Now we search for cases where we can use a tail call without changing the
   1994   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
   1995   // concept.
   1996 
   1997   // I want anyone implementing a new calling convention to think long and hard
   1998   // about this assert.
   1999   assert((!isVarArg || CalleeCC == CallingConv::C) &&
   2000          "Unexpected variadic calling convention");
   2001 
   2002   if (isVarArg && !Outs.empty()) {
   2003     // At least two cases here: if caller is fastcc then we can't have any
   2004     // memory arguments (we'd be expected to clean up the stack afterwards). If
   2005     // caller is C then we could potentially use its argument area.
   2006 
   2007     // FIXME: for now we take the most conservative of these in both cases:
   2008     // disallow all variadic memory operands.
   2009     SmallVector<CCValAssign, 16> ArgLocs;
   2010     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   2011                    getTargetMachine(), ArgLocs, *DAG.getContext());
   2012 
   2013     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
   2014     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   2015       if (!ArgLocs[i].isRegLoc())
   2016         return false;
   2017   }
   2018 
   2019   // If the calling conventions do not match, then we'd better make sure the
   2020   // results are returned in the same way as what the caller expects.
   2021   if (!CCMatch) {
   2022     SmallVector<CCValAssign, 16> RVLocs1;
   2023     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
   2024                     getTargetMachine(), RVLocs1, *DAG.getContext());
   2025     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
   2026 
   2027     SmallVector<CCValAssign, 16> RVLocs2;
   2028     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
   2029                     getTargetMachine(), RVLocs2, *DAG.getContext());
   2030     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
   2031 
   2032     if (RVLocs1.size() != RVLocs2.size())
   2033       return false;
   2034     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   2035       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   2036         return false;
   2037       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   2038         return false;
   2039       if (RVLocs1[i].isRegLoc()) {
   2040         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   2041           return false;
   2042       } else {
   2043         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   2044           return false;
   2045       }
   2046     }
   2047   }
   2048 
   2049   // Nothing more to check if the callee is taking no arguments
   2050   if (Outs.empty())
   2051     return true;
   2052 
   2053   SmallVector<CCValAssign, 16> ArgLocs;
   2054   CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
   2055                  getTargetMachine(), ArgLocs, *DAG.getContext());
   2056 
   2057   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
   2058 
   2059   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   2060 
   2061   // If the stack arguments for this call would fit into our own save area then
   2062   // the call can be made tail.
   2063   return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
   2064 }
   2065 
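        /// addTokenForArgument - When an outgoing tail-call store is about to
        /// overwrite the caller's stack slot ClobberedFI, chain any loads from
        /// overlapping incoming-argument slots before it so they aren't clobbered.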
   2066 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
   2067                                                    SelectionDAG &DAG,
   2068                                                    MachineFrameInfo *MFI,
   2069                                                    int ClobberedFI) const {
   2070   SmallVector<SDValue, 8> ArgChains;
   2071   int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
   2072   int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
   2073 
   2074   // Include the original chain at the beginning of the list. When this is
   2075   // used by target LowerCall hooks, this helps legalize find the
   2076   // CALLSEQ_BEGIN node.
   2077   ArgChains.push_back(Chain);
   2078 
   2079   // Add a chain value for each stack-argument load that may overlap the clobbered slot.
   2080   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
   2081                             UE = DAG.getEntryNode().getNode()->use_end();
   2082        U != UE; ++U)
   2083     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
   2084       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
   2085         if (FI->getIndex() < 0) {
   2086           int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
   2087           int64_t InLastByte = InFirstByte;
   2088           InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
   2089 
   2090           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
   2091               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
   2092             ArgChains.push_back(SDValue(L, 1));
   2093         }
   2094 
   2095   // Build a tokenfactor for all the chains.
   2096   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
   2097 }
   2098 
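        /// Return true if the callee is expected to pop its own stack arguments;
        /// this only happens for fastcc calls when GuaranteedTailCallOpt is set.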
   2099 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
   2100                                                    bool TailCallOpt) const {
   2101   return CallCC == CallingConv::Fast && TailCallOpt;
   2102 }
   2103 
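        /// Only CallingConv::Fast is treated as a guaranteed tail-call convention.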
   2104 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
   2105   return CallCC == CallingConv::Fast;
   2106 }
   2107 
   2108 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
   2109 /// and add input and output parameter nodes.
   2110 SDValue
   2111 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   2112                                  SmallVectorImpl<SDValue> &InVals) const {
   2113   SelectionDAG &DAG = CLI.DAG;
   2114   SDLoc &DL = CLI.DL;
   2115   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
   2116   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
   2117   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
   2118   SDValue Chain = CLI.Chain;
   2119   SDValue Callee = CLI.Callee;
   2120   bool &IsTailCall = CLI.IsTailCall;
   2121   CallingConv::ID CallConv = CLI.CallConv;
   2122   bool IsVarArg = CLI.IsVarArg;
   2123 
   2124   MachineFunction &MF = DAG.getMachineFunction();
   2125   bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
   2126   bool IsThisReturn = false;
   2127 
   2128   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   2129   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
   2130   bool IsSibCall = false;
   2131 
   2132   if (IsTailCall) {
   2133     // Check if it's really possible to do a tail call.
   2134     IsTailCall = isEligibleForTailCallOptimization(
   2135         Callee, CallConv, IsVarArg, IsStructRet,
   2136         MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
   2137     if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
   2138       report_fatal_error("failed to perform tail call elimination on a call "
   2139                          "site marked musttail");
   2140 
   2141     // A sibling call is one where we're under the usual C ABI and not planning
   2142     // to change that but can still do a tail call:
   2143     if (!TailCallOpt && IsTailCall)
   2144       IsSibCall = true;
   2145 
   2146     if (IsTailCall)
   2147       ++NumTailCalls;
   2148   }
   2149 
   2150   // Analyze operands of the call, assigning locations to each operand.
   2151   SmallVector<CCValAssign, 16> ArgLocs;
   2152   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
   2153                  getTargetMachine(), ArgLocs, *DAG.getContext());
   2154 
   2155   if (IsVarArg) {
   2156     // Handle fixed and variable vector arguments differently.
   2157     // Variable vector arguments always go into memory.
   2158     unsigned NumArgs = Outs.size();
   2159 
   2160     for (unsigned i = 0; i != NumArgs; ++i) {
   2161       MVT ArgVT = Outs[i].VT;
   2162       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
   2163       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
   2164                                                /*IsVarArg=*/ !Outs[i].IsFixed);
   2165       bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
   2166       assert(!Res && "Call operand has unhandled type");
   2167       (void)Res;
   2168     }
   2169   } else {
   2170     // At this point, Outs[].VT may already be promoted to i32. To correctly
   2171     // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
   2172     // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
   2173     // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
   2174     // we use a special version of AnalyzeCallOperands to pass in ValVT and
   2175     // LocVT.
   2176     unsigned NumArgs = Outs.size();
   2177     for (unsigned i = 0; i != NumArgs; ++i) {
   2178       MVT ValVT = Outs[i].VT;
   2179       // Get type of the original argument.
   2180       EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
   2181                                   /*AllowUnknown*/ true);
   2182       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
   2183       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
   2184       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
   2185       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
   2186         ValVT = MVT::i8;
   2187       else if (ActualMVT == MVT::i16)
   2188         ValVT = MVT::i16;
   2189 
   2190       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
   2191       bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
   2192       assert(!Res && "Call operand has unhandled type");
   2193       (void)Res;
   2194     }
   2195   }
   2196 
   2197   // Get a count of how many bytes are to be pushed on the stack.
   2198   unsigned NumBytes = CCInfo.getNextStackOffset();
   2199 
   2200   if (IsSibCall) {
   2201     // Since we're not changing the ABI to make this a tail call, the memory
   2202     // operands are already available in the caller's incoming argument space.
   2203     NumBytes = 0;
   2204   }
   2205 
   2206   // FPDiff is the byte offset of the call's argument area from the callee's.
   2207   // Stores to callee stack arguments will be placed in FixedStackSlots offset
   2208   // by this amount for a tail call. In a sibling call it must be 0 because the
   2209   // caller will deallocate the entire stack and the callee still expects its
   2210   // arguments to begin at SP+0. Completely unused for non-tail calls.
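          // For example (purely illustrative): 32 reusable bytes in the caller's
          // argument area and 48 bytes needed by the tail call give FPDiff = -16,
          // i.e. the call needs 16 more bytes of argument space than we received.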
   2211   int FPDiff = 0;
   2212 
   2213   if (IsTailCall && !IsSibCall) {
   2214     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
   2215 
   2216     // Since callee will pop argument stack as a tail call, we must keep the
   2217     // popped size 16-byte aligned.
   2218     NumBytes = RoundUpToAlignment(NumBytes, 16);
   2219 
   2220     // FPDiff will be negative if this tail call requires more space than we
   2221     // would automatically have in our incoming argument space. Positive if we
   2222     // can actually shrink the stack.
   2223     FPDiff = NumReusableBytes - NumBytes;
   2224 
   2225     // The stack pointer must be 16-byte aligned at all times it's used for a
   2226     // memory operation, which in practice means at *all* times and in
   2227     // particular across call boundaries. Therefore our own arguments started at
   2228     // a 16-byte aligned SP and the delta applied for the tail call should
   2229     // satisfy the same constraint.
   2230     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
   2231   }
   2232 
   2233   // Adjust the stack pointer for the new arguments...
   2234   // These operations are automatically eliminated by the prolog/epilog pass
   2235   if (!IsSibCall)
   2236     Chain =
   2237         DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
   2238 
   2239   SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
   2240 
   2241   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2242   SmallVector<SDValue, 8> MemOpChains;
   2243 
   2244   // Walk the register/memloc assignments, inserting copies/loads.
   2245   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
   2246        ++i, ++realArgIdx) {
   2247     CCValAssign &VA = ArgLocs[i];
   2248     SDValue Arg = OutVals[realArgIdx];
   2249     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
   2250 
   2251     // Promote the value if needed.
   2252     switch (VA.getLocInfo()) {
   2253     default:
   2254       llvm_unreachable("Unknown loc info!");
   2255     case CCValAssign::Full:
   2256       break;
   2257     case CCValAssign::SExt:
   2258       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
   2259       break;
   2260     case CCValAssign::ZExt:
   2261       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
   2262       break;
   2263     case CCValAssign::AExt:
   2264       if (Outs[realArgIdx].ArgVT == MVT::i1) {
   2265         // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
   2266         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
   2267         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
   2268       }
   2269       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
   2270       break;
   2271     case CCValAssign::BCvt:
   2272       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
   2273       break;
   2274     case CCValAssign::FPExt:
   2275       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
   2276       break;
   2277     }
   2278 
   2279     if (VA.isRegLoc()) {
   2280       if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
   2281         assert(VA.getLocVT() == MVT::i64 &&
   2282                "unexpected calling convention register assignment");
   2283         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
   2284                "unexpected use of 'returned'");
   2285         IsThisReturn = true;
   2286       }
   2287       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   2288     } else {
   2289       assert(VA.isMemLoc());
   2290 
   2291       SDValue DstAddr;
   2292       MachinePointerInfo DstInfo;
   2293 
   2294       // FIXME: This works on big-endian for composite byvals, which are the
   2295       // common case. It should also work for fundamental types.
   2296       uint32_t BEAlign = 0;
   2297       unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
   2298                                         : VA.getLocVT().getSizeInBits();
   2299       OpSize = (OpSize + 7) / 8;
   2300       if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
   2301         if (OpSize < 8)
   2302           BEAlign = 8 - OpSize;
   2303       }
   2304       unsigned LocMemOffset = VA.getLocMemOffset();
   2305       int32_t Offset = LocMemOffset + BEAlign;
   2306       SDValue PtrOff = DAG.getIntPtrConstant(Offset);
   2307       PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
   2308 
   2309       if (IsTailCall) {
   2310         Offset = Offset + FPDiff;
   2311         int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   2312 
   2313         DstAddr = DAG.getFrameIndex(FI, getPointerTy());
   2314         DstInfo = MachinePointerInfo::getFixedStack(FI);
   2315 
   2316         // Make sure any stack arguments overlapping with where we're storing
   2317         // are loaded before this eventual operation. Otherwise they'll be
   2318         // clobbered.
   2319         Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
   2320       } else {
   2321         SDValue PtrOff = DAG.getIntPtrConstant(Offset);
   2322 
   2323         DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
   2324         DstInfo = MachinePointerInfo::getStack(LocMemOffset);
   2325       }
   2326 
   2327       if (Outs[i].Flags.isByVal()) {
   2328         SDValue SizeNode =
   2329             DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
   2330         SDValue Cpy = DAG.getMemcpy(
   2331             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
   2332             /*isVolatile = */ false,
   2333             /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
   2334 
   2335         MemOpChains.push_back(Cpy);
   2336       } else {
   2337         // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
   2338         // promoted to a legal register type i32, we should truncate Arg back to
   2339         // i1/i8/i16.
   2340         if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
   2341             VA.getValVT() == MVT::i16)
   2342           Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
   2343 
   2344         SDValue Store =
   2345             DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
   2346         MemOpChains.push_back(Store);
   2347       }
   2348     }
   2349   }
   2350 
   2351   if (!MemOpChains.empty())
   2352     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
   2353 
   2354   // Build a sequence of copy-to-reg nodes chained together with token chain
   2355   // and flag operands which copy the outgoing args into the appropriate regs.
   2356   SDValue InFlag;
   2357   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2358     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
   2359                              RegsToPass[i].second, InFlag);
   2360     InFlag = Chain.getValue(1);
   2361   }
   2362 
   2363   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   2364   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   2365   // node so that legalize doesn't hack it.
   2366   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
   2367       Subtarget->isTargetMachO()) {
   2368     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   2369       const GlobalValue *GV = G->getGlobal();
   2370       bool InternalLinkage = GV->hasInternalLinkage();
   2371       if (InternalLinkage)
   2372         Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
   2373       else {
   2374         Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
   2375                                             AArch64II::MO_GOT);
   2376         Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
   2377       }
   2378     } else if (ExternalSymbolSDNode *S =
   2379                    dyn_cast<ExternalSymbolSDNode>(Callee)) {
   2380       const char *Sym = S->getSymbol();
   2381       Callee =
   2382           DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
   2383       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
   2384     }
   2385   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
   2386     const GlobalValue *GV = G->getGlobal();
   2387     Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
   2388   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   2389     const char *Sym = S->getSymbol();
   2390     Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
   2391   }
   2392 
   2393   // We don't usually want to end the call-sequence here because we would tidy
    2394   // the frame up *after* the call. However, in the ABI-changing tail-call case
   2395   // we've carefully laid out the parameters so that when sp is reset they'll be
   2396   // in the correct location.
   2397   if (IsTailCall && !IsSibCall) {
   2398     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
   2399                                DAG.getIntPtrConstant(0, true), InFlag, DL);
   2400     InFlag = Chain.getValue(1);
   2401   }
   2402 
   2403   std::vector<SDValue> Ops;
   2404   Ops.push_back(Chain);
   2405   Ops.push_back(Callee);
   2406 
   2407   if (IsTailCall) {
   2408     // Each tail call may have to adjust the stack by a different amount, so
   2409     // this information must travel along with the operation for eventual
   2410     // consumption by emitEpilogue.
   2411     Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
   2412   }
   2413 
   2414   // Add argument registers to the end of the list so that they are known live
   2415   // into the call.
   2416   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   2417     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   2418                                   RegsToPass[i].second.getValueType()));
   2419 
   2420   // Add a register mask operand representing the call-preserved registers.
   2421   const uint32_t *Mask;
   2422   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   2423   const AArch64RegisterInfo *ARI =
   2424       static_cast<const AArch64RegisterInfo *>(TRI);
   2425   if (IsThisReturn) {
   2426     // For 'this' returns, use the X0-preserving mask if applicable
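             // (i.e. a mask that additionally treats X0 as preserved, so the returned
             // "this" pointer can be reused from X0 without an extra copy).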
   2427     Mask = ARI->getThisReturnPreservedMask(CallConv);
   2428     if (!Mask) {
   2429       IsThisReturn = false;
   2430       Mask = ARI->getCallPreservedMask(CallConv);
   2431     }
   2432   } else
   2433     Mask = ARI->getCallPreservedMask(CallConv);
   2434 
   2435   assert(Mask && "Missing call preserved mask for calling convention");
   2436   Ops.push_back(DAG.getRegisterMask(Mask));
   2437 
   2438   if (InFlag.getNode())
   2439     Ops.push_back(InFlag);
   2440 
   2441   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   2442 
    2443   // If we're doing a tail call, use a TC_RETURN here rather than an
   2444   // actual call instruction.
   2445   if (IsTailCall)
   2446     return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
   2447 
   2448   // Returns a chain and a flag for retval copy to use.
   2449   Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
   2450   InFlag = Chain.getValue(1);
   2451 
   2452   uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
   2453                                 ? RoundUpToAlignment(NumBytes, 16)
   2454                                 : 0;
   2455 
   2456   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
   2457                              DAG.getIntPtrConstant(CalleePopBytes, true),
   2458                              InFlag, DL);
   2459   if (!Ins.empty())
   2460     InFlag = Chain.getValue(1);
   2461 
   2462   // Handle result values, copying them out of physregs into vregs that we
   2463   // return.
   2464   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
   2465                          InVals, IsThisReturn,
   2466                          IsThisReturn ? OutVals[0] : SDValue());
   2467 }
   2468 
   2469 bool AArch64TargetLowering::CanLowerReturn(
   2470     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
   2471     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
   2472   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
   2473                           ? RetCC_AArch64_WebKit_JS
   2474                           : RetCC_AArch64_AAPCS;
   2475   SmallVector<CCValAssign, 16> RVLocs;
   2476   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
   2477   return CCInfo.CheckReturn(Outs, RetCC);
   2478 }
   2479 
   2480 SDValue
   2481 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   2482                                    bool isVarArg,
   2483                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
   2484                                    const SmallVectorImpl<SDValue> &OutVals,
   2485                                    SDLoc DL, SelectionDAG &DAG) const {
   2486   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
   2487                           ? RetCC_AArch64_WebKit_JS
   2488                           : RetCC_AArch64_AAPCS;
   2489   SmallVector<CCValAssign, 16> RVLocs;
   2490   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   2491                  getTargetMachine(), RVLocs, *DAG.getContext());
   2492   CCInfo.AnalyzeReturn(Outs, RetCC);
   2493 
   2494   // Copy the result values into the output registers.
   2495   SDValue Flag;
   2496   SmallVector<SDValue, 4> RetOps(1, Chain);
   2497   for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
   2498        ++i, ++realRVLocIdx) {
   2499     CCValAssign &VA = RVLocs[i];
   2500     assert(VA.isRegLoc() && "Can only return in registers!");
   2501     SDValue Arg = OutVals[realRVLocIdx];
   2502 
   2503     switch (VA.getLocInfo()) {
   2504     default:
   2505       llvm_unreachable("Unknown loc info!");
   2506     case CCValAssign::Full:
   2507       if (Outs[i].ArgVT == MVT::i1) {
   2508         // AAPCS requires i1 to be zero-extended to i8 by the producer of the
   2509         // value. This is strictly redundant on Darwin (which uses "zeroext
   2510         // i1"), but will be optimised out before ISel.
   2511         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
   2512         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
   2513       }
   2514       break;
   2515     case CCValAssign::BCvt:
   2516       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
   2517       break;
   2518     }
   2519 
   2520     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
   2521     Flag = Chain.getValue(1);
   2522     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   2523   }
   2524 
   2525   RetOps[0] = Chain; // Update chain.
   2526 
   2527   // Add the flag if we have it.
   2528   if (Flag.getNode())
   2529     RetOps.push_back(Flag);
   2530 
   2531   return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
   2532 }
   2533 
   2534 //===----------------------------------------------------------------------===//
   2535 //  Other Lowering Code
   2536 //===----------------------------------------------------------------------===//
   2537 
   2538 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
   2539                                                   SelectionDAG &DAG) const {
   2540   EVT PtrVT = getPointerTy();
   2541   SDLoc DL(Op);
   2542   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   2543   unsigned char OpFlags =
   2544       Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
   2545 
   2546   assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
   2547          "unexpected offset in global node");
   2548 
    2549   // This also catches the large code model case for Darwin.
   2550   if ((OpFlags & AArch64II::MO_GOT) != 0) {
   2551     SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
   2552     // FIXME: Once remat is capable of dealing with instructions with register
   2553     // operands, expand this into two nodes instead of using a wrapper node.
   2554     return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
   2555   }
   2556 
   2557   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
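             // In the large code model the full 64-bit address is materialized
             // piecewise, roughly as one movz plus three movk instructions using the
             // :abs_g3:, :abs_g2_nc:, :abs_g1_nc: and :abs_g0_nc: operators (one
             // 16-bit chunk of the address each).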
   2558     const unsigned char MO_NC = AArch64II::MO_NC;
   2559     return DAG.getNode(
   2560         AArch64ISD::WrapperLarge, DL, PtrVT,
   2561         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
   2562         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
   2563         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
   2564         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
   2565   } else {
   2566     // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
   2567     // the only correct model on Darwin.
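             // For illustration (relocation syntax as accepted by typical AArch64
             // assemblers), the non-GOT sequence here is:
             //     adrp x0, var             ; 4KB page containing var
             //     add  x0, x0, :lo12:var   ; low 12 bits of the offset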
   2568     SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
   2569                                             OpFlags | AArch64II::MO_PAGE);
   2570     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
   2571     SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
   2572 
   2573     SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
   2574     return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
   2575   }
   2576 }
   2577 
   2578 /// \brief Convert a TLS address reference into the correct sequence of loads
   2579 /// and calls to compute the variable's address (for Darwin, currently) and
   2580 /// return an SDValue containing the final node.
    2581 ///
   2582 /// Darwin only has one TLS scheme which must be capable of dealing with the
   2583 /// fully general situation, in the worst case. This means:
   2584 ///     + "extern __thread" declaration.
   2585 ///     + Defined in a possibly unknown dynamic library.
   2586 ///
   2587 /// The general system is that each __thread variable has a [3 x i64] descriptor
   2588 /// which contains information used by the runtime to calculate the address. The
   2589 /// only part of this the compiler needs to know about is the first xword, which
   2590 /// contains a function pointer that must be called with the address of the
   2591 /// entire descriptor in "x0".
   2592 ///
   2593 /// Since this descriptor may be in a different unit, in general even the
   2594 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
   2595 /// is:
   2596 ///     adrp x0, _var@TLVPPAGE
   2597 ///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
   2598 ///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
   2599 ///                                      ; the function pointer
   2600 ///     blr x1                           ; Uses descriptor address in x0
   2601 ///     ; Address of _var is now in x0.
   2602 ///
   2603 /// If the address of _var's descriptor *is* known to the linker, then it can
   2604 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
   2605 /// a slight efficiency gain.
   2606 SDValue
   2607 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
   2608                                                    SelectionDAG &DAG) const {
   2609   assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
   2610 
   2611   SDLoc DL(Op);
   2612   MVT PtrVT = getPointerTy();
   2613   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   2614 
   2615   SDValue TLVPAddr =
   2616       DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
   2617   SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
   2618 
   2619   // The first entry in the descriptor is a function pointer that we must call
   2620   // to obtain the address of the variable.
   2621   SDValue Chain = DAG.getEntryNode();
   2622   SDValue FuncTLVGet =
   2623       DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
   2624                   false, true, true, 8);
   2625   Chain = FuncTLVGet.getValue(1);
   2626 
   2627   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   2628   MFI->setAdjustsStack(true);
   2629 
   2630   // TLS calls preserve all registers except those that absolutely must be
   2631   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
   2632   // silly).
   2633   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   2634   const AArch64RegisterInfo *ARI =
   2635       static_cast<const AArch64RegisterInfo *>(TRI);
   2636   const uint32_t *Mask = ARI->getTLSCallPreservedMask();
   2637 
   2638   // Finally, we can make the call. This is just a degenerate version of a
    2639   // normal AArch64 call node: x0 takes the address of the descriptor, and the
    2640   // call returns the address of the variable in this thread.
   2641   Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
   2642   Chain =
   2643       DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
   2644                   Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
   2645                   DAG.getRegisterMask(Mask), Chain.getValue(1));
   2646   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
   2647 }
   2648 
   2649 /// When accessing thread-local variables under either the general-dynamic or
   2650 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
   2651 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
   2652 /// is a function pointer to carry out the resolution. This function takes the
   2653 /// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
   2654 /// other registers (except LR, NZCV) are preserved.
   2655 ///
   2656 /// Thus, the ideal call sequence on AArch64 is:
   2657 ///
   2658 ///     adrp x0, :tlsdesc:thread_var
   2659 ///     ldr x8, [x0, :tlsdesc_lo12:thread_var]
   2660 ///     add x0, x0, :tlsdesc_lo12:thread_var
   2661 ///     .tlsdesccall thread_var
   2662 ///     blr x8
   2663 ///     (TPIDR_EL0 offset now in x0).
   2664 ///
   2665 /// The ".tlsdesccall" directive instructs the assembler to insert a particular
   2666 /// relocation to help the linker relax this sequence if it turns out to be too
   2667 /// conservative.
   2668 ///
   2669 /// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
   2670 /// is harmless.
   2671 SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
   2672                                                    SDValue DescAddr, SDLoc DL,
   2673                                                    SelectionDAG &DAG) const {
   2674   EVT PtrVT = getPointerTy();
   2675 
   2676   // The function we need to call is simply the first entry in the GOT for this
    2677   // descriptor; load it in preparation.
   2678   SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
   2679 
   2680   // TLS calls preserve all registers except those that absolutely must be
   2681   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
   2682   // silly).
   2683   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   2684   const AArch64RegisterInfo *ARI =
   2685       static_cast<const AArch64RegisterInfo *>(TRI);
   2686   const uint32_t *Mask = ARI->getTLSCallPreservedMask();
   2687 
   2688   // The function takes only one argument: the address of the descriptor itself
   2689   // in X0.
   2690   SDValue Glue, Chain;
   2691   Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
   2692   Glue = Chain.getValue(1);
   2693 
   2694   // We're now ready to populate the argument list, as with a normal call:
   2695   SmallVector<SDValue, 6> Ops;
   2696   Ops.push_back(Chain);
   2697   Ops.push_back(Func);
   2698   Ops.push_back(SymAddr);
   2699   Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
   2700   Ops.push_back(DAG.getRegisterMask(Mask));
   2701   Ops.push_back(Glue);
   2702 
   2703   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   2704   Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
   2705   Glue = Chain.getValue(1);
   2706 
   2707   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
   2708 }
   2709 
   2710 SDValue
   2711 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
   2712                                                 SelectionDAG &DAG) const {
   2713   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
   2714   assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
   2715          "ELF TLS only supported in small memory model");
   2716   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   2717 
   2718   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
   2719 
   2720   SDValue TPOff;
   2721   EVT PtrVT = getPointerTy();
   2722   SDLoc DL(Op);
   2723   const GlobalValue *GV = GA->getGlobal();
   2724 
   2725   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
   2726 
   2727   if (Model == TLSModel::LocalExec) {
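             // Local-exec: the variable's offset from TPIDR_EL0 is a link-time
             // constant and is materialized directly, roughly (illustrative syntax):
             //     movz x0, #:tprel_g1:var
             //     movk x0, #:tprel_g0_nc:var
             // and is then added to the thread base below.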
   2728     SDValue HiVar = DAG.getTargetGlobalAddress(
   2729         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
   2730     SDValue LoVar = DAG.getTargetGlobalAddress(
   2731         GV, DL, PtrVT, 0,
   2732         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
   2733 
   2734     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
   2735                                        DAG.getTargetConstant(16, MVT::i32)),
   2736                     0);
   2737     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
   2738                                        DAG.getTargetConstant(0, MVT::i32)),
   2739                     0);
   2740   } else if (Model == TLSModel::InitialExec) {
   2741     TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
   2742     TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
   2743   } else if (Model == TLSModel::LocalDynamic) {
    2744     // Local-dynamic accesses proceed in two phases. First, a general-dynamic
    2745     // TLS descriptor call against the special symbol _TLS_MODULE_BASE_
    2746     // calculates the beginning of the module's TLS region; then a DTPREL
    2747     // offset locates the variable within that region.
   2748 
   2749     // These accesses will need deduplicating if there's more than one.
   2750     AArch64FunctionInfo *MFI =
   2751         DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
   2752     MFI->incNumLocalDynamicTLSAccesses();
   2753 
   2754     // Accesses used in this sequence go via the TLS descriptor which lives in
   2755     // the GOT. Prepare an address we can use to handle this.
   2756     SDValue HiDesc = DAG.getTargetExternalSymbol(
   2757         "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
   2758     SDValue LoDesc = DAG.getTargetExternalSymbol(
   2759         "_TLS_MODULE_BASE_", PtrVT,
   2760         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
   2761 
   2762     // First argument to the descriptor call is the address of the descriptor
   2763     // itself.
   2764     SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
   2765     DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
   2766 
   2767     // The call needs a relocation too for linker relaxation. It doesn't make
   2768     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
   2769     // the address.
   2770     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
   2771                                                   AArch64II::MO_TLS);
   2772 
   2773     // Now we can calculate the offset from TPIDR_EL0 to this module's
   2774     // thread-local area.
   2775     TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
   2776 
   2777     // Now use :dtprel_whatever: operations to calculate this variable's offset
   2778     // in its thread-storage area.
   2779     SDValue HiVar = DAG.getTargetGlobalAddress(
   2780         GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
   2781     SDValue LoVar = DAG.getTargetGlobalAddress(
   2782         GV, DL, MVT::i64, 0,
   2783         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
   2784 
   2785     SDValue DTPOff =
   2786         SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
   2787                                    DAG.getTargetConstant(16, MVT::i32)),
   2788                 0);
   2789     DTPOff =
   2790         SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
   2791                                    DAG.getTargetConstant(0, MVT::i32)),
   2792                 0);
   2793 
   2794     TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
   2795   } else if (Model == TLSModel::GeneralDynamic) {
   2796     // Accesses used in this sequence go via the TLS descriptor which lives in
   2797     // the GOT. Prepare an address we can use to handle this.
   2798     SDValue HiDesc = DAG.getTargetGlobalAddress(
   2799         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
   2800     SDValue LoDesc = DAG.getTargetGlobalAddress(
   2801         GV, DL, PtrVT, 0,
   2802         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
   2803 
   2804     // First argument to the descriptor call is the address of the descriptor
   2805     // itself.
   2806     SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
   2807     DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
   2808 
   2809     // The call needs a relocation too for linker relaxation. It doesn't make
   2810     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
   2811     // the address.
   2812     SDValue SymAddr =
   2813         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
   2814 
   2815     // Finally we can make a call to calculate the offset from tpidr_el0.
   2816     TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
   2817   } else
   2818     llvm_unreachable("Unsupported ELF TLS access model");
   2819 
   2820   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
   2821 }
   2822 
   2823 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
   2824                                                      SelectionDAG &DAG) const {
   2825   if (Subtarget->isTargetDarwin())
   2826     return LowerDarwinGlobalTLSAddress(Op, DAG);
   2827   else if (Subtarget->isTargetELF())
   2828     return LowerELFGlobalTLSAddress(Op, DAG);
   2829 
   2830   llvm_unreachable("Unexpected platform trying to use TLS");
   2831 }

    2832 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   2833   SDValue Chain = Op.getOperand(0);
   2834   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
   2835   SDValue LHS = Op.getOperand(2);
   2836   SDValue RHS = Op.getOperand(3);
   2837   SDValue Dest = Op.getOperand(4);
   2838   SDLoc dl(Op);
   2839 
   2840   // Handle f128 first, since lowering it will result in comparing the return
   2841   // value of a libcall against zero, which is just what the rest of LowerBR_CC
   2842   // is expecting to deal with.
   2843   if (LHS.getValueType() == MVT::f128) {
   2844     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
   2845 
   2846     // If softenSetCCOperands returned a scalar, we need to compare the result
   2847     // against zero to select between true and false values.
   2848     if (!RHS.getNode()) {
   2849       RHS = DAG.getConstant(0, LHS.getValueType());
   2850       CC = ISD::SETNE;
   2851     }
   2852   }
   2853 
   2854   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
   2855   // instruction.
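           // For example, a conditional branch on the overflow bit of
           // llvm.sadd.with.overflow can be lowered to an adds followed by b.vs
           // (branch if the V flag is set), rather than materializing the i1
           // overflow value in a register first.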
   2856   unsigned Opc = LHS.getOpcode();
   2857   if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
   2858       cast<ConstantSDNode>(RHS)->isOne() &&
   2859       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
   2860        Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
   2861     assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
   2862            "Unexpected condition code.");
   2863     // Only lower legal XALUO ops.
   2864     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
   2865       return SDValue();
   2866 
   2867     // The actual operation with overflow check.
   2868     AArch64CC::CondCode OFCC;
   2869     SDValue Value, Overflow;
   2870     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
   2871 
   2872     if (CC == ISD::SETNE)
   2873       OFCC = getInvertedCondCode(OFCC);
   2874     SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
   2875 
   2876     return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
   2877                        CCVal, Overflow);
   2878   }
   2879 
   2880   if (LHS.getValueType().isInteger()) {
   2881     assert((LHS.getValueType() == RHS.getValueType()) &&
   2882            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
   2883 
   2884     // If the RHS of the comparison is zero, we can potentially fold this
   2885     // to a specialized branch.
   2886     const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
   2887     if (RHSC && RHSC->getZExtValue() == 0) {
   2888       if (CC == ISD::SETEQ) {
   2889         // See if we can use a TBZ to fold in an AND as well.
   2890         // TBZ has a smaller branch displacement than CBZ.  If the offset is
   2891         // out of bounds, a late MI-layer pass rewrites branches.
   2892         // 403.gcc is an example that hits this case.
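                 // For example, a branch on "(x & 4) == 0" can become
                 // "tbz x, #2, dest", since the mask 4 tests only bit 2.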
   2893         if (LHS.getOpcode() == ISD::AND &&
   2894             isa<ConstantSDNode>(LHS.getOperand(1)) &&
   2895             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
   2896           SDValue Test = LHS.getOperand(0);
   2897           uint64_t Mask = LHS.getConstantOperandVal(1);
   2898 
   2899           // TBZ only operates on i64's, but the ext should be free.
   2900           if (Test.getValueType() == MVT::i32)
   2901             Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
   2902 
   2903           return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
   2904                              DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
   2905         }
   2906 
   2907         return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
   2908       } else if (CC == ISD::SETNE) {
   2909         // See if we can use a TBZ to fold in an AND as well.
   2910         // TBZ has a smaller branch displacement than CBZ.  If the offset is
   2911         // out of bounds, a late MI-layer pass rewrites branches.
   2912         // 403.gcc is an example that hits this case.
   2913         if (LHS.getOpcode() == ISD::AND &&
   2914             isa<ConstantSDNode>(LHS.getOperand(1)) &&
   2915             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
   2916           SDValue Test = LHS.getOperand(0);
   2917           uint64_t Mask = LHS.getConstantOperandVal(1);
   2918 
   2919           // TBNZ only operates on i64's, but the ext should be free.
   2920           if (Test.getValueType() == MVT::i32)
   2921             Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
   2922 
   2923           return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
   2924                              DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
   2925         }
   2926 
   2927         return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
   2928       }
   2929     }
   2930 
   2931     SDValue CCVal;
   2932     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
   2933     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
   2934                        Cmp);
   2935   }
   2936 
   2937   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
   2938 
   2939   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
   2940   // clean.  Some of them require two branches to implement.
   2941   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
   2942   AArch64CC::CondCode CC1, CC2;
   2943   changeFPCCToAArch64CC(CC, CC1, CC2);
   2944   SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
   2945   SDValue BR1 =
   2946       DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
   2947   if (CC2 != AArch64CC::AL) {
   2948     SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
   2949     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
   2950                        Cmp);
   2951   }
   2952 
   2953   return BR1;
   2954 }
   2955 
   2956 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
   2957                                               SelectionDAG &DAG) const {
   2958   EVT VT = Op.getValueType();
   2959   SDLoc DL(Op);
   2960 
   2961   SDValue In1 = Op.getOperand(0);
   2962   SDValue In2 = Op.getOperand(1);
   2963   EVT SrcVT = In2.getValueType();
   2964   if (SrcVT != VT) {
   2965     if (SrcVT == MVT::f32 && VT == MVT::f64)
   2966       In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
   2967     else if (SrcVT == MVT::f64 && VT == MVT::f32)
   2968       In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
   2969     else
   2970       // FIXME: Src type is different, bail out for now. Can VT really be a
   2971       // vector type?
   2972       return SDValue();
   2973   }
   2974 
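           // The strategy below: build a vector whose elements contain only the sign
           // bit, then use the BIT (bitwise insert if true) operation to copy exactly
           // that bit from In2 into In1. Scalar inputs are first inserted into the
           // corresponding vector register.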
   2975   EVT VecVT;
   2976   EVT EltVT;
   2977   SDValue EltMask, VecVal1, VecVal2;
   2978   if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
   2979     EltVT = MVT::i32;
   2980     VecVT = MVT::v4i32;
   2981     EltMask = DAG.getConstant(0x80000000ULL, EltVT);
   2982 
   2983     if (!VT.isVector()) {
   2984       VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
   2985                                           DAG.getUNDEF(VecVT), In1);
   2986       VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
   2987                                           DAG.getUNDEF(VecVT), In2);
   2988     } else {
   2989       VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
   2990       VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
   2991     }
   2992   } else if (VT == MVT::f64 || VT == MVT::v2f64) {
   2993     EltVT = MVT::i64;
   2994     VecVT = MVT::v2i64;
   2995 
    2996     // We want to materialize a mask with the high bit set, but the AdvSIMD
   2997     // immediate moves cannot materialize that in a single instruction for
   2998     // 64-bit elements. Instead, materialize zero and then negate it.
   2999     EltMask = DAG.getConstant(0, EltVT);
   3000 
   3001     if (!VT.isVector()) {
   3002       VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
   3003                                           DAG.getUNDEF(VecVT), In1);
   3004       VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
   3005                                           DAG.getUNDEF(VecVT), In2);
   3006     } else {
   3007       VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
   3008       VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
   3009     }
   3010   } else {
   3011     llvm_unreachable("Invalid type for copysign!");
   3012   }
   3013 
   3014   std::vector<SDValue> BuildVectorOps;
   3015   for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
   3016     BuildVectorOps.push_back(EltMask);
   3017 
   3018   SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
   3019 
   3020   // If we couldn't materialize the mask above, then the mask vector will be
   3021   // the zero vector, and we need to negate it here.
   3022   if (VT == MVT::f64 || VT == MVT::v2f64) {
   3023     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
   3024     BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
   3025     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
   3026   }
   3027 
   3028   SDValue Sel =
   3029       DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
   3030 
   3031   if (VT == MVT::f32)
   3032     return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
   3033   else if (VT == MVT::f64)
   3034     return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
   3035   else
   3036     return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
   3037 }
   3038 
   3039 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
   3040   if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
   3041           AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
   3042     return SDValue();
   3043 
   3044   // While there is no integer popcount instruction, it can
   3045   // be more efficiently lowered to the following sequence that uses
   3046   // AdvSIMD registers/instructions as long as the copies to/from
   3047   // the AdvSIMD registers are cheap.
   3048   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
   3049   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
   3050   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
   3051   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
   3052   SDValue Val = Op.getOperand(0);
   3053   SDLoc DL(Op);
   3054   EVT VT = Op.getValueType();
   3055   SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
   3056 
   3057   SDValue VecVal;
   3058   if (VT == MVT::i32) {
   3059     VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
   3060     VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
   3061                                        VecVal);
   3062   } else {
   3063     VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
   3064   }
   3065 
   3066   SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
   3067   SDValue UaddLV = DAG.getNode(
   3068       ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
   3069       DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
   3070 
   3071   if (VT == MVT::i64)
   3072     UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
   3073   return UaddLV;
   3074 }
   3075 
   3076 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   3077 
   3078   if (Op.getValueType().isVector())
   3079     return LowerVSETCC(Op, DAG);
   3080 
   3081   SDValue LHS = Op.getOperand(0);
   3082   SDValue RHS = Op.getOperand(1);
   3083   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   3084   SDLoc dl(Op);
   3085 
   3086   // We chose ZeroOrOneBooleanContents, so use zero and one.
   3087   EVT VT = Op.getValueType();
   3088   SDValue TVal = DAG.getConstant(1, VT);
   3089   SDValue FVal = DAG.getConstant(0, VT);
   3090 
   3091   // Handle f128 first, since one possible outcome is a normal integer
   3092   // comparison which gets picked up by the next if statement.
   3093   if (LHS.getValueType() == MVT::f128) {
   3094     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
   3095 
   3096     // If softenSetCCOperands returned a scalar, use it.
   3097     if (!RHS.getNode()) {
   3098       assert(LHS.getValueType() == Op.getValueType() &&
   3099              "Unexpected setcc expansion!");
   3100       return LHS;
   3101     }
   3102   }
   3103 
   3104   if (LHS.getValueType().isInteger()) {
   3105     SDValue CCVal;
   3106     SDValue Cmp =
   3107         getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
   3108 
   3109     // Note that we inverted the condition above, so we reverse the order of
   3110     // the true and false operands here.  This will allow the setcc to be
   3111     // matched to a single CSINC instruction.
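             // For example, an i32 "setcc eq" is then typically selected as:
             //     cmp  w0, w1
             //     cset w0, eq        ; alias of "csinc w0, wzr, wzr, ne"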
   3112     return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
   3113   }
   3114 
   3115   // Now we know we're dealing with FP values.
   3116   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
   3117 
    3118   // We need to perform an FCMP + CSEL sequence.  Go ahead and do the
    3119   // comparison.
   3120   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
   3121 
   3122   AArch64CC::CondCode CC1, CC2;
   3123   changeFPCCToAArch64CC(CC, CC1, CC2);
   3124   if (CC2 == AArch64CC::AL) {
   3125     changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
   3126     SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
   3127 
   3128     // Note that we inverted the condition above, so we reverse the order of
   3129     // the true and false operands here.  This will allow the setcc to be
   3130     // matched to a single CSINC instruction.
   3131     return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
   3132   } else {
   3133     // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
   3134     // totally clean.  Some of them require two CSELs to implement.  As is in
   3135     // this case, we emit the first CSEL and then emit a second using the output
   3136     // of the first as the RHS.  We're effectively OR'ing the two CC's together.
   3137 
   3138     // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
   3139     SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
   3140     SDValue CS1 =
   3141         DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
   3142 
   3143     SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
   3144     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
   3145   }
   3146 }
   3147 
   3148 /// A SELECT_CC operation is really some kind of max or min if both values being
   3149 /// compared are, in some sense, equal to the results in either case. However,
   3150 /// it is permissible to compare f32 values and produce directly extended f64
   3151 /// values.
   3152 ///
   3153 /// Extending the comparison operands would also be allowed, but is less likely
   3154 /// to happen in practice since their use is right here. Note that truncate
   3155 /// operations would *not* be semantically equivalent.
   3156 static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
   3157   if (Cmp == Result)
   3158     return true;
   3159 
   3160   ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
   3161   ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
   3162   if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
   3163       Result.getValueType() == MVT::f64) {
   3164     bool Lossy;
   3165     APFloat CmpVal = CCmp->getValueAPF();
   3166     CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
   3167     return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
   3168   }
   3169 
   3170   return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
   3171 }
   3172 
   3173 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
   3174                                            SelectionDAG &DAG) const {
   3175   SDValue CC = Op->getOperand(0);
   3176   SDValue TVal = Op->getOperand(1);
   3177   SDValue FVal = Op->getOperand(2);
   3178   SDLoc DL(Op);
   3179 
   3180   unsigned Opc = CC.getOpcode();
   3181   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
   3182   // instruction.
   3183   if (CC.getResNo() == 1 &&
   3184       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
   3185        Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
   3186     // Only lower legal XALUO ops.
   3187     if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
   3188       return SDValue();
   3189 
   3190     AArch64CC::CondCode OFCC;
   3191     SDValue Value, Overflow;
   3192     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
   3193     SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
   3194 
   3195     return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
   3196                        CCVal, Overflow);
   3197   }
   3198 
   3199   if (CC.getOpcode() == ISD::SETCC)
   3200     return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
   3201                            cast<CondCodeSDNode>(CC.getOperand(2))->get());
   3202   else
   3203     return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
   3204                            FVal, ISD::SETNE);
   3205 }
   3206 
   3207 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
   3208                                               SelectionDAG &DAG) const {
   3209   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   3210   SDValue LHS = Op.getOperand(0);
   3211   SDValue RHS = Op.getOperand(1);
   3212   SDValue TVal = Op.getOperand(2);
   3213   SDValue FVal = Op.getOperand(3);
   3214   SDLoc dl(Op);
   3215 
   3216   // Handle f128 first, because it will result in a comparison of some RTLIB
   3217   // call result against zero.
   3218   if (LHS.getValueType() == MVT::f128) {
   3219     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
   3220 
   3221     // If softenSetCCOperands returned a scalar, we need to compare the result
   3222     // against zero to select between true and false values.
   3223     if (!RHS.getNode()) {
   3224       RHS = DAG.getConstant(0, LHS.getValueType());
   3225       CC = ISD::SETNE;
   3226     }
   3227   }
   3228 
   3229   // Handle integers first.
   3230   if (LHS.getValueType().isInteger()) {
   3231     assert((LHS.getValueType() == RHS.getValueType()) &&
   3232            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
   3233 
   3234     unsigned Opcode = AArch64ISD::CSEL;
   3235 
    3236     // If both the TVal and the FVal are constants, see if we can swap them in
    3237     // order to form a CSINV or CSINC out of them.
   3238     ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
   3239     ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
   3240 
   3241     if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
   3242       std::swap(TVal, FVal);
   3243       std::swap(CTVal, CFVal);
   3244       CC = ISD::getSetCCInverse(CC, true);
   3245     } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
   3246       std::swap(TVal, FVal);
   3247       std::swap(CTVal, CFVal);
   3248       CC = ISD::getSetCCInverse(CC, true);
   3249     } else if (TVal.getOpcode() == ISD::XOR) {
   3250       // If TVal is a NOT we want to swap TVal and FVal so that we can match
   3251       // with a CSINV rather than a CSEL.
   3252       ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
   3253 
   3254       if (CVal && CVal->isAllOnesValue()) {
   3255         std::swap(TVal, FVal);
   3256         std::swap(CTVal, CFVal);
   3257         CC = ISD::getSetCCInverse(CC, true);
   3258       }
   3259     } else if (TVal.getOpcode() == ISD::SUB) {
   3260       // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
   3261       // that we can match with a CSNEG rather than a CSEL.
   3262       ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
   3263 
   3264       if (CVal && CVal->isNullValue()) {
   3265         std::swap(TVal, FVal);
   3266         std::swap(CTVal, CFVal);
   3267         CC = ISD::getSetCCInverse(CC, true);
   3268       }
   3269     } else if (CTVal && CFVal) {
   3270       const int64_t TrueVal = CTVal->getSExtValue();
   3271       const int64_t FalseVal = CFVal->getSExtValue();
   3272       bool Swap = false;
   3273 
   3274       // If both TVal and FVal are constants, see if FVal is the
   3275       // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
   3276       // instead of a CSEL in that case.
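               // As a reminder, the conditional-select family computes:
               //   csel  Xd, Xn, Xm, cc  :  Xd = cc ? Xn :  Xm
               //   csinc Xd, Xn, Xm, cc  :  Xd = cc ? Xn :  Xm + 1
               //   csinv Xd, Xn, Xm, cc  :  Xd = cc ? Xn : ~Xm
               //   csneg Xd, Xn, Xm, cc  :  Xd = cc ? Xn : -Xm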
   3277       if (TrueVal == ~FalseVal) {
   3278         Opcode = AArch64ISD::CSINV;
   3279       } else if (TrueVal == -FalseVal) {
   3280         Opcode = AArch64ISD::CSNEG;
   3281       } else if (TVal.getValueType() == MVT::i32) {
   3282         // If our operands are only 32-bit wide, make sure we use 32-bit
   3283         // arithmetic for the check whether we can use CSINC. This ensures that
   3284         // the addition in the check will wrap around properly in case there is
   3285         // an overflow (which would not be the case if we do the check with
   3286         // 64-bit arithmetic).
   3287         const uint32_t TrueVal32 = CTVal->getZExtValue();
   3288         const uint32_t FalseVal32 = CFVal->getZExtValue();
   3289 
   3290         if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
   3291           Opcode = AArch64ISD::CSINC;
   3292 
   3293           if (TrueVal32 > FalseVal32) {
   3294             Swap = true;
   3295           }
   3296         }
   3297         // 64-bit check whether we can use CSINC.
   3298       } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
   3299         Opcode = AArch64ISD::CSINC;
   3300 
   3301         if (TrueVal > FalseVal) {
   3302           Swap = true;
   3303         }
   3304       }
   3305 
   3306       // Swap TVal and FVal if necessary.
   3307       if (Swap) {
   3308         std::swap(TVal, FVal);
   3309         std::swap(CTVal, CFVal);
   3310         CC = ISD::getSetCCInverse(CC, true);
   3311       }
   3312 
   3313       if (Opcode != AArch64ISD::CSEL) {
   3314         // Drop FVal since we can get its value by simply inverting/negating
   3315         // TVal.
   3316         FVal = TVal;
   3317       }
   3318     }
   3319 
   3320     SDValue CCVal;
   3321     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
   3322 
   3323     EVT VT = Op.getValueType();
   3324     return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
   3325   }
   3326 
   3327   // Now we know we're dealing with FP values.
   3328   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
   3329   assert(LHS.getValueType() == RHS.getValueType());
   3330   EVT VT = Op.getValueType();
   3331 
    3332   // Try to match this select into a max/min operation, which have dedicated
    3333   // opcodes in the instruction set.
   3334   // FIXME: This is not correct in the presence of NaNs, so we only enable this
   3335   // in no-NaNs mode.
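           // For example, "select (setgt x, y), x, y" on f32 can then be selected as
           // a single "fmax s0, s0, s1" instead of an fcmp/fcsel pair.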
   3336   if (getTargetMachine().Options.NoNaNsFPMath) {
   3337     SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
   3338     if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
   3339         selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
   3340       CC = ISD::getSetCCSwappedOperands(CC);
   3341       std::swap(MinMaxLHS, MinMaxRHS);
   3342     }
   3343 
   3344     if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
   3345         selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
   3346       switch (CC) {
   3347       default:
   3348         break;
   3349       case ISD::SETGT:
   3350       case ISD::SETGE:
   3351       case ISD::SETUGT:
   3352       case ISD::SETUGE:
   3353       case ISD::SETOGT:
   3354       case ISD::SETOGE:
   3355         return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
   3356         break;
   3357       case ISD::SETLT:
   3358       case ISD::SETLE:
   3359       case ISD::SETULT:
   3360       case ISD::SETULE:
   3361       case ISD::SETOLT:
   3362       case ISD::SETOLE:
   3363         return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
   3364         break;
   3365       }
   3366     }
   3367   }
   3368 
   3369   // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
   3370   // and do the comparison.
   3371   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
   3372 
   3373   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
   3374   // clean.  Some of them require two CSELs to implement.
   3375   AArch64CC::CondCode CC1, CC2;
   3376   changeFPCCToAArch64CC(CC, CC1, CC2);
   3377   SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
   3378   SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
   3379 
   3380   // If we need a second CSEL, emit it, using the output of the first as the
   3381   // RHS.  We're effectively OR'ing the two CC's together.
   3382   if (CC2 != AArch64CC::AL) {
   3383     SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
   3384     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
   3385   }
   3386 
   3387   // Otherwise, return the output of the first CSEL.
   3388   return CS1;
   3389 }
   3390 
   3391 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
   3392                                               SelectionDAG &DAG) const {
    3393   // Jump table entries are PC-relative offsets. No additional tweaking
    3394   // is necessary here; just get the address of the jump table.
   3395   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   3396   EVT PtrVT = getPointerTy();
   3397   SDLoc DL(Op);
   3398 
   3399   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
   3400       !Subtarget->isTargetMachO()) {
   3401     const unsigned char MO_NC = AArch64II::MO_NC;
   3402     return DAG.getNode(
   3403         AArch64ISD::WrapperLarge, DL, PtrVT,
   3404         DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
   3405         DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
   3406         DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
   3407         DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
   3408                                AArch64II::MO_G0 | MO_NC));
   3409   }
   3410 
   3411   SDValue Hi =
   3412       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
   3413   SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
   3414                                       AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
   3415   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
   3416   return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
   3417 }
   3418 
   3419 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
   3420                                                  SelectionDAG &DAG) const {
   3421   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   3422   EVT PtrVT = getPointerTy();
   3423   SDLoc DL(Op);
   3424 
   3425   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
   3426     // Use the GOT for the large code model on iOS.
   3427     if (Subtarget->isTargetMachO()) {
   3428       SDValue GotAddr = DAG.getTargetConstantPool(
   3429           CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
   3430           AArch64II::MO_GOT);
   3431       return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
   3432     }
   3433 
   3434     const unsigned char MO_NC = AArch64II::MO_NC;
   3435     return DAG.getNode(
   3436         AArch64ISD::WrapperLarge, DL, PtrVT,
   3437         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
   3438                                   CP->getOffset(), AArch64II::MO_G3),
   3439         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
   3440                                   CP->getOffset(), AArch64II::MO_G2 | MO_NC),
   3441         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
   3442                                   CP->getOffset(), AArch64II::MO_G1 | MO_NC),
   3443         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
   3444                                   CP->getOffset(), AArch64II::MO_G0 | MO_NC));
   3445   } else {
   3446     // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
   3447     // ELF, the only valid one on Darwin.
   3448     SDValue Hi =
   3449         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
   3450                                   CP->getOffset(), AArch64II::MO_PAGE);
   3451     SDValue Lo = DAG.getTargetConstantPool(
   3452         CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
   3453         AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
   3454 
   3455     SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
   3456     return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
   3457   }
   3458 }
   3459 
   3460 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
   3461                                                SelectionDAG &DAG) const {
   3462   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   3463   EVT PtrVT = getPointerTy();
   3464   SDLoc DL(Op);
   3465   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
   3466       !Subtarget->isTargetMachO()) {
   3467     const unsigned char MO_NC = AArch64II::MO_NC;
   3468     return DAG.getNode(
   3469         AArch64ISD::WrapperLarge, DL, PtrVT,
   3470         DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
   3471         DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
   3472         DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
   3473         DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
   3474   } else {
   3475     SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
   3476     SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
   3477                                                              AArch64II::MO_NC);
   3478     SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
   3479     return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
   3480   }
   3481 }
   3482 
   3483 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
   3484                                                  SelectionDAG &DAG) const {
   3485   AArch64FunctionInfo *FuncInfo =
   3486       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
   3487 
   3488   SDLoc DL(Op);
   3489   SDValue FR =
   3490       DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
   3491   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   3492   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
   3493                       MachinePointerInfo(SV), false, false, 0);
   3494 }
   3495 
   3496 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
   3497                                                 SelectionDAG &DAG) const {
   3498   // The layout of the va_list struct is specified in the AArch64 Procedure Call
   3499   // Standard, section B.3.
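           // For reference, the structure being initialized is (offsets match the
           // stores below):
           //   struct va_list {
           //     void *__stack;   // offset 0:  next stacked argument
           //     void *__gr_top;  // offset 8:  end of the GP register save area
           //     void *__vr_top;  // offset 16: end of the FP/SIMD register save area
           //     int   __gr_offs; // offset 24: negative offset from __gr_top
           //     int   __vr_offs; // offset 28: negative offset from __vr_top
           //   };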
   3500   MachineFunction &MF = DAG.getMachineFunction();
   3501   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   3502   SDLoc DL(Op);
   3503 
   3504   SDValue Chain = Op.getOperand(0);
   3505   SDValue VAList = Op.getOperand(1);
   3506   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   3507   SmallVector<SDValue, 4> MemOps;
   3508 
   3509   // void *__stack at offset 0
   3510   SDValue Stack =
   3511       DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
   3512   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
   3513                                 MachinePointerInfo(SV), false, false, 8));
   3514 
   3515   // void *__gr_top at offset 8
   3516   int GPRSize = FuncInfo->getVarArgsGPRSize();
   3517   if (GPRSize > 0) {
   3518     SDValue GRTop, GRTopAddr;
   3519 
   3520     GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
   3521                             DAG.getConstant(8, getPointerTy()));
   3522 
   3523     GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
   3524     GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
   3525                         DAG.getConstant(GPRSize, getPointerTy()));
   3526 
   3527     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
   3528                                   MachinePointerInfo(SV, 8), false, false, 8));
   3529   }
   3530 
   3531   // void *__vr_top at offset 16
   3532   int FPRSize = FuncInfo->getVarArgsFPRSize();
   3533   if (FPRSize > 0) {
   3534     SDValue VRTop, VRTopAddr;
   3535     VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
   3536                             DAG.getConstant(16, getPointerTy()));
   3537 
   3538     VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
   3539     VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
   3540                         DAG.getConstant(FPRSize, getPointerTy()));
   3541 
   3542     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
   3543                                   MachinePointerInfo(SV, 16), false, false, 8));
   3544   }
   3545 
   3546   // int __gr_offs at offset 24
   3547   SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
   3548                                    DAG.getConstant(24, getPointerTy()));
   3549   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
   3550                                 GROffsAddr, MachinePointerInfo(SV, 24), false,
   3551                                 false, 4));
   3552 
   3553   // int __vr_offs at offset 28
   3554   SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
   3555                                    DAG.getConstant(28, getPointerTy()));
   3556   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
   3557                                 VROffsAddr, MachinePointerInfo(SV, 28), false,
   3558                                 false, 4));
   3559 
   3560   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
   3561 }
   3562 
   3563 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
   3564                                             SelectionDAG &DAG) const {
   3565   return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
   3566                                      : LowerAAPCS_VASTART(Op, DAG);
   3567 }
   3568 
   3569 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
   3570                                            SelectionDAG &DAG) const {
   3571   // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a
   3572   // single pointer.
   3573   unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
   3574   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   3575   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   3576 
   3577   return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
   3578                        Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
   3579                        8, false, false, MachinePointerInfo(DestSV),
   3580                        MachinePointerInfo(SrcSV));
   3581 }
   3582 
   3583 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   3584   assert(Subtarget->isTargetDarwin() &&
   3585          "automatic va_arg instruction only works on Darwin");
   3586 
   3587   const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   3588   EVT VT = Op.getValueType();
   3589   SDLoc DL(Op);
   3590   SDValue Chain = Op.getOperand(0);
   3591   SDValue Addr = Op.getOperand(1);
   3592   unsigned Align = Op.getConstantOperandVal(3);
   3593 
   3594   SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
   3595                                MachinePointerInfo(V), false, false, false, 0);
   3596   Chain = VAList.getValue(1);
   3597 
   3598   if (Align > 8) {
   3599     assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
   3600     VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
   3601                          DAG.getConstant(Align - 1, getPointerTy()));
   3602     VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
   3603                          DAG.getConstant(-(int64_t)Align, getPointerTy()));
   3604   }
   3605 
   3606   Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
   3607   uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
   3608 
   3609   // Scalar integer and FP values smaller than 64 bits are implicitly extended
   3610   // up to 64 bits.  At the very least, we have to increase the striding of the
   3611   // vaargs list to match this, and for FP values we need to introduce
   3612   // FP_ROUND nodes as well.
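          // For example (a sketch of the cases handled below): va_arg(ap, float)
          // reads an 8-byte slot, so we load an f64 from the va_list and FP_ROUND
          // it to f32, while va_arg(ap, int) still loads an i32 but advances the
          // pointer by 8 bytes.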
   3613   if (VT.isInteger() && !VT.isVector())
   3614     ArgSize = 8;
   3615   bool NeedFPTrunc = false;
   3616   if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
   3617     ArgSize = 8;
   3618     NeedFPTrunc = true;
   3619   }
   3620 
   3621   // Increment the pointer, VAList, to the next vaarg
   3622   SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
   3623                                DAG.getConstant(ArgSize, getPointerTy()));
   3624   // Store the incremented VAList to the legalized pointer
   3625   SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
   3626                                  false, false, 0);
   3627 
   3628   // Load the actual argument out of the pointer VAList
   3629   if (NeedFPTrunc) {
   3630     // Load the value as an f64.
   3631     SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
   3632                                  MachinePointerInfo(), false, false, false, 0);
   3633     // Round the value down to an f32.
   3634     SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
   3635                                    DAG.getIntPtrConstant(1));
   3636     SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
   3637     // Merge the rounded value with the chain output of the load.
   3638     return DAG.getMergeValues(Ops, DL);
   3639   }
   3640 
   3641   return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
   3642                      false, false, 0);
   3643 }
   3644 
   3645 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
   3646                                               SelectionDAG &DAG) const {
   3647   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   3648   MFI->setFrameAddressIsTaken(true);
   3649 
   3650   EVT VT = Op.getValueType();
   3651   SDLoc DL(Op);
   3652   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   3653   SDValue FrameAddr =
   3654       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
   3655   while (Depth--)
   3656     FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
   3657                             MachinePointerInfo(), false, false, false, 0);
   3658   return FrameAddr;
   3659 }
   3660 
   3661 // FIXME? Maybe this could be a TableGen attribute on some registers and
   3662 // this table could be generated automatically from RegInfo.
   3663 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
   3664                                                   EVT VT) const {
   3665   unsigned Reg = StringSwitch<unsigned>(RegName)
   3666                        .Case("sp", AArch64::SP)
   3667                        .Default(0);
   3668   if (Reg)
   3669     return Reg;
   3670   report_fatal_error("Invalid register name global variable");
   3671 }
   3672 
   3673 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
   3674                                                SelectionDAG &DAG) const {
   3675   MachineFunction &MF = DAG.getMachineFunction();
   3676   MachineFrameInfo *MFI = MF.getFrameInfo();
   3677   MFI->setReturnAddressIsTaken(true);
   3678 
   3679   EVT VT = Op.getValueType();
   3680   SDLoc DL(Op);
   3681   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   3682   if (Depth) {
   3683     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
   3684     SDValue Offset = DAG.getConstant(8, getPointerTy());
   3685     return DAG.getLoad(VT, DL, DAG.getEntryNode(),
   3686                        DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
   3687                        MachinePointerInfo(), false, false, false, 0);
   3688   }
   3689 
   3690   // Return LR, which contains the return address. Mark it an implicit live-in.
   3691   unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
   3692   return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
   3693 }
   3694 
   3695 /// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
   3696 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
   3697 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
   3698                                                     SelectionDAG &DAG) const {
   3699   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   3700   EVT VT = Op.getValueType();
   3701   unsigned VTBits = VT.getSizeInBits();
   3702   SDLoc dl(Op);
   3703   SDValue ShOpLo = Op.getOperand(0);
   3704   SDValue ShOpHi = Op.getOperand(1);
   3705   SDValue ShAmt = Op.getOperand(2);
   3706   SDValue ARMcc;
   3707   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
   3708 
   3709   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
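          // A sketch of the selects built below, in C-like pseudo-code ("amt" is
          // the 64-bit shift amount, ">>u" is a logical shift, ">>s" arithmetic):
          //
          //   lo = (amt >= 64) ? (hi >> (amt - 64))              // SRA or SRL
          //                    : ((lo >>u amt) | (hi << (64 - amt)));
          //   hi = (amt >= 64) ? (SRA_PARTS ? (hi >>s 63) : 0)   // sign- or zero-fill
          //                    : (hi >> amt);                    // SRA or SRL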
   3710 
   3711   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
   3712                                  DAG.getConstant(VTBits, MVT::i64), ShAmt);
   3713   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
   3714   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
   3715                                    DAG.getConstant(VTBits, MVT::i64));
   3716   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
   3717 
   3718   SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
   3719                                ISD::SETGE, dl, DAG);
   3720   SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
   3721 
   3722   SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
   3723   SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
   3724   SDValue Lo =
   3725       DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
   3726 
   3727   // AArch64 shifts larger than the register width are wrapped rather than
   3728   // clamped, so we can't just emit "hi >> x".
   3729   SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
   3730   SDValue TrueValHi = Opc == ISD::SRA
   3731                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
   3732                                         DAG.getConstant(VTBits - 1, MVT::i64))
   3733                           : DAG.getConstant(0, VT);
   3734   SDValue Hi =
   3735       DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
   3736 
   3737   SDValue Ops[2] = { Lo, Hi };
   3738   return DAG.getMergeValues(Ops, dl);
   3739 }
   3740 
   3741 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
   3742 /// i64 values and takes a 2 x i64 value to shift plus a shift amount.
   3743 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
   3744                                                  SelectionDAG &DAG) const {
   3745   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   3746   EVT VT = Op.getValueType();
   3747   unsigned VTBits = VT.getSizeInBits();
   3748   SDLoc dl(Op);
   3749   SDValue ShOpLo = Op.getOperand(0);
   3750   SDValue ShOpHi = Op.getOperand(1);
   3751   SDValue ShAmt = Op.getOperand(2);
   3752   SDValue ARMcc;
   3753 
   3754   assert(Op.getOpcode() == ISD::SHL_PARTS);
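          // A sketch of the selects built below, in C-like pseudo-code ("amt" is
          // the 64-bit shift amount, ">>u" is a logical shift):
          //
          //   hi = (amt >= 64) ? (lo << (amt - 64))
          //                    : ((hi << amt) | (lo >>u (64 - amt)));
          //   lo = (amt >= 64) ? 0 : (lo << amt);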
   3755   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
   3756                                  DAG.getConstant(VTBits, MVT::i64), ShAmt);
   3757   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
   3758   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
   3759                                    DAG.getConstant(VTBits, MVT::i64));
   3760   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
   3761   SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
   3762 
   3763   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
   3764 
   3765   SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
   3766                                ISD::SETGE, dl, DAG);
   3767   SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
   3768   SDValue Hi =
   3769       DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
   3770 
   3771   // AArch64 shifts larger than the register width are wrapped rather than
   3772   // clamped, so we can't just emit "lo << a" if a is too big.
   3773   SDValue TrueValLo = DAG.getConstant(0, VT);
   3774   SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
   3775   SDValue Lo =
   3776       DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
   3777 
   3778   SDValue Ops[2] = { Lo, Hi };
   3779   return DAG.getMergeValues(Ops, dl);
   3780 }
   3781 
   3782 bool AArch64TargetLowering::isOffsetFoldingLegal(
   3783     const GlobalAddressSDNode *GA) const {
   3784   // The AArch64 target doesn't support folding offsets into global addresses.
   3785   return false;
   3786 }
   3787 
   3788 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   3789   // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
   3790   // FIXME: We should be able to handle f128 as well with a clever lowering.
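          // For example, 1.0, -2.5 and 0.125 are representable as an 8-bit FMOV
          // immediate (values of the form +/- n/16 * 2^r with 16 <= n <= 31 and
          // -3 <= r <= 4), while 0.1 or 100.0 are not and would need a constant
          // load or an integer materialization instead.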
   3791   if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
   3792     return true;
   3793 
   3794   if (VT == MVT::f64)
   3795     return AArch64_AM::getFP64Imm(Imm) != -1;
   3796   else if (VT == MVT::f32)
   3797     return AArch64_AM::getFP32Imm(Imm) != -1;
   3798   return false;
   3799 }
   3800 
   3801 //===----------------------------------------------------------------------===//
   3802 //                          AArch64 Optimization Hooks
   3803 //===----------------------------------------------------------------------===//
   3804 
   3805 //===----------------------------------------------------------------------===//
   3806 //                          AArch64 Inline Assembly Support
   3807 //===----------------------------------------------------------------------===//
   3808 
   3809 // Table of Constraints
   3810 // TODO: This is the current set of constraints supported by ARM for the
   3811 // compiler; not all of them may make sense, e.g. S may be difficult to support.
   3812 //
   3813 // r - A general register
   3814 // w - An FP/SIMD register of some size in the range v0-v31
   3815 // x - An FP/SIMD register of some size in the range v0-v15
   3816 // I - Constant that can be used with an ADD instruction
   3817 // J - Constant that can be used with a SUB instruction
   3818 // K - Constant that can be used with a 32-bit logical instruction
   3819 // L - Constant that can be used with a 64-bit logical instruction
   3820 // M - Constant that can be used as a 32-bit MOV immediate
   3821 // N - Constant that can be used as a 64-bit MOV immediate
   3822 // Q - A memory reference with base register and no offset
   3823 // S - A symbolic address
   3824 // Y - Floating point constant zero
   3825 // Z - Integer constant zero
   3826 //
   3827 //   Note that general register operands will be output using their 64-bit x
   3828 // register name, whatever the size of the variable, unless the asm operand
   3829 // is prefixed by the %w modifier. Floating-point and SIMD register operands
   3830 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
   3831 // %q modifier.
   3832 
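        // An illustrative example of how a few of these constraints appear in C++
        // inline assembly (the operand names here are arbitrary, not from this file):
        //
        //   int Res;
        //   asm("add %w0, %w1, %w2" : "=r"(Res) : "r"(A), "r"(B));
        //
        // An immediate operand could instead be passed with the "I" constraint,
        // which LowerAsmOperandForConstraint below checks against the legal
        // ADD-immediate range.
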
   3833 /// getConstraintType - Given a constraint letter, return the type of
   3834 /// constraint it is for this target.
   3835 AArch64TargetLowering::ConstraintType
   3836 AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
   3837   if (Constraint.size() == 1) {
   3838     switch (Constraint[0]) {
   3839     default:
   3840       break;
   3841     case 'z':
   3842       return C_Other;
   3843     case 'x':
   3844     case 'w':
   3845       return C_RegisterClass;
   3846     // An address with a single base register. Due to the way we
   3847     // currently handle addresses, it is the same as 'r'.
   3848     case 'Q':
   3849       return C_Memory;
   3850     }
   3851   }
   3852   return TargetLowering::getConstraintType(Constraint);
   3853 }
   3854 
   3855 /// Examine constraint type and operand type and determine a weight value.
   3856 /// This object must already have been set up with the operand type
   3857 /// and the current alternative constraint selected.
   3858 TargetLowering::ConstraintWeight
   3859 AArch64TargetLowering::getSingleConstraintMatchWeight(
   3860     AsmOperandInfo &info, const char *constraint) const {
   3861   ConstraintWeight weight = CW_Invalid;
   3862   Value *CallOperandVal = info.CallOperandVal;
   3863   // If we don't have a value, we can't do a match,
   3864   // but allow it at the lowest weight.
   3865   if (!CallOperandVal)
   3866     return CW_Default;
   3867   Type *type = CallOperandVal->getType();
   3868   // Look at the constraint type.
   3869   switch (*constraint) {
   3870   default:
   3871     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   3872     break;
   3873   case 'x':
   3874   case 'w':
   3875     if (type->isFloatingPointTy() || type->isVectorTy())
   3876       weight = CW_Register;
   3877     break;
   3878   case 'z':
   3879     weight = CW_Constant;
   3880     break;
   3881   }
   3882   return weight;
   3883 }
   3884 
   3885 std::pair<unsigned, const TargetRegisterClass *>
   3886 AArch64TargetLowering::getRegForInlineAsmConstraint(
   3887     const std::string &Constraint, MVT VT) const {
   3888   if (Constraint.size() == 1) {
   3889     switch (Constraint[0]) {
   3890     case 'r':
   3891       if (VT.getSizeInBits() == 64)
   3892         return std::make_pair(0U, &AArch64::GPR64commonRegClass);
   3893       return std::make_pair(0U, &AArch64::GPR32commonRegClass);
   3894     case 'w':
   3895       if (VT == MVT::f32)
   3896         return std::make_pair(0U, &AArch64::FPR32RegClass);
   3897       if (VT.getSizeInBits() == 64)
   3898         return std::make_pair(0U, &AArch64::FPR64RegClass);
   3899       if (VT.getSizeInBits() == 128)
   3900         return std::make_pair(0U, &AArch64::FPR128RegClass);
   3901       break;
   3902     // The instructions that this constraint is designed for can
   3903     // only take 128-bit registers, so just use that regclass.
   3904     case 'x':
   3905       if (VT.getSizeInBits() == 128)
   3906         return std::make_pair(0U, &AArch64::FPR128_loRegClass);
   3907       break;
   3908     }
   3909   }
   3910   if (StringRef("{cc}").equals_lower(Constraint))
   3911     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
   3912 
   3913   // Use the default implementation in TargetLowering to convert the register
   3914   // constraint into a member of a register class.
   3915   std::pair<unsigned, const TargetRegisterClass *> Res;
   3916   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
   3917 
   3918   // Not found as a standard register?
   3919   if (!Res.second) {
   3920     unsigned Size = Constraint.size();
   3921     if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
   3922         tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
   3923       const std::string Reg =
   3924           std::string(&Constraint[2], &Constraint[Size - 1]);
   3925       int RegNo = atoi(Reg.c_str());
   3926       if (RegNo >= 0 && RegNo <= 31) {
   3927         // v0 - v31 are aliases of q0 - q31.
   3928         // By default we'll emit v0-v31 for this unless there's a modifier where
   3929         // we'll emit the correct register as well.
   3930         Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
   3931         Res.second = &AArch64::FPR128RegClass;
   3932       }
   3933     }
   3934   }
   3935 
   3936   return Res;
   3937 }
   3938 
   3939 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
   3940 /// vector.  If it is invalid, don't add anything to Ops.
   3941 void AArch64TargetLowering::LowerAsmOperandForConstraint(
   3942     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
   3943     SelectionDAG &DAG) const {
   3944   SDValue Result;
   3945 
   3946   // Currently only support length 1 constraints.
   3947   if (Constraint.length() != 1)
   3948     return;
   3949 
   3950   char ConstraintLetter = Constraint[0];
   3951   switch (ConstraintLetter) {
   3952   default:
   3953     break;
   3954 
   3955   // This set of constraints deals with valid constants for various instructions.
   3956   // Validate and return a target constant for them if we can.
   3957   case 'z': {
   3958     // 'z' maps to xzr or wzr so it needs an input of 0.
   3959     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
   3960     if (!C || C->getZExtValue() != 0)
   3961       return;
   3962 
   3963     if (Op.getValueType() == MVT::i64)
   3964       Result = DAG.getRegister(AArch64::XZR, MVT::i64);
   3965     else
   3966       Result = DAG.getRegister(AArch64::WZR, MVT::i32);
   3967     break;
   3968   }
   3969 
   3970   case 'I':
   3971   case 'J':
   3972   case 'K':
   3973   case 'L':
   3974   case 'M':
   3975   case 'N':
   3976     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
   3977     if (!C)
   3978       return;
   3979 
   3980     // Grab the value and do some validation.
   3981     uint64_t CVal = C->getZExtValue();
   3982     switch (ConstraintLetter) {
   3983     // The I constraint applies only to simple ADD or SUB immediate operands:
   3984     // i.e. 0 to 4095 with optional shift by 12
   3985     // The J constraint applies only to ADD or SUB immediates that would be
   3986     // valid when negated, i.e. if [an add pattern] were to be output as a SUB
   3987     // instruction [or vice versa], in other words -1 to -4095 with optional
   3988     // left shift by 12.
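            // For example, 4095 and 0x7ff000 satisfy 'I', while -1 and -4095
            // satisfy 'J'.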
   3989     case 'I':
   3990       if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
   3991         break;
   3992       return;
   3993     case 'J': {
   3994       uint64_t NVal = -C->getSExtValue();
   3995       if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
   3996         break;
   3997       return;
   3998     }
   3999     // The K and L constraints apply *only* to logical immediates, including
   4000     // what used to be the MOVI alias for ORR (though the MOVI alias has now
   4001     // been removed and MOV should be used). So these constraints have to
   4002     // distinguish between bit patterns that are valid 32-bit or 64-bit
   4003     // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
   4004     // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
   4005     // versa.
   4006     case 'K':
   4007       if (AArch64_AM::isLogicalImmediate(CVal, 32))
   4008         break;
   4009       return;
   4010     case 'L':
   4011       if (AArch64_AM::isLogicalImmediate(CVal, 64))
   4012         break;
   4013       return;
   4014     // The M and N constraints are a superset of K and L respectively, for use
   4015     // with the MOV (immediate) alias. As well as the logical immediates they
   4016     // also match 32 or 64-bit immediates that can be loaded either using a
   4017     // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
   4018     // (M) or 64-bit 0x1234000000000000 (N) etc.
   4019     // As a note, some of this code is liberally stolen from the asm parser.
   4020     case 'M': {
   4021       if (!isUInt<32>(CVal))
   4022         return;
   4023       if (AArch64_AM::isLogicalImmediate(CVal, 32))
   4024         break;
   4025       if ((CVal & 0xFFFF) == CVal)
   4026         break;
   4027       if ((CVal & 0xFFFF0000ULL) == CVal)
   4028         break;
   4029       uint64_t NCVal = ~(uint32_t)CVal;
   4030       if ((NCVal & 0xFFFFULL) == NCVal)
   4031         break;
   4032       if ((NCVal & 0xFFFF0000ULL) == NCVal)
   4033         break;
   4034       return;
   4035     }
   4036     case 'N': {
   4037       if (AArch64_AM::isLogicalImmediate(CVal, 64))
   4038         break;
   4039       if ((CVal & 0xFFFFULL) == CVal)
   4040         break;
   4041       if ((CVal & 0xFFFF0000ULL) == CVal)
   4042         break;
   4043       if ((CVal & 0xFFFF00000000ULL) == CVal)
   4044         break;
   4045       if ((CVal & 0xFFFF000000000000ULL) == CVal)
   4046         break;
   4047       uint64_t NCVal = ~CVal;
   4048       if ((NCVal & 0xFFFFULL) == NCVal)
   4049         break;
   4050       if ((NCVal & 0xFFFF0000ULL) == NCVal)
   4051         break;
   4052       if ((NCVal & 0xFFFF00000000ULL) == NCVal)
   4053         break;
   4054       if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
   4055         break;
   4056       return;
   4057     }
   4058     default:
   4059       return;
   4060     }
   4061 
   4062     // All assembler immediates are 64-bit integers.
   4063     Result = DAG.getTargetConstant(CVal, MVT::i64);
   4064     break;
   4065   }
   4066 
   4067   if (Result.getNode()) {
   4068     Ops.push_back(Result);
   4069     return;
   4070   }
   4071 
   4072   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   4073 }
   4074 
   4075 //===----------------------------------------------------------------------===//
   4076 //                     AArch64 Advanced SIMD Support
   4077 //===----------------------------------------------------------------------===//
   4078 
   4079 /// WidenVector - Given a value in the V64 register class, produce the
   4080 /// equivalent value in the V128 register class.
   4081 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
   4082   EVT VT = V64Reg.getValueType();
   4083   unsigned NarrowSize = VT.getVectorNumElements();
   4084   MVT EltTy = VT.getVectorElementType().getSimpleVT();
   4085   MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
   4086   SDLoc DL(V64Reg);
   4087 
   4088   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
   4089                      V64Reg, DAG.getConstant(0, MVT::i32));
   4090 }
   4091 
   4092 /// getExtFactor - Determine the adjustment factor for the position when
   4093 /// generating an "extract from vector registers" instruction.
   4094 static unsigned getExtFactor(SDValue &V) {
   4095   EVT EltType = V.getValueType().getVectorElementType();
   4096   return EltType.getSizeInBits() / 8;
   4097 }
   4098 
   4099 /// NarrowVector - Given a value in the V128 register class, produce the
   4100 /// equivalent value in the V64 register class.
   4101 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
   4102   EVT VT = V128Reg.getValueType();
   4103   unsigned WideSize = VT.getVectorNumElements();
   4104   MVT EltTy = VT.getVectorElementType().getSimpleVT();
   4105   MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
   4106   SDLoc DL(V128Reg);
   4107 
   4108   return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
   4109 }
   4110 
   4111 // Gather data to see if the operation can be modelled as a
   4112 // shuffle in combination with VEXTs.
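        // For example (a sketch, not tied to a specific test): a v4i16 BUILD_VECTOR
        // whose operands are lanes 2, 3, 4 and 5 of one v8i16 source is rebuilt
        // below as essentially a single EXT of the two halves of that source.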
   4113 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
   4114                                                   SelectionDAG &DAG) const {
   4115   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
   4116   SDLoc dl(Op);
   4117   EVT VT = Op.getValueType();
   4118   unsigned NumElts = VT.getVectorNumElements();
   4119 
   4120   SmallVector<SDValue, 2> SourceVecs;
   4121   SmallVector<unsigned, 2> MinElts;
   4122   SmallVector<unsigned, 2> MaxElts;
   4123 
   4124   for (unsigned i = 0; i < NumElts; ++i) {
   4125     SDValue V = Op.getOperand(i);
   4126     if (V.getOpcode() == ISD::UNDEF)
   4127       continue;
   4128     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
   4129       // A shuffle can only come from building a vector from various
   4130       // elements of other vectors.
   4131       return SDValue();
   4132     }
   4133 
   4134     // Record this extraction against the appropriate vector if possible...
   4135     SDValue SourceVec = V.getOperand(0);
   4136     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
   4137     bool FoundSource = false;
   4138     for (unsigned j = 0; j < SourceVecs.size(); ++j) {
   4139       if (SourceVecs[j] == SourceVec) {
   4140         if (MinElts[j] > EltNo)
   4141           MinElts[j] = EltNo;
   4142         if (MaxElts[j] < EltNo)
   4143           MaxElts[j] = EltNo;
   4144         FoundSource = true;
   4145         break;
   4146       }
   4147     }
   4148 
   4149     // Or record a new source if not...
   4150     if (!FoundSource) {
   4151       SourceVecs.push_back(SourceVec);
   4152       MinElts.push_back(EltNo);
   4153       MaxElts.push_back(EltNo);
   4154     }
   4155   }
   4156 
   4157   // Currently we only do something sane when at most two source vectors
   4158   // are involved.
   4159   if (SourceVecs.size() > 2)
   4160     return SDValue();
   4161 
   4162   SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
   4163   int VEXTOffsets[2] = { 0, 0 };
   4164   int OffsetMultipliers[2] = { 1, 1 };
   4165 
   4166   // This loop extracts the usage patterns of the source vectors
   4167   // and prepares appropriate SDValues for a shuffle if possible.
   4168   for (unsigned i = 0; i < SourceVecs.size(); ++i) {
   4169     unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
   4170     SDValue CurSource = SourceVecs[i];
   4171     if (SourceVecs[i].getValueType().getVectorElementType() !=
   4172         VT.getVectorElementType()) {
   4173       // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
   4174       // Then bitcast it to the vector which holds asserted element type,
   4175       // and record the multiplier of element width between SourceVecs and
   4176       // Build_vector which is needed to extract the correct lanes later.
   4177       EVT CastVT =
   4178           EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
   4179                            SourceVecs[i].getValueSizeInBits() /
   4180                                VT.getVectorElementType().getSizeInBits());
   4181 
   4182       CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
   4183       OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
   4184       NumSrcElts *= OffsetMultipliers[i];
   4185       MaxElts[i] *= OffsetMultipliers[i];
   4186       MinElts[i] *= OffsetMultipliers[i];
   4187     }
   4188 
   4189     if (CurSource.getValueType() == VT) {
   4190       // No VEXT necessary
   4191       ShuffleSrcs[i] = CurSource;
   4192       VEXTOffsets[i] = 0;
   4193       continue;
   4194     } else if (NumSrcElts < NumElts) {
   4195       // We can pad out the smaller vector for free, so if it's part of a
   4196       // shuffle...
   4197       ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
   4198                                    DAG.getUNDEF(CurSource.getValueType()));
   4199       continue;
   4200     }
   4201 
   4202     // Since only 64-bit and 128-bit vectors are legal on AArch64 and
   4203     // we've eliminated the other cases...
   4204     assert(NumSrcElts == 2 * NumElts &&
   4205            "unexpected vector sizes in ReconstructShuffle");
   4206 
   4207     if (MaxElts[i] - MinElts[i] >= NumElts) {
   4208       // Span too large for a VEXT to cope
   4209       return SDValue();
   4210     }
   4211 
   4212     if (MinElts[i] >= NumElts) {
   4213       // The extraction can just take the second half
   4214       VEXTOffsets[i] = NumElts;
   4215       ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
   4216                                    DAG.getIntPtrConstant(NumElts));
   4217     } else if (MaxElts[i] < NumElts) {
   4218       // The extraction can just take the first half
   4219       VEXTOffsets[i] = 0;
   4220       ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
   4221                                    DAG.getIntPtrConstant(0));
   4222     } else {
   4223       // An actual VEXT is needed
   4224       VEXTOffsets[i] = MinElts[i];
   4225       SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
   4226                                      DAG.getIntPtrConstant(0));
   4227       SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
   4228                                      DAG.getIntPtrConstant(NumElts));
   4229       unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
   4230       ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
   4231                                    DAG.getConstant(Imm, MVT::i32));
   4232     }
   4233   }
   4234 
   4235   SmallVector<int, 8> Mask;
   4236 
   4237   for (unsigned i = 0; i < NumElts; ++i) {
   4238     SDValue Entry = Op.getOperand(i);
   4239     if (Entry.getOpcode() == ISD::UNDEF) {
   4240       Mask.push_back(-1);
   4241       continue;
   4242     }
   4243 
   4244     SDValue ExtractVec = Entry.getOperand(0);
   4245     int ExtractElt =
   4246         cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
   4247     if (ExtractVec == SourceVecs[0]) {
   4248       Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
   4249     } else {
   4250       Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
   4251                      VEXTOffsets[1]);
   4252     }
   4253   }
   4254 
   4255   // Final check before we try to produce nonsense...
   4256   if (isShuffleMaskLegal(Mask, VT))
   4257     return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
   4258                                 &Mask[0]);
   4259 
   4260   return SDValue();
   4261 }
   4262 
   4263 // Check if an EXT instruction can handle the shuffle mask when the
   4264 // vector sources of the shuffle are the same.
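        // E.g. for v4i16, the mask <1, 2, 3, 0> is a singleton EXT with Imm == 1.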
   4265 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
   4266   unsigned NumElts = VT.getVectorNumElements();
   4267 
   4268   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
   4269   if (M[0] < 0)
   4270     return false;
   4271 
   4272   Imm = M[0];
   4273 
   4274   // If this is a VEXT shuffle, the immediate value is the index of the first
   4275   // element.  The other shuffle indices must be the successive elements after
   4276   // the first one.
   4277   unsigned ExpectedElt = Imm;
   4278   for (unsigned i = 1; i < NumElts; ++i) {
   4279     // Increment the expected index.  If it wraps around, just follow it
   4280     // back to index zero and keep going.
   4281     ++ExpectedElt;
   4282     if (ExpectedElt == NumElts)
   4283       ExpectedElt = 0;
   4284 
   4285     if (M[i] < 0)
   4286       continue; // ignore UNDEF indices
   4287     if (ExpectedElt != static_cast<unsigned>(M[i]))
   4288       return false;
   4289   }
   4290 
   4291   return true;
   4292 }
   4293 
   4294 // Check if an EXT instruction can handle the shuffle mask when the
   4295 // vector sources of the shuffle are different.
   4296 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
   4297                       unsigned &Imm) {
   4298   // Look for the first non-undef element.
   4299   const int *FirstRealElt = std::find_if(M.begin(), M.end(),
   4300       [](int Elt) {return Elt >= 0;});
   4301 
   4302   // Benefit from APInt to handle overflow when calculating the expected element.
   4303   unsigned NumElts = VT.getVectorNumElements();
   4304   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
   4305   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
   4306   // The following shuffle indices must be the successive elements after the
   4307   // first real element.
   4308   const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
   4309       [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
   4310   if (FirstWrongElt != M.end())
   4311     return false;
   4312 
   4313   // The index of an EXT is the first element if it is not UNDEF.
   4314   // Watch out for the beginning UNDEFs. The EXT index should be the expected
   4315   // value of the first element.  E.g.
   4316   // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
   4317   // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
   4318   // ExpectedElt is the last mask index plus 1.
   4319   Imm = ExpectedElt.getZExtValue();
   4320 
   4321   // There are two different cases requiring us to reverse the input vectors.
   4322   // For example, for vector <4 x i32> we have the following cases:
   4323   // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
   4324   // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
   4325   // For both cases, we finally use mask <5, 6, 7, 0>, which requires us
   4326   // to reverse the two input vectors.
   4327   if (Imm < NumElts)
   4328     ReverseEXT = true;
   4329   else
   4330     Imm -= NumElts;
   4331 
   4332   return true;
   4333 }
   4334 
   4335 /// isREVMask - Check if a vector shuffle corresponds to a REV
   4336 /// instruction with the specified blocksize.  (The order of the elements
   4337 /// within each block of the vector is reversed.)
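        /// For example, a REV32 of v8i16 corresponds to the mask <1,0,3,2,5,4,7,6>.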
   4338 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
   4339   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
   4340          "Only possible block sizes for REV are: 16, 32, 64");
   4341 
   4342   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
   4343   if (EltSz == 64)
   4344     return false;
   4345 
   4346   unsigned NumElts = VT.getVectorNumElements();
   4347   unsigned BlockElts = M[0] + 1;
   4348   // If the first shuffle index is UNDEF, be optimistic.
   4349   if (M[0] < 0)
   4350     BlockElts = BlockSize / EltSz;
   4351 
   4352   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
   4353     return false;
   4354 
   4355   for (unsigned i = 0; i < NumElts; ++i) {
   4356     if (M[i] < 0)
   4357       continue; // ignore UNDEF indices
   4358     if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
   4359       return false;
   4360   }
   4361 
   4362   return true;
   4363 }
   4364 
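        /// isZIPMask - Check whether the shuffle corresponds to ZIP1/ZIP2, e.g. for
        /// v4i32 the ZIP1 mask is <0,4,1,5> and the ZIP2 mask is <2,6,3,7>.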
   4365 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   4366   unsigned NumElts = VT.getVectorNumElements();
   4367   WhichResult = (M[0] == 0 ? 0 : 1);
   4368   unsigned Idx = WhichResult * NumElts / 2;
   4369   for (unsigned i = 0; i != NumElts; i += 2) {
   4370     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
   4371         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
   4372       return false;
   4373     Idx += 1;
   4374   }
   4375 
   4376   return true;
   4377 }
   4378 
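        /// isUZPMask - Check whether the shuffle corresponds to UZP1/UZP2, e.g. for
        /// v4i32 the UZP1 mask is <0,2,4,6> and the UZP2 mask is <1,3,5,7>.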
   4379 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   4380   unsigned NumElts = VT.getVectorNumElements();
   4381   WhichResult = (M[0] == 0 ? 0 : 1);
   4382   for (unsigned i = 0; i != NumElts; ++i) {
   4383     if (M[i] < 0)
   4384       continue; // ignore UNDEF indices
   4385     if ((unsigned)M[i] != 2 * i + WhichResult)
   4386       return false;
   4387   }
   4388 
   4389   return true;
   4390 }
   4391 
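        /// isTRNMask - Check whether the shuffle corresponds to TRN1/TRN2, e.g. for
        /// v4i32 the TRN1 mask is <0,4,2,6> and the TRN2 mask is <1,5,3,7>.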
   4392 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   4393   unsigned NumElts = VT.getVectorNumElements();
   4394   WhichResult = (M[0] == 0 ? 0 : 1);
   4395   for (unsigned i = 0; i < NumElts; i += 2) {
   4396     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
   4397         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
   4398       return false;
   4399   }
   4400   return true;
   4401 }
   4402 
   4403 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
   4404 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
   4405 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
   4406 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   4407   unsigned NumElts = VT.getVectorNumElements();
   4408   WhichResult = (M[0] == 0 ? 0 : 1);
   4409   unsigned Idx = WhichResult * NumElts / 2;
   4410   for (unsigned i = 0; i != NumElts; i += 2) {
   4411     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
   4412         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
   4413       return false;
   4414     Idx += 1;
   4415   }
   4416 
   4417   return true;
   4418 }
   4419 
   4420 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
   4421 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
   4422 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
   4423 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   4424   unsigned Half = VT.getVectorNumElements() / 2;
   4425   WhichResult = (M[0] == 0 ? 0 : 1);
   4426   for (unsigned j = 0; j != 2; ++j) {
   4427     unsigned Idx = WhichResult;
   4428     for (unsigned i = 0; i != Half; ++i) {
   4429       int MIdx = M[i + j * Half];
   4430       if (MIdx >= 0 && (unsigned)MIdx != Idx)
   4431         return false;
   4432       Idx += 2;
   4433     }
   4434   }
   4435 
   4436   return true;
   4437 }
   4438 
   4439 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
   4440 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
   4441 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
   4442 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   4443   unsigned NumElts = VT.getVectorNumElements();
   4444   WhichResult = (M[0] == 0 ? 0 : 1);
   4445   for (unsigned i = 0; i < NumElts; i += 2) {
   4446     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
   4447         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
   4448       return false;
   4449   }
   4450   return true;
   4451 }
   4452 
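        /// isINSMask - Check whether the shuffle is really an insert of a single
        /// element, e.g. for v4i32 the mask <0,1,6,3> inserts lane 2 of the RHS
        /// into lane 2 of the LHS (DstIsLeft == true, Anomaly == 2).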
   4453 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
   4454                       bool &DstIsLeft, int &Anomaly) {
   4455   if (M.size() != static_cast<size_t>(NumInputElements))
   4456     return false;
   4457 
   4458   int NumLHSMatch = 0, NumRHSMatch = 0;
   4459   int LastLHSMismatch = -1, LastRHSMismatch = -1;
   4460 
   4461   for (int i = 0; i < NumInputElements; ++i) {
   4462     if (M[i] == -1) {
   4463       ++NumLHSMatch;
   4464       ++NumRHSMatch;
   4465       continue;
   4466     }
   4467 
   4468     if (M[i] == i)
   4469       ++NumLHSMatch;
   4470     else
   4471       LastLHSMismatch = i;
   4472 
   4473     if (M[i] == i + NumInputElements)
   4474       ++NumRHSMatch;
   4475     else
   4476       LastRHSMismatch = i;
   4477   }
   4478 
   4479   if (NumLHSMatch == NumInputElements - 1) {
   4480     DstIsLeft = true;
   4481     Anomaly = LastLHSMismatch;
   4482     return true;
   4483   } else if (NumRHSMatch == NumInputElements - 1) {
   4484     DstIsLeft = false;
   4485     Anomaly = LastRHSMismatch;
   4486     return true;
   4487   }
   4488 
   4489   return false;
   4490 }
   4491 
   4492 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
   4493   if (VT.getSizeInBits() != 128)
   4494     return false;
   4495 
   4496   unsigned NumElts = VT.getVectorNumElements();
   4497 
   4498   for (int I = 0, E = NumElts / 2; I != E; I++) {
   4499     if (Mask[I] != I)
   4500       return false;
   4501   }
   4502 
   4503   int Offset = NumElts / 2;
   4504   for (int I = NumElts / 2, E = NumElts; I != E; I++) {
   4505     if (Mask[I] != I + SplitLHS * Offset)
   4506       return false;
   4507   }
   4508 
   4509   return true;
   4510 }
   4511 
   4512 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
   4513   SDLoc DL(Op);
   4514   EVT VT = Op.getValueType();
   4515   SDValue V0 = Op.getOperand(0);
   4516   SDValue V1 = Op.getOperand(1);
   4517   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
   4518 
   4519   if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
   4520       VT.getVectorElementType() != V1.getValueType().getVectorElementType())
   4521     return SDValue();
   4522 
   4523   bool SplitV0 = V0.getValueType().getSizeInBits() == 128;
   4524 
   4525   if (!isConcatMask(Mask, VT, SplitV0))
   4526     return SDValue();
   4527 
   4528   EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
   4529                                 VT.getVectorNumElements() / 2);
   4530   if (SplitV0) {
   4531     V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
   4532                      DAG.getConstant(0, MVT::i64));
   4533   }
   4534   if (V1.getValueType().getSizeInBits() == 128) {
   4535     V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
   4536                      DAG.getConstant(0, MVT::i64));
   4537   }
   4538   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
   4539 }
   4540 
   4541 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
   4542 /// the specified operations to build the shuffle.
   4543 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
   4544                                       SDValue RHS, SelectionDAG &DAG,
   4545                                       SDLoc dl) {
   4546   unsigned OpNum = (PFEntry >> 26) & 0x0F;
   4547   unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
   4548   unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
   4549 
   4550   enum {
   4551     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
   4552     OP_VREV,
   4553     OP_VDUP0,
   4554     OP_VDUP1,
   4555     OP_VDUP2,
   4556     OP_VDUP3,
   4557     OP_VEXT1,
   4558     OP_VEXT2,
   4559     OP_VEXT3,
   4560     OP_VUZPL, // VUZP, left result
   4561     OP_VUZPR, // VUZP, right result
   4562     OP_VZIPL, // VZIP, left result
   4563     OP_VZIPR, // VZIP, right result
   4564     OP_VTRNL, // VTRN, left result
   4565     OP_VTRNR  // VTRN, right result
   4566   };
   4567 
   4568   if (OpNum == OP_COPY) {
   4569     if (LHSID == (1 * 9 + 2) * 9 + 3)
   4570       return LHS;
   4571     assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
   4572     return RHS;
   4573   }
   4574 
   4575   SDValue OpLHS, OpRHS;
   4576   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
   4577   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
   4578   EVT VT = OpLHS.getValueType();
   4579 
   4580   switch (OpNum) {
   4581   default:
   4582     llvm_unreachable("Unknown shuffle opcode!");
   4583   case OP_VREV:
   4584     // VREV divides the vector in half and swaps within the half.
   4585     if (VT.getVectorElementType() == MVT::i32 ||
   4586         VT.getVectorElementType() == MVT::f32)
   4587       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
   4588     // vrev <4 x i16> -> REV32
   4589     if (VT.getVectorElementType() == MVT::i16)
   4590       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
   4591     // vrev <4 x i8> -> REV16
   4592     assert(VT.getVectorElementType() == MVT::i8);
   4593     return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
   4594   case OP_VDUP0:
   4595   case OP_VDUP1:
   4596   case OP_VDUP2:
   4597   case OP_VDUP3: {
   4598     EVT EltTy = VT.getVectorElementType();
   4599     unsigned Opcode;
   4600     if (EltTy == MVT::i8)
   4601       Opcode = AArch64ISD::DUPLANE8;
   4602     else if (EltTy == MVT::i16)
   4603       Opcode = AArch64ISD::DUPLANE16;
   4604     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
   4605       Opcode = AArch64ISD::DUPLANE32;
   4606     else if (EltTy == MVT::i64 || EltTy == MVT::f64)
   4607       Opcode = AArch64ISD::DUPLANE64;
   4608     else
   4609       llvm_unreachable("Invalid vector element type?");
   4610 
   4611     if (VT.getSizeInBits() == 64)
   4612       OpLHS = WidenVector(OpLHS, DAG);
   4613     SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
   4614     return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
   4615   }
   4616   case OP_VEXT1:
   4617   case OP_VEXT2:
   4618   case OP_VEXT3: {
   4619     unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
   4620     return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
   4621                        DAG.getConstant(Imm, MVT::i32));
   4622   }
   4623   case OP_VUZPL:
   4624     return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
   4625                        OpRHS);
   4626   case OP_VUZPR:
   4627     return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
   4628                        OpRHS);
   4629   case OP_VZIPL:
   4630     return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
   4631                        OpRHS);
   4632   case OP_VZIPR:
   4633     return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
   4634                        OpRHS);
   4635   case OP_VTRNL:
   4636     return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
   4637                        OpRHS);
   4638   case OP_VTRNR:
   4639     return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
   4640                        OpRHS);
   4641   }
   4642 }
   4643 
   4644 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
   4645                            SelectionDAG &DAG) {
   4646   // Check to see if we can use the TBL instruction.
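          // For example, a v4i16 shuffle with mask <1,0,7,6> (BytesPerElt == 2)
          // expands to the byte-level TBL mask <2,3,0,1,14,15,12,13>.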
   4647   SDValue V1 = Op.getOperand(0);
   4648   SDValue V2 = Op.getOperand(1);
   4649   SDLoc DL(Op);
   4650 
   4651   EVT EltVT = Op.getValueType().getVectorElementType();
   4652   unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
   4653 
   4654   SmallVector<SDValue, 8> TBLMask;
   4655   for (int Val : ShuffleMask) {
   4656     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
   4657       unsigned Offset = Byte + Val * BytesPerElt;
   4658       TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
   4659     }
   4660   }
   4661 
   4662   MVT IndexVT = MVT::v8i8;
   4663   unsigned IndexLen = 8;
   4664   if (Op.getValueType().getSizeInBits() == 128) {
   4665     IndexVT = MVT::v16i8;
   4666     IndexLen = 16;
   4667   }
   4668 
   4669   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
   4670   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
   4671 
   4672   SDValue Shuffle;
   4673   if (V2.getNode()->getOpcode() == ISD::UNDEF) {
   4674     if (IndexLen == 8)
   4675       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
   4676     Shuffle = DAG.getNode(
   4677         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
   4678         DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
   4679         DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
   4680                     makeArrayRef(TBLMask.data(), IndexLen)));
   4681   } else {
   4682     if (IndexLen == 8) {
   4683       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
   4684       Shuffle = DAG.getNode(
   4685           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
   4686           DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
   4687           DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
   4688                       makeArrayRef(TBLMask.data(), IndexLen)));
   4689     } else {
   4690       // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
   4691       // cannot currently represent the register constraints on the input
   4692       // table registers.
   4693       //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
   4694       //                   DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
   4695       //                               &TBLMask[0], IndexLen));
   4696       Shuffle = DAG.getNode(
   4697           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
   4698           DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
   4699           DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
   4700                       makeArrayRef(TBLMask.data(), IndexLen)));
   4701     }
   4702   }
   4703   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
   4704 }
   4705 
   4706 static unsigned getDUPLANEOp(EVT EltType) {
   4707   if (EltType == MVT::i8)
   4708     return AArch64ISD::DUPLANE8;
   4709   if (EltType == MVT::i16)
   4710     return AArch64ISD::DUPLANE16;
   4711   if (EltType == MVT::i32 || EltType == MVT::f32)
   4712     return AArch64ISD::DUPLANE32;
   4713   if (EltType == MVT::i64 || EltType == MVT::f64)
   4714     return AArch64ISD::DUPLANE64;
   4715 
   4716   llvm_unreachable("Invalid vector element type?");
   4717 }
   4718 
   4719 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   4720                                                    SelectionDAG &DAG) const {
   4721   SDLoc dl(Op);
   4722   EVT VT = Op.getValueType();
   4723 
   4724   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
   4725 
   4726   // Convert shuffles that are directly supported on NEON to target-specific
   4727   // DAG nodes, instead of keeping them as shuffles and matching them again
   4728   // during code selection.  This is more efficient and avoids the possibility
   4729   // of inconsistencies between legalization and selection.
   4730   ArrayRef<int> ShuffleMask = SVN->getMask();
   4731 
   4732   SDValue V1 = Op.getOperand(0);
   4733   SDValue V2 = Op.getOperand(1);
   4734 
   4735   if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
   4736                                        V1.getValueType().getSimpleVT())) {
   4737     int Lane = SVN->getSplatIndex();
   4738     // If this is an undef splat, generate it via "just" a vdup, if possible.
   4739     if (Lane == -1)
   4740       Lane = 0;
   4741 
   4742     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
   4743       return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
   4744                          V1.getOperand(0));
   4745     // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
   4746     // constant. If so, we can just reference the lane's definition directly.
   4747     if (V1.getOpcode() == ISD::BUILD_VECTOR &&
   4748         !isa<ConstantSDNode>(V1.getOperand(Lane)))
   4749       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
   4750 
   4751     // Otherwise, duplicate from the lane of the input vector.
   4752     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
   4753 
   4754     // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
   4755     // to make a vector of the same size as this SHUFFLE. We can ignore the
   4756     // extract entirely, and canonicalise the concat using WidenVector.
   4757     if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
   4758       Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
   4759       V1 = V1.getOperand(0);
   4760     } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
   4761       unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
   4762       Lane -= Idx * VT.getVectorNumElements() / 2;
   4763       V1 = WidenVector(V1.getOperand(Idx), DAG);
   4764     } else if (VT.getSizeInBits() == 64)
   4765       V1 = WidenVector(V1, DAG);
   4766 
   4767     return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
   4768   }
   4769 
   4770   if (isREVMask(ShuffleMask, VT, 64))
   4771     return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
   4772   if (isREVMask(ShuffleMask, VT, 32))
   4773     return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
   4774   if (isREVMask(ShuffleMask, VT, 16))
   4775     return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
   4776 
   4777   bool ReverseEXT = false;
   4778   unsigned Imm;
   4779   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
   4780     if (ReverseEXT)
   4781       std::swap(V1, V2);
   4782     Imm *= getExtFactor(V1);
   4783     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
   4784                        DAG.getConstant(Imm, MVT::i32));
   4785   } else if (V2->getOpcode() == ISD::UNDEF &&
   4786              isSingletonEXTMask(ShuffleMask, VT, Imm)) {
   4787     Imm *= getExtFactor(V1);
   4788     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
   4789                        DAG.getConstant(Imm, MVT::i32));
   4790   }
   4791 
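           // ZIP, UZP and TRN each come in a '1' form (low halves / even elements)
           // and a '2' form (high halves / odd elements); WhichResult selects
           // between them.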
   4792   unsigned WhichResult;
   4793   if (isZIPMask(ShuffleMask, VT, WhichResult)) {
   4794     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
   4795     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   4796   }
   4797   if (isUZPMask(ShuffleMask, VT, WhichResult)) {
   4798     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
   4799     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   4800   }
   4801   if (isTRNMask(ShuffleMask, VT, WhichResult)) {
   4802     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
   4803     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   4804   }
   4805 
   4806   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
   4807     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
   4808     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
   4809   }
   4810   if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
   4811     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
   4812     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
   4813   }
   4814   if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
   4815     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
   4816     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
   4817   }
   4818 
   4819   SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
   4820   if (Concat.getNode())
   4821     return Concat;
   4822 
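           // An INS mask is an identity shuffle of one input with a single anomalous
           // lane taken from elsewhere; lower it as an extract of the source lane
           // followed by an insert into the anomalous lane of the identity side.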
   4823   bool DstIsLeft;
   4824   int Anomaly;
   4825   int NumInputElements = V1.getValueType().getVectorNumElements();
   4826   if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
   4827     SDValue DstVec = DstIsLeft ? V1 : V2;
   4828     SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64);
   4829 
   4830     SDValue SrcVec = V1;
   4831     int SrcLane = ShuffleMask[Anomaly];
   4832     if (SrcLane >= NumInputElements) {
   4833       SrcVec = V2;
   4834       SrcLane -= VT.getVectorNumElements();
   4835     }
   4836     SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
   4837 
   4838     EVT ScalarVT = VT.getVectorElementType();
   4839     if (ScalarVT.getSizeInBits() < 32)
   4840       ScalarVT = MVT::i32;
   4841 
   4842     return DAG.getNode(
   4843         ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
   4844         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
   4845         DstLaneV);
   4846   }
   4847 
   4848   // If the shuffle is not directly supported and it has 4 elements, use
   4849   // the PerfectShuffle-generated table to synthesize it from other shuffles.
   4850   unsigned NumElts = VT.getVectorNumElements();
   4851   if (NumElts == 4) {
   4852     unsigned PFIndexes[4];
   4853     for (unsigned i = 0; i != 4; ++i) {
   4854       if (ShuffleMask[i] < 0)
   4855         PFIndexes[i] = 8;
   4856       else
   4857         PFIndexes[i] = ShuffleMask[i];
   4858     }
   4859 
   4860     // Compute the index in the perfect shuffle table.
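             // Each lane index is a base-9 digit (8 encodes an undef lane), so e.g.
             // the mask <0,1,4,5> maps to entry 0*729 + 1*81 + 4*9 + 5 == 122.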
   4861     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
   4862                             PFIndexes[2] * 9 + PFIndexes[3];
   4863     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
   4864     unsigned Cost = (PFEntry >> 30);
   4865 
   4866     if (Cost <= 4)
   4867       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
   4868   }
   4869 
   4870   return GenerateTBL(Op, ShuffleMask, DAG);
   4871 }
   4872 
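         // Flatten a constant-splat BUILD_VECTOR into full-vector-width bit patterns:
         // CnstBits is the replicated splat value, and UndefBits is that value with
         // its undef bits flipped, so callers can retry immediate matching with the
         // undef lanes toggled.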
   4873 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
   4874                                APInt &UndefBits) {
   4875   EVT VT = BVN->getValueType(0);
   4876   APInt SplatBits, SplatUndef;
   4877   unsigned SplatBitSize;
   4878   bool HasAnyUndefs;
   4879   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
   4880     unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
   4881 
   4882     for (unsigned i = 0; i < NumSplats; ++i) {
   4883       CnstBits <<= SplatBitSize;
   4884       UndefBits <<= SplatBitSize;
   4885       CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
   4886       UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
   4887     }
   4888 
   4889     return true;
   4890   }
   4891 
   4892   return false;
   4893 }
   4894 
   4895 SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
   4896                                               SelectionDAG &DAG) const {
   4897   BuildVectorSDNode *BVN =
   4898       dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
   4899   SDValue LHS = Op.getOperand(0);
   4900   SDLoc dl(Op);
   4901   EVT VT = Op.getValueType();
   4902 
   4903   if (!BVN)
   4904     return Op;
   4905 
   4906   APInt CnstBits(VT.getSizeInBits(), 0);
   4907   APInt UndefBits(VT.getSizeInBits(), 0);
   4908   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
    4909     // We only have a BIC vector-immediate instruction, which is an and-not.
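             // e.g. (and X, <0xffffff00, ...>) becomes (BICi X, #0xff, LSL #0) once
             // the complemented constant matches a modified-immediate form.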
   4910     CnstBits = ~CnstBits;
   4911 
   4912     // We make use of a little bit of goto ickiness in order to avoid having to
   4913     // duplicate the immediate matching logic for the undef toggled case.
   4914     bool SecondTry = false;
   4915   AttemptModImm:
   4916 
   4917     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
   4918       CnstBits = CnstBits.zextOrTrunc(64);
   4919       uint64_t CnstVal = CnstBits.getZExtValue();
   4920 
   4921       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
   4922         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
   4923         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   4924         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
   4925                                   DAG.getConstant(CnstVal, MVT::i32),
   4926                                   DAG.getConstant(0, MVT::i32));
   4927         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   4928       }
   4929 
   4930       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
   4931         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
   4932         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   4933         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
   4934                                   DAG.getConstant(CnstVal, MVT::i32),
   4935                                   DAG.getConstant(8, MVT::i32));
   4936         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   4937       }
   4938 
   4939       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
   4940         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
   4941         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   4942         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
   4943                                   DAG.getConstant(CnstVal, MVT::i32),
   4944                                   DAG.getConstant(16, MVT::i32));
   4945         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   4946       }
   4947 
   4948       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
   4949         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
   4950         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   4951         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
   4952                                   DAG.getConstant(CnstVal, MVT::i32),
   4953                                   DAG.getConstant(24, MVT::i32));
   4954         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   4955       }
   4956 
   4957       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
   4958         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
   4959         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
   4960         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
   4961                                   DAG.getConstant(CnstVal, MVT::i32),
   4962                                   DAG.getConstant(0, MVT::i32));
   4963         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   4964       }
   4965 
   4966       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
   4967         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
   4968         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
   4969         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
   4970                                   DAG.getConstant(CnstVal, MVT::i32),
   4971                                   DAG.getConstant(8, MVT::i32));
   4972         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   4973       }
   4974     }
   4975 
   4976     if (SecondTry)
   4977       goto FailedModImm;
   4978     SecondTry = true;
   4979     CnstBits = ~UndefBits;
   4980     goto AttemptModImm;
   4981   }
   4982 
   4983 // We can always fall back to a non-immediate AND.
   4984 FailedModImm:
   4985   return Op;
   4986 }
   4987 
    4988 // Specialized code to quickly check whether PotentialBVec is a BuildVector
    4989 // whose elements are all the same constant integer; if so, that value is
    4990 // returned in the reference argument ConstVal.
   4991 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
   4992                                      uint64_t &ConstVal) {
   4993   BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
   4994   if (!Bvec)
   4995     return false;
   4996   ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
   4997   if (!FirstElt)
   4998     return false;
   4999   EVT VT = Bvec->getValueType(0);
   5000   unsigned NumElts = VT.getVectorNumElements();
   5001   for (unsigned i = 1; i < NumElts; ++i)
   5002     if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
   5003       return false;
   5004   ConstVal = FirstElt->getZExtValue();
   5005   return true;
   5006 }
   5007 
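         // Return the intrinsic ID of an INTRINSIC_WO_CHAIN node, or
         // Intrinsic::not_intrinsic for any other node.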
   5008 static unsigned getIntrinsicID(const SDNode *N) {
   5009   unsigned Opcode = N->getOpcode();
   5010   switch (Opcode) {
   5011   default:
   5012     return Intrinsic::not_intrinsic;
   5013   case ISD::INTRINSIC_WO_CHAIN: {
   5014     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
   5015     if (IID < Intrinsic::num_intrinsics)
   5016       return IID;
   5017     return Intrinsic::not_intrinsic;
   5018   }
   5019   }
   5020 }
   5021 
   5022 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
   5023 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
    5024 // BUILD_VECTOR with constant element C1, C2 is a constant, and C1 == ~C2.
   5025 // Also, logical shift right -> sri, with the same structure.
   5026 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   5027   EVT VT = N->getValueType(0);
   5028 
   5029   if (!VT.isVector())
   5030     return SDValue();
   5031 
   5032   SDLoc DL(N);
   5033 
   5034   // Is the first op an AND?
   5035   const SDValue And = N->getOperand(0);
   5036   if (And.getOpcode() != ISD::AND)
   5037     return SDValue();
   5038 
   5039   // Is the second op an shl or lshr?
   5040   SDValue Shift = N->getOperand(1);
   5041   // This will have been turned into: AArch64ISD::VSHL vector, #shift
   5042   // or AArch64ISD::VLSHR vector, #shift
   5043   unsigned ShiftOpc = Shift.getOpcode();
   5044   if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
   5045     return SDValue();
   5046   bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
   5047 
   5048   // Is the shift amount constant?
   5049   ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
   5050   if (!C2node)
   5051     return SDValue();
   5052 
   5053   // Is the and mask vector all constant?
   5054   uint64_t C1;
   5055   if (!isAllConstantBuildVector(And.getOperand(1), C1))
   5056     return SDValue();
   5057 
   5058   // Is C1 == ~C2, taking into account how much one can shift elements of a
   5059   // particular size?
   5060   uint64_t C2 = C2node->getZExtValue();
   5061   unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
   5062   if (C2 > ElemSizeInBits)
   5063     return SDValue();
    5064   uint64_t ElemMask = ElemSizeInBits == 64 ? ~0ULL : (1ULL << ElemSizeInBits) - 1;
   5065   if ((C1 & ElemMask) != (~C2 & ElemMask))
   5066     return SDValue();
   5067 
   5068   SDValue X = And.getOperand(0);
   5069   SDValue Y = Shift.getOperand(0);
   5070 
   5071   unsigned Intrin =
   5072       IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
   5073   SDValue ResultSLI =
   5074       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
   5075                   DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
   5076 
   5077   DEBUG(dbgs() << "aarch64-lower: transformed: \n");
   5078   DEBUG(N->dump(&DAG));
   5079   DEBUG(dbgs() << "into: \n");
   5080   DEBUG(ResultSLI->dump(&DAG));
   5081 
   5082   ++NumShiftInserts;
   5083   return ResultSLI;
   5084 }
   5085 
   5086 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
   5087                                              SelectionDAG &DAG) const {
   5088   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
   5089   if (EnableAArch64SlrGeneration) {
   5090     SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
   5091     if (Res.getNode())
   5092       return Res;
   5093   }
   5094 
   5095   BuildVectorSDNode *BVN =
   5096       dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
   5097   SDValue LHS = Op.getOperand(1);
   5098   SDLoc dl(Op);
   5099   EVT VT = Op.getValueType();
   5100 
   5101   // OR commutes, so try swapping the operands.
   5102   if (!BVN) {
   5103     LHS = Op.getOperand(0);
   5104     BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
   5105   }
   5106   if (!BVN)
   5107     return Op;
   5108 
   5109   APInt CnstBits(VT.getSizeInBits(), 0);
   5110   APInt UndefBits(VT.getSizeInBits(), 0);
   5111   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
   5112     // We make use of a little bit of goto ickiness in order to avoid having to
   5113     // duplicate the immediate matching logic for the undef toggled case.
   5114     bool SecondTry = false;
   5115   AttemptModImm:
   5116 
   5117     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
   5118       CnstBits = CnstBits.zextOrTrunc(64);
   5119       uint64_t CnstVal = CnstBits.getZExtValue();
   5120 
   5121       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
   5122         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
   5123         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5124         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
   5125                                   DAG.getConstant(CnstVal, MVT::i32),
   5126                                   DAG.getConstant(0, MVT::i32));
   5127         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5128       }
   5129 
   5130       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
   5131         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
   5132         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5133         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
   5134                                   DAG.getConstant(CnstVal, MVT::i32),
   5135                                   DAG.getConstant(8, MVT::i32));
   5136         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5137       }
   5138 
   5139       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
   5140         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
   5141         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5142         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
   5143                                   DAG.getConstant(CnstVal, MVT::i32),
   5144                                   DAG.getConstant(16, MVT::i32));
   5145         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5146       }
   5147 
   5148       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
   5149         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
   5150         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5151         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
   5152                                   DAG.getConstant(CnstVal, MVT::i32),
   5153                                   DAG.getConstant(24, MVT::i32));
   5154         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5155       }
   5156 
   5157       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
   5158         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
   5159         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
   5160         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
   5161                                   DAG.getConstant(CnstVal, MVT::i32),
   5162                                   DAG.getConstant(0, MVT::i32));
   5163         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5164       }
   5165 
   5166       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
   5167         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
   5168         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
   5169         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
   5170                                   DAG.getConstant(CnstVal, MVT::i32),
   5171                                   DAG.getConstant(8, MVT::i32));
   5172         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5173       }
   5174     }
   5175 
   5176     if (SecondTry)
   5177       goto FailedModImm;
   5178     SecondTry = true;
   5179     CnstBits = UndefBits;
   5180     goto AttemptModImm;
   5181   }
   5182 
   5183 // We can always fall back to a non-immediate OR.
   5184 FailedModImm:
   5185   return Op;
   5186 }
   5187 
   5188 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
    5189 // be truncated to fit the element width.
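         // For example, a v8i8 BUILD_VECTOR whose lanes are the i32 constant 0x1ff is
         // rewritten with each lane as the i32 constant 0xff.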
   5190 static SDValue NormalizeBuildVector(SDValue Op,
   5191                                     SelectionDAG &DAG) {
   5192   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
   5193   SDLoc dl(Op);
   5194   EVT VT = Op.getValueType();
    5195   EVT EltTy = VT.getVectorElementType();
   5196 
   5197   if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
   5198     return Op;
   5199 
   5200   SmallVector<SDValue, 16> Ops;
   5201   for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
   5202     SDValue Lane = Op.getOperand(I);
   5203     if (Lane.getOpcode() == ISD::Constant) {
   5204       APInt LowBits(EltTy.getSizeInBits(),
   5205                     cast<ConstantSDNode>(Lane)->getZExtValue());
   5206       Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32);
   5207     }
   5208     Ops.push_back(Lane);
   5209   }
   5210   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   5211 }
   5212 
   5213 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
   5214                                                  SelectionDAG &DAG) const {
   5215   SDLoc dl(Op);
   5216   EVT VT = Op.getValueType();
   5217   Op = NormalizeBuildVector(Op, DAG);
   5218   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
   5219 
   5220   APInt CnstBits(VT.getSizeInBits(), 0);
   5221   APInt UndefBits(VT.getSizeInBits(), 0);
   5222   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
   5223     // We make use of a little bit of goto ickiness in order to avoid having to
   5224     // duplicate the immediate matching logic for the undef toggled case.
   5225     bool SecondTry = false;
   5226   AttemptModImm:
   5227 
   5228     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
   5229       CnstBits = CnstBits.zextOrTrunc(64);
   5230       uint64_t CnstVal = CnstBits.getZExtValue();
   5231 
   5232       // Certain magic vector constants (used to express things like NOT
   5233       // and NEG) are passed through unmodified.  This allows codegen patterns
   5234       // for these operations to match.  Special-purpose patterns will lower
   5235       // these immediates to MOVIs if it proves necessary.
   5236       if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
   5237         return Op;
   5238 
   5239       // The many faces of MOVI...
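               // Type 10 is the 64-bit per-byte 0x00/0xff mask; types 1-4 place a
               // byte at byte position 0/1/2/3 of a 32-bit element (the LSL
               // #0/#8/#16/#24 operands below); types 5-6 do the same for a 16-bit
               // element; types 7-8 are the MSL ("shift ones") forms; type 9 is a
               // plain 8-bit splat.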
   5240       if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
   5241         CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
   5242         if (VT.getSizeInBits() == 128) {
   5243           SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
   5244                                     DAG.getConstant(CnstVal, MVT::i32));
   5245           return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5246         }
   5247 
   5248         // Support the V64 version via subregister insertion.
   5249         SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
   5250                                   DAG.getConstant(CnstVal, MVT::i32));
   5251         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5252       }
   5253 
   5254       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
   5255         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
   5256         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5257         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
   5258                                   DAG.getConstant(CnstVal, MVT::i32),
   5259                                   DAG.getConstant(0, MVT::i32));
   5260         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5261       }
   5262 
   5263       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
   5264         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
   5265         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5266         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
   5267                                   DAG.getConstant(CnstVal, MVT::i32),
   5268                                   DAG.getConstant(8, MVT::i32));
   5269         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5270       }
   5271 
   5272       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
   5273         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
   5274         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5275         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
   5276                                   DAG.getConstant(CnstVal, MVT::i32),
   5277                                   DAG.getConstant(16, MVT::i32));
   5278         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5279       }
   5280 
   5281       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
   5282         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
   5283         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5284         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
   5285                                   DAG.getConstant(CnstVal, MVT::i32),
   5286                                   DAG.getConstant(24, MVT::i32));
   5287         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5288       }
   5289 
   5290       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
   5291         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
   5292         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
   5293         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
   5294                                   DAG.getConstant(CnstVal, MVT::i32),
   5295                                   DAG.getConstant(0, MVT::i32));
   5296         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5297       }
   5298 
   5299       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
   5300         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
   5301         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
   5302         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
   5303                                   DAG.getConstant(CnstVal, MVT::i32),
   5304                                   DAG.getConstant(8, MVT::i32));
   5305         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5306       }
   5307 
   5308       if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
   5309         CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
   5310         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5311         SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
   5312                                   DAG.getConstant(CnstVal, MVT::i32),
   5313                                   DAG.getConstant(264, MVT::i32));
   5314         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5315       }
   5316 
   5317       if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
   5318         CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
   5319         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5320         SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
   5321                                   DAG.getConstant(CnstVal, MVT::i32),
   5322                                   DAG.getConstant(272, MVT::i32));
   5323         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5324       }
   5325 
   5326       if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
   5327         CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
   5328         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
   5329         SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
   5330                                   DAG.getConstant(CnstVal, MVT::i32));
   5331         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5332       }
   5333 
   5334       // The few faces of FMOV...
   5335       if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
   5336         CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
   5337         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
   5338         SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
   5339                                   DAG.getConstant(CnstVal, MVT::i32));
   5340         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5341       }
   5342 
   5343       if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
   5344           VT.getSizeInBits() == 128) {
   5345         CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
   5346         SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
   5347                                   DAG.getConstant(CnstVal, MVT::i32));
   5348         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5349       }
   5350 
   5351       // The many faces of MVNI...
   5352       CnstVal = ~CnstVal;
   5353       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
   5354         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
   5355         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5356         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
   5357                                   DAG.getConstant(CnstVal, MVT::i32),
   5358                                   DAG.getConstant(0, MVT::i32));
   5359         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5360       }
   5361 
   5362       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
   5363         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
   5364         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5365         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
   5366                                   DAG.getConstant(CnstVal, MVT::i32),
   5367                                   DAG.getConstant(8, MVT::i32));
   5368         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5369       }
   5370 
   5371       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
   5372         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
   5373         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5374         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
   5375                                   DAG.getConstant(CnstVal, MVT::i32),
   5376                                   DAG.getConstant(16, MVT::i32));
   5377         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5378       }
   5379 
   5380       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
   5381         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
   5382         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5383         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
   5384                                   DAG.getConstant(CnstVal, MVT::i32),
   5385                                   DAG.getConstant(24, MVT::i32));
   5386         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5387       }
   5388 
   5389       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
   5390         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
   5391         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
   5392         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
   5393                                   DAG.getConstant(CnstVal, MVT::i32),
   5394                                   DAG.getConstant(0, MVT::i32));
   5395         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5396       }
   5397 
   5398       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
   5399         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
   5400         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
   5401         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
   5402                                   DAG.getConstant(CnstVal, MVT::i32),
   5403                                   DAG.getConstant(8, MVT::i32));
   5404         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5405       }
   5406 
   5407       if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
   5408         CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
   5409         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5410         SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
   5411                                   DAG.getConstant(CnstVal, MVT::i32),
   5412                                   DAG.getConstant(264, MVT::i32));
   5413         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5414       }
   5415 
   5416       if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
   5417         CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
   5418         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
   5419         SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
   5420                                   DAG.getConstant(CnstVal, MVT::i32),
   5421                                   DAG.getConstant(272, MVT::i32));
   5422         return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
   5423       }
   5424     }
   5425 
   5426     if (SecondTry)
   5427       goto FailedModImm;
   5428     SecondTry = true;
   5429     CnstBits = UndefBits;
   5430     goto AttemptModImm;
   5431   }
   5432 FailedModImm:
   5433 
   5434   // Scan through the operands to find some interesting properties we can
   5435   // exploit:
   5436   //   1) If only one value is used, we can use a DUP, or
   5437   //   2) if only the low element is not undef, we can just insert that, or
   5438   //   3) if only one constant value is used (w/ some non-constant lanes),
   5439   //      we can splat the constant value into the whole vector then fill
   5440   //      in the non-constant lanes.
   5441   //   4) FIXME: If different constant values are used, but we can intelligently
   5442   //             select the values we'll be overwriting for the non-constant
   5443   //             lanes such that we can directly materialize the vector
   5444   //             some other way (MOVI, e.g.), we can be sneaky.
   5445   unsigned NumElts = VT.getVectorNumElements();
   5446   bool isOnlyLowElement = true;
   5447   bool usesOnlyOneValue = true;
   5448   bool usesOnlyOneConstantValue = true;
   5449   bool isConstant = true;
   5450   unsigned NumConstantLanes = 0;
   5451   SDValue Value;
   5452   SDValue ConstantValue;
   5453   for (unsigned i = 0; i < NumElts; ++i) {
   5454     SDValue V = Op.getOperand(i);
   5455     if (V.getOpcode() == ISD::UNDEF)
   5456       continue;
   5457     if (i > 0)
   5458       isOnlyLowElement = false;
   5459     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
   5460       isConstant = false;
   5461 
   5462     if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
   5463       ++NumConstantLanes;
   5464       if (!ConstantValue.getNode())
   5465         ConstantValue = V;
   5466       else if (ConstantValue != V)
   5467         usesOnlyOneConstantValue = false;
   5468     }
   5469 
   5470     if (!Value.getNode())
   5471       Value = V;
   5472     else if (V != Value)
   5473       usesOnlyOneValue = false;
   5474   }
   5475 
   5476   if (!Value.getNode())
   5477     return DAG.getUNDEF(VT);
   5478 
   5479   if (isOnlyLowElement)
   5480     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
   5481 
    5482   // Use DUP for non-constant splats.  For f32/f64 constant splats, bitcast to
    5483   // the equivalent integer type and try again.
   5484   if (usesOnlyOneValue) {
   5485     if (!isConstant) {
   5486       if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   5487           Value.getValueType() != VT)
   5488         return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
   5489 
    5490       // This is actually a DUPLANExx operation, which keeps everything in vector registers.
   5491 
   5492       // DUPLANE works on 128-bit vectors, widen it if necessary.
   5493       SDValue Lane = Value.getOperand(1);
   5494       Value = Value.getOperand(0);
   5495       if (Value.getValueType().getSizeInBits() == 64)
   5496         Value = WidenVector(Value, DAG);
   5497 
   5498       unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
   5499       return DAG.getNode(Opcode, dl, VT, Value, Lane);
   5500     }
   5501 
   5502     if (VT.getVectorElementType().isFloatingPoint()) {
   5503       SmallVector<SDValue, 8> Ops;
   5504       MVT NewType =
   5505           (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
   5506       for (unsigned i = 0; i < NumElts; ++i)
   5507         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
   5508       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
   5509       SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
   5510       Val = LowerBUILD_VECTOR(Val, DAG);
   5511       if (Val.getNode())
   5512         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   5513     }
   5514   }
   5515 
    5516   // If only one constant value was used, and across more than one lane,
    5517   // start by splatting that value, then replace the non-constant lanes. This
    5518   // is better than the default, which performs a separate initialization
    5519   // for each lane.
   5520   if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
   5521     SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
   5522     // Now insert the non-constant lanes.
   5523     for (unsigned i = 0; i < NumElts; ++i) {
   5524       SDValue V = Op.getOperand(i);
   5525       SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
   5526       if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
   5527         // Note that type legalization likely mucked about with the VT of the
   5528         // source operand, so we may have to convert it here before inserting.
   5529         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
   5530       }
   5531     }
   5532     return Val;
   5533   }
   5534 
   5535   // If all elements are constants and the case above didn't get hit, fall back
   5536   // to the default expansion, which will generate a load from the constant
   5537   // pool.
   5538   if (isConstant)
   5539     return SDValue();
   5540 
   5541   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   5542   if (NumElts >= 4) {
   5543     SDValue shuffle = ReconstructShuffle(Op, DAG);
   5544     if (shuffle != SDValue())
   5545       return shuffle;
   5546   }
   5547 
    5548   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
    5549   // know the default expansion would otherwise fall back on something even
    5550   // worse. For a vector with one or two non-undef values the default is
    5551   // scalar_to_vector for those elements followed by a shuffle (provided the
    5552   // shuffle is valid for the target); for everything else it is element by
    5553   // element materialization on the stack followed by a load.
   5554   if (!isConstant && !usesOnlyOneValue) {
   5555     SDValue Vec = DAG.getUNDEF(VT);
   5556     SDValue Op0 = Op.getOperand(0);
   5557     unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
   5558     unsigned i = 0;
   5559     // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
   5560     // a) Avoid a RMW dependency on the full vector register, and
   5561     // b) Allow the register coalescer to fold away the copy if the
   5562     //    value is already in an S or D register.
   5563     if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
   5564       unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
   5565       MachineSDNode *N =
   5566           DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
   5567                              DAG.getTargetConstant(SubIdx, MVT::i32));
   5568       Vec = SDValue(N, 0);
   5569       ++i;
   5570     }
   5571     for (; i < NumElts; ++i) {
   5572       SDValue V = Op.getOperand(i);
   5573       if (V.getOpcode() == ISD::UNDEF)
   5574         continue;
   5575       SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
   5576       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
   5577     }
   5578     return Vec;
   5579   }
   5580 
   5581   // Just use the default expansion. We failed to find a better alternative.
   5582   return SDValue();
   5583 }
   5584 
   5585 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   5586                                                       SelectionDAG &DAG) const {
   5587   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
   5588 
   5589   // Check for non-constant lane.
   5590   if (!isa<ConstantSDNode>(Op.getOperand(2)))
   5591     return SDValue();
   5592 
   5593   EVT VT = Op.getOperand(0).getValueType();
   5594 
   5595   // Insertion/extraction are legal for V128 types.
   5596   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
   5597       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
   5598     return Op;
   5599 
   5600   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
   5601       VT != MVT::v1i64 && VT != MVT::v2f32)
   5602     return SDValue();
   5603 
   5604   // For V64 types, we perform insertion by expanding the value
    5605   // to a V128 type and performing the insertion on that.
   5606   SDLoc DL(Op);
   5607   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
   5608   EVT WideTy = WideVec.getValueType();
   5609 
   5610   SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
   5611                              Op.getOperand(1), Op.getOperand(2));
   5612   // Re-narrow the resultant vector.
   5613   return NarrowVector(Node, DAG);
   5614 }
   5615 
   5616 SDValue
   5617 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   5618                                                SelectionDAG &DAG) const {
   5619   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
   5620 
   5621   // Check for non-constant lane.
   5622   if (!isa<ConstantSDNode>(Op.getOperand(1)))
   5623     return SDValue();
   5624 
   5625   EVT VT = Op.getOperand(0).getValueType();
   5626 
   5627   // Insertion/extraction are legal for V128 types.
   5628   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
   5629       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
   5630     return Op;
   5631 
   5632   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
   5633       VT != MVT::v1i64 && VT != MVT::v2f32)
   5634     return SDValue();
   5635 
   5636   // For V64 types, we perform extraction by expanding the value
    5637   // to a V128 type and performing the extraction on that.
   5638   SDLoc DL(Op);
   5639   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
   5640   EVT WideTy = WideVec.getValueType();
   5641 
   5642   EVT ExtrTy = WideTy.getVectorElementType();
   5643   if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
   5644     ExtrTy = MVT::i32;
   5645 
   5646   // For extractions, we just return the result directly.
   5647   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
   5648                      Op.getOperand(1));
   5649 }
   5650 
   5651 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
   5652                                                       SelectionDAG &DAG) const {
   5653   EVT VT = Op.getOperand(0).getValueType();
   5654   SDLoc dl(Op);
   5655   // Just in case...
   5656   if (!VT.isVector())
   5657     return SDValue();
   5658 
   5659   ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   5660   if (!Cst)
   5661     return SDValue();
   5662   unsigned Val = Cst->getZExtValue();
   5663 
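           // Extracting the subvector that starts at lane 0 is just a subregister
           // copy of the low 8/16/32/64 bits of the source register.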
   5664   unsigned Size = Op.getValueType().getSizeInBits();
   5665   if (Val == 0) {
   5666     switch (Size) {
   5667     case 8:
   5668       return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
   5669                                         Op.getOperand(0));
   5670     case 16:
   5671       return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
   5672                                         Op.getOperand(0));
   5673     case 32:
   5674       return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
   5675                                         Op.getOperand(0));
   5676     case 64:
   5677       return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
   5678                                         Op.getOperand(0));
   5679     default:
   5680       llvm_unreachable("Unexpected vector type in extract_subvector!");
   5681     }
   5682   }
   5683   // If this is extracting the upper 64-bits of a 128-bit vector, we match
   5684   // that directly.
   5685   if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
   5686     return Op;
   5687 
   5688   return SDValue();
   5689 }
   5690 
   5691 bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   5692                                                EVT VT) const {
   5693   if (VT.getVectorNumElements() == 4 &&
   5694       (VT.is128BitVector() || VT.is64BitVector())) {
   5695     unsigned PFIndexes[4];
   5696     for (unsigned i = 0; i != 4; ++i) {
   5697       if (M[i] < 0)
   5698         PFIndexes[i] = 8;
   5699       else
   5700         PFIndexes[i] = M[i];
   5701     }
   5702 
   5703     // Compute the index in the perfect shuffle table.
   5704     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
   5705                             PFIndexes[2] * 9 + PFIndexes[3];
   5706     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
   5707     unsigned Cost = (PFEntry >> 30);
   5708 
   5709     if (Cost <= 4)
   5710       return true;
   5711   }
   5712 
   5713   bool DummyBool;
   5714   int DummyInt;
   5715   unsigned DummyUnsigned;
   5716 
   5717   return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
   5718           isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
   5719           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
   5720           // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
   5721           isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
   5722           isZIPMask(M, VT, DummyUnsigned) ||
   5723           isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
   5724           isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
   5725           isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
   5726           isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
   5727           isConcatMask(M, VT, VT.getSizeInBits() == 128));
   5728 }
   5729 
   5730 /// getVShiftImm - Check if this is a valid build_vector for the immediate
   5731 /// operand of a vector shift operation, where all the elements of the
   5732 /// build_vector must have the same constant integer value.
   5733 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
   5734   // Ignore bit_converts.
   5735   while (Op.getOpcode() == ISD::BITCAST)
   5736     Op = Op.getOperand(0);
   5737   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
   5738   APInt SplatBits, SplatUndef;
   5739   unsigned SplatBitSize;
   5740   bool HasAnyUndefs;
   5741   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
   5742                                     HasAnyUndefs, ElementBits) ||
   5743       SplatBitSize > ElementBits)
   5744     return false;
   5745   Cnt = SplatBits.getSExtValue();
   5746   return true;
   5747 }
   5748 
   5749 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
   5750 /// operand of a vector shift left operation.  That value must be in the range:
   5751 ///   0 <= Value < ElementBits for a left shift; or
   5752 ///   0 <= Value <= ElementBits for a long left shift.
   5753 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
   5754   assert(VT.isVector() && "vector shift count is not a vector type");
   5755   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
   5756   if (!getVShiftImm(Op, ElementBits, Cnt))
   5757     return false;
   5758   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
   5759 }
   5760 
   5761 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
   5762 /// operand of a vector shift right operation.  For a shift opcode, the value
    5763 /// is positive, but for an intrinsic the count must be negative. The
   5764 /// absolute value must be in the range:
   5765 ///   1 <= |Value| <= ElementBits for a right shift; or
   5766 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
   5767 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
   5768                          int64_t &Cnt) {
   5769   assert(VT.isVector() && "vector shift count is not a vector type");
   5770   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
   5771   if (!getVShiftImm(Op, ElementBits, Cnt))
   5772     return false;
   5773   if (isIntrinsic)
   5774     Cnt = -Cnt;
   5775   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
   5776 }
   5777 
   5778 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
   5779                                                       SelectionDAG &DAG) const {
   5780   EVT VT = Op.getValueType();
   5781   SDLoc DL(Op);
   5782   int64_t Cnt;
   5783 
   5784   if (!Op.getOperand(1).getValueType().isVector())
   5785     return Op;
   5786   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   5787 
   5788   switch (Op.getOpcode()) {
   5789   default:
   5790     llvm_unreachable("unexpected shift opcode");
   5791 
   5792   case ISD::SHL:
   5793     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
   5794       return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
   5795                          DAG.getConstant(Cnt, MVT::i32));
   5796     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
   5797                        DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
   5798                        Op.getOperand(0), Op.getOperand(1));
   5799   case ISD::SRA:
   5800   case ISD::SRL:
   5801     // Right shift immediate
   5802     if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
   5803         Cnt < EltSize) {
   5804       unsigned Opc =
   5805           (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
   5806       return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
   5807                          DAG.getConstant(Cnt, MVT::i32));
   5808     }
   5809 
    5810     // Right shift register.  Note that there is no shift-right-register
    5811     // instruction, but the shift-left-register instruction takes a signed
    5812     // value, where negative amounts specify a right shift.
   5813     unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
   5814                                                 : Intrinsic::aarch64_neon_ushl;
    5815     // Negate the shift amount.
   5816     SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
   5817     SDValue NegShiftLeft =
   5818         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
   5819                     DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
   5820     return NegShiftLeft;
   5821   }
   5822 
   5823   return SDValue();
   5824 }
   5825 
   5826 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
   5827                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
   5828                                     SDLoc dl, SelectionDAG &DAG) {
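           // NEON compares only come in EQ / "greater than" / "higher" flavours, so
           // LT/LE/LO/LS are emitted with the operands swapped and NE as NOT(EQ).
           // Comparisons against a constant-zero splat use the dedicated
           // CMxxz/FCMxxz forms.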
   5829   EVT SrcVT = LHS.getValueType();
   5830 
   5831   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
   5832   APInt CnstBits(VT.getSizeInBits(), 0);
   5833   APInt UndefBits(VT.getSizeInBits(), 0);
   5834   bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
   5835   bool IsZero = IsCnst && (CnstBits == 0);
   5836 
   5837   if (SrcVT.getVectorElementType().isFloatingPoint()) {
   5838     switch (CC) {
   5839     default:
   5840       return SDValue();
   5841     case AArch64CC::NE: {
   5842       SDValue Fcmeq;
   5843       if (IsZero)
   5844         Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
   5845       else
   5846         Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
   5847       return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
   5848     }
   5849     case AArch64CC::EQ:
   5850       if (IsZero)
   5851         return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
   5852       return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
   5853     case AArch64CC::GE:
   5854       if (IsZero)
   5855         return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
   5856       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
   5857     case AArch64CC::GT:
   5858       if (IsZero)
   5859         return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
   5860       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
   5861     case AArch64CC::LS:
   5862       if (IsZero)
   5863         return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
   5864       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
   5865     case AArch64CC::LT:
   5866       if (!NoNans)
   5867         return SDValue();
    5868     // If we ignore NaNs then we can use the MI implementation.
   5869     // Fallthrough.
   5870     case AArch64CC::MI:
   5871       if (IsZero)
   5872         return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
   5873       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
   5874     }
   5875   }
   5876 
   5877   switch (CC) {
   5878   default:
   5879     return SDValue();
   5880   case AArch64CC::NE: {
   5881     SDValue Cmeq;
   5882     if (IsZero)
   5883       Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
   5884     else
   5885       Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
   5886     return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
   5887   }
   5888   case AArch64CC::EQ:
   5889     if (IsZero)
   5890       return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
   5891     return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
   5892   case AArch64CC::GE:
   5893     if (IsZero)
   5894       return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
   5895     return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
   5896   case AArch64CC::GT:
   5897     if (IsZero)
   5898       return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
   5899     return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
   5900   case AArch64CC::LE:
   5901     if (IsZero)
   5902       return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
   5903     return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
   5904   case AArch64CC::LS:
   5905     return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
   5906   case AArch64CC::LO:
   5907     return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
   5908   case AArch64CC::LT:
   5909     if (IsZero)
   5910       return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
   5911     return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
   5912   case AArch64CC::HI:
   5913     return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
   5914   case AArch64CC::HS:
   5915     return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
   5916   }
   5917 }
   5918 
   5919 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
   5920                                            SelectionDAG &DAG) const {
   5921   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   5922   SDValue LHS = Op.getOperand(0);
   5923   SDValue RHS = Op.getOperand(1);
   5924   SDLoc dl(Op);
   5925 
   5926   if (LHS.getValueType().getVectorElementType().isInteger()) {
   5927     assert(LHS.getValueType() == RHS.getValueType());
   5928     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
   5929     return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
   5930                                 dl, DAG);
   5931   }
   5932 
   5933   assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
   5934          LHS.getValueType().getVectorElementType() == MVT::f64);
   5935 
   5936   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
    5937   // clean.  Some of them require two comparisons to implement.
   5938   AArch64CC::CondCode CC1, CC2;
   5939   bool ShouldInvert;
   5940   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
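           // e.g. SETONE ("ordered and not equal") decomposes into two compares:
           // (FCMGT RHS, LHS) OR (FCMGT LHS, RHS).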
   5941 
   5942   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
   5943   SDValue Cmp =
   5944       EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
   5945   if (!Cmp.getNode())
   5946     return SDValue();
   5947 
   5948   if (CC2 != AArch64CC::AL) {
   5949     SDValue Cmp2 =
   5950         EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
   5951     if (!Cmp2.getNode())
   5952       return SDValue();
   5953 
   5954     Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
   5955   }
   5956 
   5957   if (ShouldInvert)
   5958     return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
   5959 
   5960   return Cmp;
   5961 }
   5962 
   5963 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
   5964 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
   5965 /// specified in the intrinsic calls.
   5966 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   5967                                                const CallInst &I,
   5968                                                unsigned Intrinsic) const {
   5969   switch (Intrinsic) {
   5970   case Intrinsic::aarch64_neon_ld2:
   5971   case Intrinsic::aarch64_neon_ld3:
   5972   case Intrinsic::aarch64_neon_ld4:
   5973   case Intrinsic::aarch64_neon_ld1x2:
   5974   case Intrinsic::aarch64_neon_ld1x3:
   5975   case Intrinsic::aarch64_neon_ld1x4:
   5976   case Intrinsic::aarch64_neon_ld2lane:
   5977   case Intrinsic::aarch64_neon_ld3lane:
   5978   case Intrinsic::aarch64_neon_ld4lane:
   5979   case Intrinsic::aarch64_neon_ld2r:
   5980   case Intrinsic::aarch64_neon_ld3r:
   5981   case Intrinsic::aarch64_neon_ld4r: {
   5982     Info.opc = ISD::INTRINSIC_W_CHAIN;
   5983     // Conservatively set memVT to the entire set of vectors loaded.
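            // (For example, an ld3 of <4 x i32> vectors touches 3 * 16 = 48 bytes,
            //  which is recorded below as a v6i64 memVT.)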
   5984     uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
   5985     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
   5986     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
   5987     Info.offset = 0;
   5988     Info.align = 0;
   5989     Info.vol = false; // volatile loads with NEON intrinsics not supported
   5990     Info.readMem = true;
   5991     Info.writeMem = false;
   5992     return true;
   5993   }
   5994   case Intrinsic::aarch64_neon_st2:
   5995   case Intrinsic::aarch64_neon_st3:
   5996   case Intrinsic::aarch64_neon_st4:
   5997   case Intrinsic::aarch64_neon_st1x2:
   5998   case Intrinsic::aarch64_neon_st1x3:
   5999   case Intrinsic::aarch64_neon_st1x4:
   6000   case Intrinsic::aarch64_neon_st2lane:
   6001   case Intrinsic::aarch64_neon_st3lane:
   6002   case Intrinsic::aarch64_neon_st4lane: {
   6003     Info.opc = ISD::INTRINSIC_VOID;
   6004     // Conservatively set memVT to the entire set of vectors stored.
   6005     unsigned NumElts = 0;
   6006     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
   6007       Type *ArgTy = I.getArgOperand(ArgI)->getType();
   6008       if (!ArgTy->isVectorTy())
   6009         break;
   6010       NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
   6011     }
   6012     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
   6013     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
   6014     Info.offset = 0;
   6015     Info.align = 0;
   6016     Info.vol = false; // volatile stores with NEON intrinsics not supported
   6017     Info.readMem = false;
   6018     Info.writeMem = true;
   6019     return true;
   6020   }
   6021   case Intrinsic::aarch64_ldaxr:
   6022   case Intrinsic::aarch64_ldxr: {
   6023     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
   6024     Info.opc = ISD::INTRINSIC_W_CHAIN;
   6025     Info.memVT = MVT::getVT(PtrTy->getElementType());
   6026     Info.ptrVal = I.getArgOperand(0);
   6027     Info.offset = 0;
   6028     Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
   6029     Info.vol = true;
   6030     Info.readMem = true;
   6031     Info.writeMem = false;
   6032     return true;
   6033   }
   6034   case Intrinsic::aarch64_stlxr:
   6035   case Intrinsic::aarch64_stxr: {
   6036     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
   6037     Info.opc = ISD::INTRINSIC_W_CHAIN;
   6038     Info.memVT = MVT::getVT(PtrTy->getElementType());
   6039     Info.ptrVal = I.getArgOperand(1);
   6040     Info.offset = 0;
   6041     Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
   6042     Info.vol = true;
   6043     Info.readMem = false;
   6044     Info.writeMem = true;
   6045     return true;
   6046   }
   6047   case Intrinsic::aarch64_ldaxp:
   6048   case Intrinsic::aarch64_ldxp: {
   6049     Info.opc = ISD::INTRINSIC_W_CHAIN;
   6050     Info.memVT = MVT::i128;
   6051     Info.ptrVal = I.getArgOperand(0);
   6052     Info.offset = 0;
   6053     Info.align = 16;
   6054     Info.vol = true;
   6055     Info.readMem = true;
   6056     Info.writeMem = false;
   6057     return true;
   6058   }
   6059   case Intrinsic::aarch64_stlxp:
   6060   case Intrinsic::aarch64_stxp: {
   6061     Info.opc = ISD::INTRINSIC_W_CHAIN;
   6062     Info.memVT = MVT::i128;
   6063     Info.ptrVal = I.getArgOperand(2);
   6064     Info.offset = 0;
   6065     Info.align = 16;
   6066     Info.vol = true;
   6067     Info.readMem = false;
   6068     Info.writeMem = true;
   6069     return true;
   6070   }
   6071   default:
   6072     break;
   6073   }
   6074 
   6075   return false;
   6076 }
   6077 
   6078 // Truncations from 64-bit GPR to 32-bit GPR are free.
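        // (The truncation amounts to using the W sub-register of the X register,
        //  so no instruction is needed.)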
   6079 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   6080   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   6081     return false;
   6082   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
   6083   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
   6084   return NumBits1 > NumBits2;
   6085 }
   6086 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   6087   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
   6088     return false;
   6089   unsigned NumBits1 = VT1.getSizeInBits();
   6090   unsigned NumBits2 = VT2.getSizeInBits();
   6091   return NumBits1 > NumBits2;
   6092 }
   6093 
   6094 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
   6095 // 64-bit GPR.
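        // For example, "add w0, w1, w2" also clears bits [63:32] of x0, so a
        // later zero-extension of w0 to x0 is a no-op.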
   6096 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   6097   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   6098     return false;
   6099   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
   6100   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
   6101   return NumBits1 == 32 && NumBits2 == 64;
   6102 }
   6103 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
   6104   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
   6105     return false;
   6106   unsigned NumBits1 = VT1.getSizeInBits();
   6107   unsigned NumBits2 = VT2.getSizeInBits();
   6108   return NumBits1 == 32 && NumBits2 == 64;
   6109 }
   6110 
   6111 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   6112   EVT VT1 = Val.getValueType();
   6113   if (isZExtFree(VT1, VT2)) {
   6114     return true;
   6115   }
   6116 
   6117   if (Val.getOpcode() != ISD::LOAD)
   6118     return false;
   6119 
   6120   // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
   6121   return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
   6122           VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
   6123           VT1.getSizeInBits() <= 32);
   6124 }
   6125 
   6126 bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
   6127                                           unsigned &RequiredAligment) const {
   6128   if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
   6129     return false;
   6130   // Cyclone supports unaligned accesses.
   6131   RequiredAligment = 0;
   6132   unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
   6133   return NumBits == 32 || NumBits == 64;
   6134 }
   6135 
   6136 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
   6137                                           unsigned &RequiredAligment) const {
   6138   if (!LoadedType.isSimple() ||
   6139       (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
   6140     return false;
   6141   // Cyclone supports unaligned accesses.
   6142   RequiredAligment = 0;
   6143   unsigned NumBits = LoadedType.getSizeInBits();
   6144   return NumBits == 32 || NumBits == 64;
   6145 }
   6146 
   6147 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
   6148                        unsigned AlignCheck) {
   6149   return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
   6150           (DstAlign == 0 || DstAlign % AlignCheck == 0));
   6151 }
   6152 
   6153 EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
   6154                                                unsigned SrcAlign, bool IsMemset,
   6155                                                bool ZeroMemset,
   6156                                                bool MemcpyStrSrc,
   6157                                                MachineFunction &MF) const {
   6158   // Don't use AdvSIMD to implement 16-byte memset. It would take one
   6159   // instruction to materialize the v2i64 zero and one store (with a
   6160   // restrictive addressing mode). Just do two i64 stores of the zero register.
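          // (Illustratively, a 32-byte memcpy with 16-byte-aligned source and
          //  destination is then typically expanded as two f128 (q-register)
          //  copies, while a 16-byte memset uses two i64 stores as noted above.)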
   6161   bool Fast;
   6162   const Function *F = MF.getFunction();
   6163   if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
   6164       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
   6165                                        Attribute::NoImplicitFloat) &&
   6166       (memOpAlign(SrcAlign, DstAlign, 16) ||
   6167        (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
   6168     return MVT::f128;
   6169 
   6170   return Size >= 8 ? MVT::i64 : MVT::i32;
   6171 }
   6172 
   6173 // 12-bit optionally shifted immediates are legal for adds.
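        // For example, 0xfff and 0xfff000 are legal add immediates, while
        // 0xfff001 and 0x1000000 are not (illustrative values).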
   6174 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
   6175   if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
   6176     return true;
   6177   return false;
   6178 }
   6179 
   6180 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
   6181 // immediates is the same as for an add or a sub.
   6182 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
   6183   if (Immed < 0)
   6184     Immed *= -1;
   6185   return isLegalAddImmediate(Immed);
   6186 }
   6187 
   6188 /// isLegalAddressingMode - Return true if the addressing mode represented
   6189 /// by AM is legal for this target, for a load/store of the specified type.
   6190 bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
   6191                                                   Type *Ty) const {
   6192   // AArch64 has five basic addressing modes:
   6193   //  reg
   6194   //  reg + 9-bit signed offset
   6195   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
   6196   //  reg1 + reg2
   6197   //  reg + SIZE_IN_BYTES * reg
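          // For an i64 access, for example, [x0], [x0, #-256], [x0, #32760],
          // [x0, x1] and [x0, x1, lsl #3] are all representable here
          // (illustrative register names).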
   6198 
   6199   // No global is ever allowed as a base.
   6200   if (AM.BaseGV)
   6201     return false;
   6202 
   6203   // No reg+reg+imm addressing.
   6204   if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
   6205     return false;
   6206 
   6207   // check reg + imm case:
   6208   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
   6209   uint64_t NumBytes = 0;
   6210   if (Ty->isSized()) {
   6211     uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
   6212     NumBytes = NumBits / 8;
   6213     if (!isPowerOf2_64(NumBits))
   6214       NumBytes = 0;
   6215   }
   6216 
   6217   if (!AM.Scale) {
   6218     int64_t Offset = AM.BaseOffs;
   6219 
   6220     // 9-bit signed offset
   6221     if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
   6222       return true;
   6223 
   6224     // 12-bit unsigned offset
   6225     unsigned shift = Log2_64(NumBytes);
   6226     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
   6227         // Must be a multiple of NumBytes (NumBytes is a power of 2)
   6228         (Offset >> shift) << shift == Offset)
   6229       return true;
   6230     return false;
   6231   }
   6232 
   6233   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
   6234 
   6235   if (!AM.Scale || AM.Scale == 1 ||
   6236       (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
   6237     return true;
   6238   return false;
   6239 }
   6240 
   6241 int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
   6242                                                 Type *Ty) const {
   6243   // Scaling factors are not free at all.
   6244   // Operands                     | Rt Latency
   6245   // -------------------------------------------
   6246   // Rt, [Xn, Xm]                 | 4
   6247   // -------------------------------------------
   6248   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
   6249   // Rt, [Xn, Wm, <extend> #imm]  |
   6250   if (isLegalAddressingMode(AM, Ty))
   6251     // Scale represents reg2 * scale, thus account for 1 if
   6252     // it is not equal to 0 or 1.
   6253     return AM.Scale != 0 && AM.Scale != 1;
   6254   return -1;
   6255 }
   6256 
   6257 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   6258   VT = VT.getScalarType();
   6259 
   6260   if (!VT.isSimple())
   6261     return false;
   6262 
   6263   switch (VT.getSimpleVT().SimpleTy) {
   6264   case MVT::f32:
   6265   case MVT::f64:
   6266     return true;
   6267   default:
   6268     break;
   6269   }
   6270 
   6271   return false;
   6272 }
   6273 
   6274 const MCPhysReg *
   6275 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
   6276   // LR is a callee-save register, but we must treat it as clobbered by any call
   6277   // site. Hence we include LR in the scratch registers, which are in turn added
   6278   // as implicit-defs for stackmaps and patchpoints.
   6279   static const MCPhysReg ScratchRegs[] = {
   6280     AArch64::X16, AArch64::X17, AArch64::LR, 0
   6281   };
   6282   return ScratchRegs;
   6283 }
   6284 
   6285 bool
   6286 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
   6287   EVT VT = N->getValueType(0);
   6288   // If N is an unsigned bit extraction, i.e. ((x >> C) & mask), do not
   6289   // combine it with the shift so that it can be lowered to UBFX.
   6290   if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
   6291       isa<ConstantSDNode>(N->getOperand(1))) {
   6292     uint64_t TruncMask = N->getConstantOperandVal(1);
   6293     if (isMask_64(TruncMask) &&
   6294       N->getOperand(0).getOpcode() == ISD::SRL &&
   6295       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
   6296       return false;
   6297   }
   6298   return true;
   6299 }
   6300 
   6301 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   6302                                                               Type *Ty) const {
   6303   assert(Ty->isIntegerTy());
   6304 
   6305   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   6306   if (BitSize == 0)
   6307     return false;
   6308 
   6309   int64_t Val = Imm.getSExtValue();
   6310   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
   6311     return true;
   6312 
   6313   if ((int64_t)Val < 0)
   6314     Val = ~Val;
   6315   if (BitSize == 32)
   6316     Val &= (1LL << 32) - 1;
   6317 
   6318   unsigned LZ = countLeadingZeros((uint64_t)Val);
   6319   unsigned Shift = (63 - LZ) / 16;
   6320   // MOVZ is free so return true for one or fewer MOVK.
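          // (Illustratively, 0x0000123456789abc needs at most a MOVZ plus two
          //  MOVKs, so we return true; 0x123456789abcdef0 would need three MOVKs,
          //  so the constant-pool load is preferred.)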
   6321   return Shift < 3;
   6322 }
   6323 
   6324 // Generate SUBS and CSEL for integer abs.
   6325 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
   6326   EVT VT = N->getValueType(0);
   6327 
   6328   SDValue N0 = N->getOperand(0);
   6329   SDValue N1 = N->getOperand(1);
   6330   SDLoc DL(N);
   6331 
   6332   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
   6333   // and change it to SUB and CSEL.
   6334   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
   6335       N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
   6336       N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
   6337     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
   6338       if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
   6339         SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
   6340                                   N0.getOperand(0));
   6341         // Generate SUBS & CSEL.
   6342         SDValue Cmp =
   6343             DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
   6344                         N0.getOperand(0), DAG.getConstant(0, VT));
   6345         return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
   6346                            DAG.getConstant(AArch64CC::PL, MVT::i32),
   6347                            SDValue(Cmp.getNode(), 1));
   6348       }
   6349   return SDValue();
   6350 }
   6351 
   6352 // performXorCombine - Attempts to handle integer ABS.
   6353 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
   6354                                  TargetLowering::DAGCombinerInfo &DCI,
   6355                                  const AArch64Subtarget *Subtarget) {
   6356   if (DCI.isBeforeLegalizeOps())
   6357     return SDValue();
   6358 
   6359   return performIntegerAbsCombine(N, DAG);
   6360 }
   6361 
   6362 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
   6363                                  TargetLowering::DAGCombinerInfo &DCI,
   6364                                  const AArch64Subtarget *Subtarget) {
   6365   if (DCI.isBeforeLegalizeOps())
   6366     return SDValue();
   6367 
   6368   // Multiplication by a power of two plus or minus one can be done more
   6369   // cheaply as a shift+add/sub. For now, this is done unconditionally. If
   6370   // future CPUs have a cheaper MADD instruction, this may need to be
   6371   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
   6372   // 64-bit is 5 cycles, so this is always a win.
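          // For example, (mul x, 17) becomes (add (shl x, 4), x), and
          // (mul x, -17) becomes (sub 0, (add (shl x, 4), x)) (illustrative
          // constants).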
   6373   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   6374     APInt Value = C->getAPIntValue();
   6375     EVT VT = N->getValueType(0);
   6376     if (Value.isNonNegative()) {
   6377       // (mul x, 2^N + 1) => (add (shl x, N), x)
   6378       APInt VM1 = Value - 1;
   6379       if (VM1.isPowerOf2()) {
   6380         SDValue ShiftedVal =
   6381             DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
   6382                         DAG.getConstant(VM1.logBase2(), MVT::i64));
   6383         return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal,
   6384                            N->getOperand(0));
   6385       }
   6386       // (mul x, 2^N - 1) => (sub (shl x, N), x)
   6387       APInt VP1 = Value + 1;
   6388       if (VP1.isPowerOf2()) {
   6389         SDValue ShiftedVal =
   6390             DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
   6391                         DAG.getConstant(VP1.logBase2(), MVT::i64));
   6392         return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal,
   6393                            N->getOperand(0));
   6394       }
   6395     } else {
   6396       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
   6397       APInt VNM1 = -Value - 1;
   6398       if (VNM1.isPowerOf2()) {
   6399         SDValue ShiftedVal =
   6400             DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
   6401                         DAG.getConstant(VNM1.logBase2(), MVT::i64));
   6402         SDValue Add =
   6403             DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
   6404         return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
   6405       }
   6406       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
   6407       APInt VNP1 = -Value + 1;
   6408       if (VNP1.isPowerOf2()) {
   6409         SDValue ShiftedVal =
   6410             DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
   6411                         DAG.getConstant(VNP1.logBase2(), MVT::i64));
   6412         return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
   6413                            ShiftedVal);
   6414       }
   6415     }
   6416   }
   6417   return SDValue();
   6418 }
   6419 
   6420 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
   6421   EVT VT = N->getValueType(0);
   6422   if (VT != MVT::f32 && VT != MVT::f64)
   6423     return SDValue();
   6424   // Only optimize when the source and destination types have the same width.
   6425   if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
   6426     return SDValue();
   6427 
   6428   // If the result of an integer load is only used by an integer-to-float
   6429   // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
   6430   // This eliminates an integer-to-vector-move uop and improves throughput.
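          // For example, for "float f(int *p) { return (float)*p; }" this prefers
          //   ldr s0, [x0]; scvtf s0, s0
          // over
          //   ldr w8, [x0]; scvtf s0, w8
          // keeping the value in the FP/SIMD register file (illustrative registers).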
   6431   SDValue N0 = N->getOperand(0);
   6432   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
   6433       // Do not change the width of a volatile load.
   6434       !cast<LoadSDNode>(N0)->isVolatile()) {
   6435     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
   6436     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
   6437                                LN0->getPointerInfo(), LN0->isVolatile(),
   6438                                LN0->isNonTemporal(), LN0->isInvariant(),
   6439                                LN0->getAlignment());
   6440 
   6441     // Make sure successors of the original load stay after it by updating them
   6442     // to use the new Chain.
   6443     DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
   6444 
   6445     unsigned Opcode =
   6446         (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
   6447     return DAG.getNode(Opcode, SDLoc(N), VT, Load);
   6448   }
   6449 
   6450   return SDValue();
   6451 }
   6452 
   6453 /// An EXTR instruction is made up of two shifts, ORed together. This helper
   6454 /// searches for and classifies those shifts.
   6455 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
   6456                          bool &FromHi) {
   6457   if (N.getOpcode() == ISD::SHL)
   6458     FromHi = false;
   6459   else if (N.getOpcode() == ISD::SRL)
   6460     FromHi = true;
   6461   else
   6462     return false;
   6463 
   6464   if (!isa<ConstantSDNode>(N.getOperand(1)))
   6465     return false;
   6466 
   6467   ShiftAmount = N->getConstantOperandVal(1);
   6468   Src = N->getOperand(0);
   6469   return true;
   6470 }
   6471 
   6472 /// EXTR instruction extracts a contiguous chunk of bits from two existing
   6473 /// registers viewed as a high/low pair. This function looks for the pattern:
   6474 /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
   6475 /// EXTR. Can't quite be done in TableGen because the two immediates aren't
   6476 /// independent.
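        /// For example, on i32, (or (shl x, #8), (srl y, #24)) becomes
        /// "extr wd, wx, wy, #24" (illustrative register names).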
   6477 static SDValue tryCombineToEXTR(SDNode *N,
   6478                                 TargetLowering::DAGCombinerInfo &DCI) {
   6479   SelectionDAG &DAG = DCI.DAG;
   6480   SDLoc DL(N);
   6481   EVT VT = N->getValueType(0);
   6482 
   6483   assert(N->getOpcode() == ISD::OR && "Unexpected root");
   6484 
   6485   if (VT != MVT::i32 && VT != MVT::i64)
   6486     return SDValue();
   6487 
   6488   SDValue LHS;
   6489   uint32_t ShiftLHS = 0;
   6490   bool LHSFromHi = false;
   6491   if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
   6492     return SDValue();
   6493 
   6494   SDValue RHS;
   6495   uint32_t ShiftRHS = 0;
   6496   bool RHSFromHi = false;
   6497   if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
   6498     return SDValue();
   6499 
   6500   // If they're both trying to come from the high part of the register, they're
   6501   // not really an EXTR.
   6502   if (LHSFromHi == RHSFromHi)
   6503     return SDValue();
   6504 
   6505   if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
   6506     return SDValue();
   6507 
   6508   if (LHSFromHi) {
   6509     std::swap(LHS, RHS);
   6510     std::swap(ShiftLHS, ShiftRHS);
   6511   }
   6512 
   6513   return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
   6514                      DAG.getConstant(ShiftRHS, MVT::i64));
   6515 }
   6516 
   6517 static SDValue tryCombineToBSL(SDNode *N,
   6518                                 TargetLowering::DAGCombinerInfo &DCI) {
   6519   EVT VT = N->getValueType(0);
   6520   SelectionDAG &DAG = DCI.DAG;
   6521   SDLoc DL(N);
   6522 
   6523   if (!VT.isVector())
   6524     return SDValue();
   6525 
   6526   SDValue N0 = N->getOperand(0);
   6527   if (N0.getOpcode() != ISD::AND)
   6528     return SDValue();
   6529 
   6530   SDValue N1 = N->getOperand(1);
   6531   if (N1.getOpcode() != ISD::AND)
   6532     return SDValue();
   6533 
   6534   // We only have to look for constant vectors here since the general, variable
   6535   // case can be handled in TableGen.
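          // That is, (or (and a, ConstC), (and b, ~ConstC)) is turned into
          // (BSL ConstC, a, b) when ConstC and ~ConstC are constant build_vectors.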
   6536   unsigned Bits = VT.getVectorElementType().getSizeInBits();
   6537   uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
   6538   for (int i = 1; i >= 0; --i)
   6539     for (int j = 1; j >= 0; --j) {
   6540       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
   6541       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
   6542       if (!BVN0 || !BVN1)
   6543         continue;
   6544 
   6545       bool FoundMatch = true;
   6546       for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
   6547         ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
   6548         ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
   6549         if (!CN0 || !CN1 ||
   6550             CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
   6551           FoundMatch = false;
   6552           break;
   6553         }
   6554       }
   6555 
   6556       if (FoundMatch)
   6557         return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
   6558                            N0->getOperand(1 - i), N1->getOperand(1 - j));
   6559     }
   6560 
   6561   return SDValue();
   6562 }
   6563 
   6564 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   6565                                 const AArch64Subtarget *Subtarget) {
   6566   // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
   6567   if (!EnableAArch64ExtrGeneration)
   6568     return SDValue();
   6569   SelectionDAG &DAG = DCI.DAG;
   6570   EVT VT = N->getValueType(0);
   6571 
   6572   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   6573     return SDValue();
   6574 
   6575   SDValue Res = tryCombineToEXTR(N, DCI);
   6576   if (Res.getNode())
   6577     return Res;
   6578 
   6579   Res = tryCombineToBSL(N, DCI);
   6580   if (Res.getNode())
   6581     return Res;
   6582 
   6583   return SDValue();
   6584 }
   6585 
   6586 static SDValue performBitcastCombine(SDNode *N,
   6587                                      TargetLowering::DAGCombinerInfo &DCI,
   6588                                      SelectionDAG &DAG) {
   6589   // Wait 'til after everything is legalized to try this. That way we have
   6590   // legal vector types and such.
   6591   if (DCI.isBeforeLegalizeOps())
   6592     return SDValue();
   6593 
   6594   // Remove extraneous bitcasts around an extract_subvector.
   6595   // For example,
   6596   //    (v4i16 (bitconvert
   6597   //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
   6598   //  becomes
   6599   //    (extract_subvector ((v8i16 ...), (i64 4)))
   6600 
   6601   // Only interested in 64-bit vectors as the ultimate result.
   6602   EVT VT = N->getValueType(0);
   6603   if (!VT.isVector())
   6604     return SDValue();
   6605   if (VT.getSimpleVT().getSizeInBits() != 64)
   6606     return SDValue();
   6607   // Is the operand an extract_subvector starting at the beginning or halfway
   6608   // point of the vector? A low half may also come through as an
   6609   // EXTRACT_SUBREG, so look for that, too.
   6610   SDValue Op0 = N->getOperand(0);
   6611   if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
   6612       !(Op0->isMachineOpcode() &&
   6613         Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
   6614     return SDValue();
   6615   uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
   6616   if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
   6617     if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
   6618       return SDValue();
   6619   } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
   6620     if (idx != AArch64::dsub)
   6621       return SDValue();
   6622     // The dsub reference is equivalent to a lane zero subvector reference.
   6623     idx = 0;
   6624   }
   6625   // Look through the bitcast of the input to the extract.
   6626   if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
   6627     return SDValue();
   6628   SDValue Source = Op0->getOperand(0)->getOperand(0);
   6629   // If the source type has twice the number of elements as our destination
   6630   // type, we know this is an extract of the high or low half of the vector.
   6631   EVT SVT = Source->getValueType(0);
   6632   if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
   6633     return SDValue();
   6634 
   6635   DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
   6636 
   6637   // Create the simplified form to just extract the low or high half of the
   6638   // vector directly rather than bothering with the bitcasts.
   6639   SDLoc dl(N);
   6640   unsigned NumElements = VT.getVectorNumElements();
   6641   if (idx) {
   6642     SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
   6643     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
   6644   } else {
   6645     SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32);
   6646     return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
   6647                                       Source, SubReg),
   6648                    0);
   6649   }
   6650 }
   6651 
   6652 static SDValue performConcatVectorsCombine(SDNode *N,
   6653                                            TargetLowering::DAGCombinerInfo &DCI,
   6654                                            SelectionDAG &DAG) {
   6655   // Wait 'til after everything is legalized to try this. That way we have
   6656   // legal vector types and such.
   6657   if (DCI.isBeforeLegalizeOps())
   6658     return SDValue();
   6659 
   6660   SDLoc dl(N);
   6661   EVT VT = N->getValueType(0);
   6662 
   6663   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
   6664   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
   6665   // canonicalise to that.
   6666   if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
   6667     assert(VT.getVectorElementType().getSizeInBits() == 64);
   6668     return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
   6669                        WidenVector(N->getOperand(0), DAG),
   6670                        DAG.getConstant(0, MVT::i64));
   6671   }
   6672 
   6673   // Canonicalise concat_vectors so that the right-hand vector has as few
   6674   // bit-casts as possible before its real operation. The primary matching
   6675   // destination for these operations will be the narrowing "2" instructions,
   6676   // which depend on the operation being performed on this right-hand vector.
   6677   // For example,
   6678   //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
   6679   // becomes
   6680   //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
   6681 
   6682   SDValue Op1 = N->getOperand(1);
   6683   if (Op1->getOpcode() != ISD::BITCAST)
   6684     return SDValue();
   6685   SDValue RHS = Op1->getOperand(0);
   6686   MVT RHSTy = RHS.getValueType().getSimpleVT();
   6687   // If the RHS is not a vector, this is not the pattern we're looking for.
   6688   if (!RHSTy.isVector())
   6689     return SDValue();
   6690 
   6691   DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
   6692 
   6693   MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
   6694                                   RHSTy.getVectorNumElements() * 2);
   6695   return DAG.getNode(
   6696       ISD::BITCAST, dl, VT,
   6697       DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
   6698                   DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
   6699 }
   6700 
   6701 static SDValue tryCombineFixedPointConvert(SDNode *N,
   6702                                            TargetLowering::DAGCombinerInfo &DCI,
   6703                                            SelectionDAG &DAG) {
   6704   // Wait 'til after everything is legalized to try this. That way we have
   6705   // legal vector types and such.
   6706   if (DCI.isBeforeLegalizeOps())
   6707     return SDValue();
   6708   // Transform a scalar conversion of a value from a lane extract into a
   6709   // lane extract of a vector conversion. E.g., from foo1 to foo2:
   6710   // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
   6711   // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
   6712   //
   6713   // The second form interacts better with instruction selection and the
   6714   // register allocator to avoid cross-class register copies that aren't
   6715   // coalescable due to a lane reference.
   6716 
   6717   // Check the operand and see if it originates from a lane extract.
   6718   SDValue Op1 = N->getOperand(1);
   6719   if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   6720     // Yep, no additional predication needed. Perform the transform.
   6721     SDValue IID = N->getOperand(0);
   6722     SDValue Shift = N->getOperand(2);
   6723     SDValue Vec = Op1.getOperand(0);
   6724     SDValue Lane = Op1.getOperand(1);
   6725     EVT ResTy = N->getValueType(0);
   6726     EVT VecResTy;
   6727     SDLoc DL(N);
   6728 
   6729     // The vector width should be 128 bits by the time we get here, even
   6730     // if it started as 64 bits (the extract_vector handling will have
   6731     // done so).
   6732     assert(Vec.getValueType().getSizeInBits() == 128 &&
   6733            "unexpected vector size on extract_vector_elt!");
   6734     if (Vec.getValueType() == MVT::v4i32)
   6735       VecResTy = MVT::v4f32;
   6736     else if (Vec.getValueType() == MVT::v2i64)
   6737       VecResTy = MVT::v2f64;
   6738     else
   6739       llvm_unreachable("unexpected vector type!");
   6740 
   6741     SDValue Convert =
   6742         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
   6743     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
   6744   }
   6745   return SDValue();
   6746 }
   6747 
   6748 // AArch64 high-vector "long" operations are formed by performing the non-high
   6749 // version on an extract_subvector of each operand which gets the high half:
   6750 //
   6751 //  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
   6752 //
   6753 // However, there are cases which don't have an extract_high explicitly, but
   6754 // have another operation that can be made compatible with one for free. For
   6755 // example:
   6756 //
   6757 //  (dupv64 scalar) --> (extract_high (dup128 scalar))
   6758 //
   6759 // This routine does the actual conversion of such DUPs, once outer routines
   6760 // have determined that everything else is in order.
   6761 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
   6762   // We can handle most types of duplicate, but the lane ones have an extra
   6763   // operand saying *which* lane, so we need to know.
   6764   bool IsDUPLANE;
   6765   switch (N.getOpcode()) {
   6766   case AArch64ISD::DUP:
   6767     IsDUPLANE = false;
   6768     break;
   6769   case AArch64ISD::DUPLANE8:
   6770   case AArch64ISD::DUPLANE16:
   6771   case AArch64ISD::DUPLANE32:
   6772   case AArch64ISD::DUPLANE64:
   6773     IsDUPLANE = true;
   6774     break;
   6775   default:
   6776     return SDValue();
   6777   }
   6778 
   6779   MVT NarrowTy = N.getSimpleValueType();
   6780   if (!NarrowTy.is64BitVector())
   6781     return SDValue();
   6782 
   6783   MVT ElementTy = NarrowTy.getVectorElementType();
   6784   unsigned NumElems = NarrowTy.getVectorNumElements();
   6785   MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
   6786 
   6787   SDValue NewDUP;
   6788   if (IsDUPLANE)
   6789     NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
   6790                          N.getOperand(1));
   6791   else
   6792     NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
   6793 
   6794   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
   6795                      NewDUP, DAG.getConstant(NumElems, MVT::i64));
   6796 }
   6797 
   6798 static bool isEssentiallyExtractSubvector(SDValue N) {
   6799   if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
   6800     return true;
   6801 
   6802   return N.getOpcode() == ISD::BITCAST &&
   6803          N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
   6804 }
   6805 
   6806 /// \brief Helper structure to keep track of ISD::SET_CC operands.
   6807 struct GenericSetCCInfo {
   6808   const SDValue *Opnd0;
   6809   const SDValue *Opnd1;
   6810   ISD::CondCode CC;
   6811 };
   6812 
   6813 /// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
   6814 struct AArch64SetCCInfo {
   6815   const SDValue *Cmp;
   6816   AArch64CC::CondCode CC;
   6817 };
   6818 
   6819 /// \brief Helper structure to keep track of SetCC information.
   6820 union SetCCInfo {
   6821   GenericSetCCInfo Generic;
   6822   AArch64SetCCInfo AArch64;
   6823 };
   6824 
   6825 /// \brief Helper structure used to read SetCC information. If the IsAArch64
   6826 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
   6827 /// GenericSetCCInfo.
   6828 struct SetCCInfoAndKind {
   6829   SetCCInfo Info;
   6830   bool IsAArch64;
   6831 };
   6832 
   6833 /// \brief Check whether or not \p Op is a SET_CC operation, either a generic
   6834 /// or an AArch64 lowered one.
   6835 /// \p SetCCInfo is filled accordingly.
   6836 /// \post SetCCInfo is meaningful only when this function returns true.
   6838 /// \return True when Op is a kind of SET_CC operation.
   6839 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
   6840   // If this is a setcc, this is straightforward.
   6841   if (Op.getOpcode() == ISD::SETCC) {
   6842     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
   6843     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
   6844     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   6845     SetCCInfo.IsAArch64 = false;
   6846     return true;
   6847   }
   6848   // Otherwise, check if this is a matching csel instruction.
   6849   // In other words:
   6850   // - csel 1, 0, cc
   6851   // - csel 0, 1, !cc
   6852   if (Op.getOpcode() != AArch64ISD::CSEL)
   6853     return false;
   6854   // Set the information about the operands.
   6855   // TODO: we want the operands of the Cmp not the csel
   6856   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
   6857   SetCCInfo.IsAArch64 = true;
   6858   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
   6859       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   6860 
   6861   // Check that the operands matches the constraints:
   6862   // (1) Both operands must be constants.
   6863   // (2) One must be 1 and the other must be 0.
   6864   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
   6865   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   6866 
   6867   // Check (1).
   6868   if (!TValue || !FValue)
   6869     return false;
   6870 
   6871   // Check (2).
   6872   if (!TValue->isOne()) {
   6873     // Update the comparison when we are interested in !cc.
   6874     std::swap(TValue, FValue);
   6875     SetCCInfo.Info.AArch64.CC =
   6876         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
   6877   }
   6878   return TValue->isOne() && FValue->isNullValue();
   6879 }
   6880 
   6881 // Returns true if Op is setcc or zext of setcc.
   6882 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
   6883   if (isSetCC(Op, Info))
   6884     return true;
   6885   return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
   6886     isSetCC(Op->getOperand(0), Info));
   6887 }
   6888 
   6889 // The folding we want to perform is:
   6890 // (add x, [zext] (setcc cc ...) )
   6891 //   -->
   6892 // (csel x, (add x, 1), !cc ...)
   6893 //
   6894 // The latter will get matched to a CSINC instruction.
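        //
        // For example, "x + (a == b)" becomes roughly "cmp a, b; cinc x, x, eq"
        // (a CSINC with the inverted condition; illustrative operands).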
   6895 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
   6896   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
   6897   SDValue LHS = Op->getOperand(0);
   6898   SDValue RHS = Op->getOperand(1);
   6899   SetCCInfoAndKind InfoAndKind;
   6900 
   6901   // If neither operand is a SET_CC, give up.
   6902   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
   6903     std::swap(LHS, RHS);
   6904     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
   6905       return SDValue();
   6906   }
   6907 
   6908   // FIXME: This could be generalized to work for FP comparisons.
   6909   EVT CmpVT = InfoAndKind.IsAArch64
   6910                   ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
   6911                   : InfoAndKind.Info.Generic.Opnd0->getValueType();
   6912   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
   6913     return SDValue();
   6914 
   6915   SDValue CCVal;
   6916   SDValue Cmp;
   6917   SDLoc dl(Op);
   6918   if (InfoAndKind.IsAArch64) {
   6919     CCVal = DAG.getConstant(
   6920         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
   6921     Cmp = *InfoAndKind.Info.AArch64.Cmp;
   6922   } else
   6923     Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
   6924                       *InfoAndKind.Info.Generic.Opnd1,
   6925                       ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
   6926                       CCVal, DAG, dl);
   6927 
   6928   EVT VT = Op->getValueType(0);
   6929   LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
   6930   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
   6931 }
   6932 
   6933 // The basic add/sub long vector instructions have variants with "2" on the end
   6934 // which act on the high-half of their inputs. They are normally matched by
   6935 // patterns like:
   6936 //
   6937 // (add (zeroext (extract_high LHS)),
   6938 //      (zeroext (extract_high RHS)))
   6939 // -> uaddl2 vD, vN, vM
   6940 //
   6941 // However, if one of the extracts is something like a duplicate, this
   6942 // instruction can still be used profitably. This function puts the DAG into a
   6943 // more appropriate form for those patterns to trigger.
   6944 static SDValue performAddSubLongCombine(SDNode *N,
   6945                                         TargetLowering::DAGCombinerInfo &DCI,
   6946                                         SelectionDAG &DAG) {
   6947   if (DCI.isBeforeLegalizeOps())
   6948     return SDValue();
   6949 
   6950   MVT VT = N->getSimpleValueType(0);
   6951   if (!VT.is128BitVector()) {
   6952     if (N->getOpcode() == ISD::ADD)
   6953       return performSetccAddFolding(N, DAG);
   6954     return SDValue();
   6955   }
   6956 
   6957   // Make sure both branches are extended in the same way.
   6958   SDValue LHS = N->getOperand(0);
   6959   SDValue RHS = N->getOperand(1);
   6960   if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
   6961        LHS.getOpcode() != ISD::SIGN_EXTEND) ||
   6962       LHS.getOpcode() != RHS.getOpcode())
   6963     return SDValue();
   6964 
   6965   unsigned ExtType = LHS.getOpcode();
   6966 
   6967   // It's not worth doing if at least one of the inputs isn't already an
   6968   // extract, but we don't know which it'll be so we have to try both.
   6969   if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
   6970     RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
   6971     if (!RHS.getNode())
   6972       return SDValue();
   6973 
   6974     RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
   6975   } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
   6976     LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
   6977     if (!LHS.getNode())
   6978       return SDValue();
   6979 
   6980     LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
   6981   }
   6982 
   6983   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
   6984 }
   6985 
   6986 // Massage DAGs which we can use the high-half "long" operations on into
   6987 // something isel will recognize better. E.g.
   6988 //
   6989 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
   6990 //   (aarch64_neon_umull (extract_high (v2i64 vec)))
   6991 //                     (extract_high (v2i64 (dup128 scalar)))))
   6992 //
   6993 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
   6994                                        TargetLowering::DAGCombinerInfo &DCI,
   6995                                        SelectionDAG &DAG) {
   6996   if (DCI.isBeforeLegalizeOps())
   6997     return SDValue();
   6998 
   6999   SDValue LHS = N->getOperand(1);
   7000   SDValue RHS = N->getOperand(2);
   7001   assert(LHS.getValueType().is64BitVector() &&
   7002          RHS.getValueType().is64BitVector() &&
   7003          "unexpected shape for long operation");
   7004 
   7005   // Either node could be a DUP, but it's not worth doing both of them (you
   7006   // might as well use the non-high version), so look for a corresponding
   7007   // extract operation on the other "wing".
   7008   if (isEssentiallyExtractSubvector(LHS)) {
   7009     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
   7010     if (!RHS.getNode())
   7011       return SDValue();
   7012   } else if (isEssentiallyExtractSubvector(RHS)) {
   7013     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
   7014     if (!LHS.getNode())
   7015       return SDValue();
   7016   }
   7017 
   7018   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
   7019                      N->getOperand(0), LHS, RHS);
   7020 }
   7021 
   7022 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
   7023   MVT ElemTy = N->getSimpleValueType(0).getScalarType();
   7024   unsigned ElemBits = ElemTy.getSizeInBits();
   7025 
   7026   int64_t ShiftAmount;
   7027   if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
   7028     APInt SplatValue, SplatUndef;
   7029     unsigned SplatBitSize;
   7030     bool HasAnyUndefs;
   7031     if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
   7032                               HasAnyUndefs, ElemBits) ||
   7033         SplatBitSize != ElemBits)
   7034       return SDValue();
   7035 
   7036     ShiftAmount = SplatValue.getSExtValue();
   7037   } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
   7038     ShiftAmount = CVN->getSExtValue();
   7039   } else
   7040     return SDValue();
   7041 
   7042   unsigned Opcode;
   7043   bool IsRightShift;
   7044   switch (IID) {
   7045   default:
   7046     llvm_unreachable("Unknown shift intrinsic");
   7047   case Intrinsic::aarch64_neon_sqshl:
   7048     Opcode = AArch64ISD::SQSHL_I;
   7049     IsRightShift = false;
   7050     break;
   7051   case Intrinsic::aarch64_neon_uqshl:
   7052     Opcode = AArch64ISD::UQSHL_I;
   7053     IsRightShift = false;
   7054     break;
   7055   case Intrinsic::aarch64_neon_srshl:
   7056     Opcode = AArch64ISD::SRSHR_I;
   7057     IsRightShift = true;
   7058     break;
   7059   case Intrinsic::aarch64_neon_urshl:
   7060     Opcode = AArch64ISD::URSHR_I;
   7061     IsRightShift = true;
   7062     break;
   7063   case Intrinsic::aarch64_neon_sqshlu:
   7064     Opcode = AArch64ISD::SQSHLU_I;
   7065     IsRightShift = false;
   7066     break;
   7067   }
   7068 
   7069   if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
   7070     return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
   7071                        DAG.getConstant(-ShiftAmount, MVT::i32));
   7072   else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits)
   7073     return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
   7074                        DAG.getConstant(ShiftAmount, MVT::i32));
   7075 
   7076   return SDValue();
   7077 }
   7078 
   7079 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
   7080 // the intrinsics must be legal and take an i32, this means there's almost
   7081 // certainly going to be a zext in the DAG which we can eliminate.
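        // For instance, when the data operand is an i8 value zero-extended to i32,
        // the DAG typically contains (and data, 0xff); CRC32B only reads the low
        // byte, so that AND can be dropped (illustrative case).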
   7082 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
   7083   SDValue AndN = N->getOperand(2);
   7084   if (AndN.getOpcode() != ISD::AND)
   7085     return SDValue();
   7086 
   7087   ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
   7088   if (!CMask || CMask->getZExtValue() != Mask)
   7089     return SDValue();
   7090 
   7091   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
   7092                      N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
   7093 }
   7094 
   7095 static SDValue performIntrinsicCombine(SDNode *N,
   7096                                        TargetLowering::DAGCombinerInfo &DCI,
   7097                                        const AArch64Subtarget *Subtarget) {
   7098   SelectionDAG &DAG = DCI.DAG;
   7099   unsigned IID = getIntrinsicID(N);
   7100   switch (IID) {
   7101   default:
   7102     break;
   7103   case Intrinsic::aarch64_neon_vcvtfxs2fp:
   7104   case Intrinsic::aarch64_neon_vcvtfxu2fp:
   7105     return tryCombineFixedPointConvert(N, DCI, DAG);
   7107   case Intrinsic::aarch64_neon_fmax:
   7108     return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
   7109                        N->getOperand(1), N->getOperand(2));
   7110   case Intrinsic::aarch64_neon_fmin:
   7111     return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
   7112                        N->getOperand(1), N->getOperand(2));
   7113   case Intrinsic::aarch64_neon_smull:
   7114   case Intrinsic::aarch64_neon_umull:
   7115   case Intrinsic::aarch64_neon_pmull:
   7116   case Intrinsic::aarch64_neon_sqdmull:
   7117     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
   7118   case Intrinsic::aarch64_neon_sqshl:
   7119   case Intrinsic::aarch64_neon_uqshl:
   7120   case Intrinsic::aarch64_neon_sqshlu:
   7121   case Intrinsic::aarch64_neon_srshl:
   7122   case Intrinsic::aarch64_neon_urshl:
   7123     return tryCombineShiftImm(IID, N, DAG);
   7124   case Intrinsic::aarch64_crc32b:
   7125   case Intrinsic::aarch64_crc32cb:
   7126     return tryCombineCRC32(0xff, N, DAG);
   7127   case Intrinsic::aarch64_crc32h:
   7128   case Intrinsic::aarch64_crc32ch:
   7129     return tryCombineCRC32(0xffff, N, DAG);
   7130   }
   7131   return SDValue();
   7132 }
   7133 
   7134 static SDValue performExtendCombine(SDNode *N,
   7135                                     TargetLowering::DAGCombinerInfo &DCI,
   7136                                     SelectionDAG &DAG) {
   7137   // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
   7138   // we can convert that DUP into another extract_high (of a bigger DUP), which
   7139   // helps the backend to decide that an sabdl2 would be useful, saving a real
   7140   // extract_high operation.
   7141   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
   7142       N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
   7143     SDNode *ABDNode = N->getOperand(0).getNode();
   7144     unsigned IID = getIntrinsicID(ABDNode);
   7145     if (IID == Intrinsic::aarch64_neon_sabd ||
   7146         IID == Intrinsic::aarch64_neon_uabd) {
   7147       SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
   7148       if (!NewABD.getNode())
   7149         return SDValue();
   7150 
   7151       return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
   7152                          NewABD);
   7153     }
   7154   }
   7155 
   7156   // This is effectively a custom type legalization for AArch64.
   7157   //
   7158   // Type legalization will split an extend of a small, legal type to a larger
   7159   // illegal type by first splitting the destination type, often creating
   7160   // illegal source types, which then get legalized in isel-confusing ways,
   7161   // leading to really terrible codegen. E.g.,
   7162   //   %result = v8i32 sext v8i8 %value
   7163   // becomes
   7164   //   %losrc = extract_subreg %value, ...
   7165   //   %hisrc = extract_subreg %value, ...
   7166   //   %lo = v4i32 sext v4i8 %losrc
   7167   //   %hi = v4i32 sext v4i8 %hisrc
   7168   // Things go rapidly downhill from there.
   7169   //
   7170   // For AArch64, the [sz]ext vector instructions can only go up one element
   7171   // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
   7172   // take two instructions.
   7173   //
   7174   // This implies that the most efficient way to do the extend from v8i8
   7175   // to two v4i32 values is to first extend the v8i8 to v8i16, then let
   7176   // the normal splitting happen for the v8i16->v8i32.
   7177 
   7178   // This is pre-legalization to catch some cases where the default
   7179   // type legalization will create ill-tempered code.
   7180   if (!DCI.isBeforeLegalizeOps())
   7181     return SDValue();
   7182 
   7183   // We're only interested in cleaning things up for non-legal vector types
   7184   // here. If both the source and destination are legal, things will just
   7185   // work naturally without any fiddling.
   7186   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   7187   EVT ResVT = N->getValueType(0);
   7188   if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
   7189     return SDValue();
   7190   // If the vector type isn't a simple VT, it's beyond the scope of what
   7191   // we're worried about here. Let legalization do its thing and hope for
   7192   // the best.
   7193   if (!ResVT.isSimple())
   7194     return SDValue();
   7195 
   7196   SDValue Src = N->getOperand(0);
   7197   MVT SrcVT = Src->getValueType(0).getSimpleVT();
   7198   // If the source VT is a 64-bit vector, we can play games and get the
   7199   // better results we want.
   7200   if (SrcVT.getSizeInBits() != 64)
   7201     return SDValue();
   7202 
   7203   unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
   7204   unsigned ElementCount = SrcVT.getVectorNumElements();
   7205   SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
   7206   SDLoc DL(N);
   7207   Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
   7208 
   7209   // Now split the rest of the operation into two halves, each with a 64
   7210   // bit source.
   7211   EVT LoVT, HiVT;
   7212   SDValue Lo, Hi;
   7213   unsigned NumElements = ResVT.getVectorNumElements();
   7214   assert(!(NumElements & 1) && "Splitting vector, but not in half!");
   7215   LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
   7216                                  ResVT.getVectorElementType(), NumElements / 2);
   7217 
   7218   EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
   7219                                LoVT.getVectorNumElements());
   7220   Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
   7221                    DAG.getIntPtrConstant(0));
   7222   Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
   7223                    DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
   7224   Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
   7225   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
   7226 
   7227   // Now combine the parts back together so we still have a single result
   7228   // like the combiner expects.
   7229   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
   7230 }
   7231 
   7232 /// Replace a vector store of a splatted scalar with scalar stores of that
   7233 /// value. The load/store optimizer pass will merge them into store pair
   7234 /// stores. This has better performance than a splat of the scalar followed by
   7235 /// a split vector store. Even if the stores are not merged, it is four stores
   7236 /// versus a dup followed by an ext.b and two stores.
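        /// As a hedged, purely illustrative example (register names hypothetical),
        /// a v4i32 splat store such as
        ///   dup  v0.4s, w1
        ///   str  q0, [x0]
        /// is instead emitted as four scalar stores
        ///   str  w1, [x0]
        ///   str  w1, [x0, #4]
        ///   str  w1, [x0, #8]
        ///   str  w1, [x0, #12]
        /// which the load/store optimizer can then turn into two stp instructions.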
   7237 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
   7238   SDValue StVal = St->getValue();
   7239   EVT VT = StVal.getValueType();
   7240 
   7241   // Don't replace floating-point stores; they may not be transformed into an
   7242   // stp because of the store pair suppress pass.
   7243   if (VT.isFloatingPoint())
   7244     return SDValue();
   7245 
   7246   // Check for insert vector elements.
   7247   if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
   7248     return SDValue();
   7249 
   7250   // We can express a splat as store pair(s) for 2 or 4 elements.
   7251   unsigned NumVecElts = VT.getVectorNumElements();
   7252   if (NumVecElts != 4 && NumVecElts != 2)
   7253     return SDValue();
   7254   SDValue SplatVal = StVal.getOperand(1);
   7255   unsigned RemainInsertElts = NumVecElts - 1;
   7256 
   7257   // Check that this is a splat.
   7258   while (--RemainInsertElts) {
   7259     SDValue NextInsertElt = StVal.getOperand(0);
   7260     if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
   7261       return SDValue();
   7262     if (NextInsertElt.getOperand(1) != SplatVal)
   7263       return SDValue();
   7264     StVal = NextInsertElt;
   7265   }
   7266   unsigned OrigAlignment = St->getAlignment();
   7267   unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
   7268   unsigned Alignment = std::min(OrigAlignment, EltOffset);
   7269 
   7270   // Create scalar stores. This is at least as good as the code sequence for a
   7271   // split unaligned store, which is a dup.s, ext.b, and two stores.
   7272   // Most of the time these stores will be merged into store pair
   7273   // instructions (stp).
   7274   SDLoc DL(St);
   7275   SDValue BasePtr = St->getBasePtr();
   7276   SDValue NewST1 =
   7277       DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
   7278                    St->isVolatile(), St->isNonTemporal(), St->getAlignment());
   7279 
   7280   unsigned Offset = EltOffset;
   7281   while (--NumVecElts) {
   7282     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
   7283                                     DAG.getConstant(Offset, MVT::i64));
   7284     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
   7285                           St->getPointerInfo(), St->isVolatile(),
   7286                           St->isNonTemporal(), Alignment);
   7287     Offset += EltOffset;
   7288   }
   7289   return NewST1;
   7290 }
   7291 
   7292 static SDValue performSTORECombine(SDNode *N,
   7293                                    TargetLowering::DAGCombinerInfo &DCI,
   7294                                    SelectionDAG &DAG,
   7295                                    const AArch64Subtarget *Subtarget) {
   7296   if (!DCI.isBeforeLegalize())
   7297     return SDValue();
   7298 
   7299   StoreSDNode *S = cast<StoreSDNode>(N);
   7300   if (S->isVolatile())
   7301     return SDValue();
   7302 
   7303   // Cyclone has bad performance on unaligned 16B stores when crossing cache-line
   7304   // and page boundaries. We want to split such stores.
   7305   if (!Subtarget->isCyclone())
   7306     return SDValue();
   7307 
   7308   // Don't split when optimizing for minimum size (-Oz).
   7309   MachineFunction &MF = DAG.getMachineFunction();
   7310   bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
   7311       AttributeSet::FunctionIndex, Attribute::MinSize);
   7312   if (IsMinSize)
   7313     return SDValue();
   7314 
   7315   SDValue StVal = S->getValue();
   7316   EVT VT = StVal.getValueType();
   7317 
   7318   // Don't split v2i64 vectors. Memcpy lowering produces those, and splitting
   7319   // them up regresses performance on micro-benchmarks and olden/bh.
   7320   if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
   7321     return SDValue();
   7322 
   7323   // Split unaligned 16B stores. They are terrible for performance.
   7324   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
   7325   // extensions can use this to mark that it does not want splitting to happen
   7326   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
   7327   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
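          // A rough sketch of the split performed below (V and Ptr are illustrative
          // names, not variables from this function), e.g. for a v4i32 store:
          //   (store (v4i32 V), Ptr)
          //     --> (store (v2i32 (extract_subvector V, 0)), Ptr)
          //         (store (v2i32 (extract_subvector V, 2)), Ptr + 8)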
   7328   if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
   7329       S->getAlignment() <= 2)
   7330     return SDValue();
   7331 
   7332   // If we get a splat of a scalar, convert this vector store to a store of
   7333   // scalars. They will be merged into store pairs, thereby removing two
   7334   // instructions.
   7335   SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
   7336   if (ReplacedSplat != SDValue())
   7337     return ReplacedSplat;
   7338 
   7339   SDLoc DL(S);
   7340   unsigned NumElts = VT.getVectorNumElements() / 2;
   7341   // Split VT into two.
   7342   EVT HalfVT =
   7343       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
   7344   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
   7345                                    DAG.getIntPtrConstant(0));
   7346   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
   7347                                    DAG.getIntPtrConstant(NumElts));
   7348   SDValue BasePtr = S->getBasePtr();
   7349   SDValue NewST1 =
   7350       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
   7351                    S->isVolatile(), S->isNonTemporal(), S->getAlignment());
   7352   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
   7353                                   DAG.getConstant(8, MVT::i64));
   7354   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
   7355                       S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
   7356                       S->getAlignment());
   7357 }
   7358 
   7359 /// Target-specific DAG combine function for post-increment LD1 (lane) and
   7360 /// post-increment LD1R.
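        /// A hedged sketch of the patterns being matched (operands illustrative):
        ///   (insert_vector_elt Vec, (load Addr), Lane), with a nearby
        ///     (add Addr, #elt-size)  --> AArch64ISD::LD1LANEpost
        ///   (AArch64ISD::DUP (load Addr)), with a nearby
        ///     (add Addr, #elt-size)  --> AArch64ISD::LD1DUPpost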
   7361 static SDValue performPostLD1Combine(SDNode *N,
   7362                                      TargetLowering::DAGCombinerInfo &DCI,
   7363                                      bool IsLaneOp) {
   7364   if (DCI.isBeforeLegalizeOps())
   7365     return SDValue();
   7366 
   7367   SelectionDAG &DAG = DCI.DAG;
   7368   EVT VT = N->getValueType(0);
   7369 
   7370   unsigned LoadIdx = IsLaneOp ? 1 : 0;
   7371   SDNode *LD = N->getOperand(LoadIdx).getNode();
   7372   // If it is not a LOAD, we cannot do this combine.
   7373   if (LD->getOpcode() != ISD::LOAD)
   7374     return SDValue();
   7375 
   7376   LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
   7377   EVT MemVT = LoadSDN->getMemoryVT();
   7378   // Check if the memory operand is the same type as the vector element.
   7379   if (MemVT != VT.getVectorElementType())
   7380     return SDValue();
   7381 
   7382   // Check if there are other uses. If so, do not combine as it will introduce
   7383   // an extra load.
   7384   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
   7385        ++UI) {
   7386     if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
   7387       continue;
   7388     if (*UI != N)
   7389       return SDValue();
   7390   }
   7391 
   7392   SDValue Addr = LD->getOperand(1);
   7393   SDValue Vector = N->getOperand(0);
   7394   // Search for a use of the address operand that is an increment.
   7395   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
   7396        Addr.getNode()->use_end(); UI != UE; ++UI) {
   7397     SDNode *User = *UI;
   7398     if (User->getOpcode() != ISD::ADD
   7399         || UI.getUse().getResNo() != Addr.getResNo())
   7400       continue;
   7401 
   7402     // Check that the add is independent of the load.  Otherwise, folding it
   7403     // would create a cycle.
   7404     if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
   7405       continue;
   7406     // Also check that the add is not used in the vector operand.  This would also
   7407     // create a cycle.
   7408     if (User->isPredecessorOf(Vector.getNode()))
   7409       continue;
   7410 
   7411     // If the increment is a constant, it must match the memory ref size.
   7412     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
   7413     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
   7414       uint32_t IncVal = CInc->getZExtValue();
   7415       unsigned NumBytes = VT.getScalarSizeInBits() / 8;
   7416       if (IncVal != NumBytes)
   7417         continue;
   7418       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
   7419     }
   7420 
   7421     SmallVector<SDValue, 8> Ops;
   7422     Ops.push_back(LD->getOperand(0));  // Chain
   7423     if (IsLaneOp) {
   7424       Ops.push_back(Vector);           // The vector to be inserted
   7425       Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
   7426     }
   7427     Ops.push_back(Addr);
   7428     Ops.push_back(Inc);
   7429 
   7430     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
   7431     SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
   7432     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
   7433     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
   7434                                            MemVT,
   7435                                            LoadSDN->getMemOperand());
   7436 
   7437     // Update the uses.
   7438     std::vector<SDValue> NewResults;
   7439     NewResults.push_back(SDValue(LD, 0));             // The result of load
   7440     NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
   7441     DCI.CombineTo(LD, NewResults);
   7442     DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
   7443     DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
   7444 
   7445     break;
   7446   }
   7447   return SDValue();
   7448 }
   7449 
   7450 /// Target-specific DAG combine function for NEON load/store intrinsics
   7451 /// to merge base address updates.
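        /// Purely illustrative (hypothetical registers): a sequence such as
        ///   ld2 { v0.4s, v1.4s }, [x0]
        ///   add x0, x0, #32
        /// is rewritten to the post-incremented form
        ///   ld2 { v0.4s, v1.4s }, [x0], #32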
   7452 static SDValue performNEONPostLDSTCombine(SDNode *N,
   7453                                           TargetLowering::DAGCombinerInfo &DCI,
   7454                                           SelectionDAG &DAG) {
   7455   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
   7456     return SDValue();
   7457 
   7458   unsigned AddrOpIdx = N->getNumOperands() - 1;
   7459   SDValue Addr = N->getOperand(AddrOpIdx);
   7460 
   7461   // Search for a use of the address operand that is an increment.
   7462   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
   7463        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
   7464     SDNode *User = *UI;
   7465     if (User->getOpcode() != ISD::ADD ||
   7466         UI.getUse().getResNo() != Addr.getResNo())
   7467       continue;
   7468 
   7469     // Check that the add is independent of the load/store.  Otherwise, folding
   7470     // it would create a cycle.
   7471     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
   7472       continue;
   7473 
   7474     // Find the new opcode for the updating load/store.
   7475     bool IsStore = false;
   7476     bool IsLaneOp = false;
   7477     bool IsDupOp = false;
   7478     unsigned NewOpc = 0;
   7479     unsigned NumVecs = 0;
   7480     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   7481     switch (IntNo) {
   7482     default: llvm_unreachable("unexpected intrinsic for Neon base update");
   7483     case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
   7484       NumVecs = 2; break;
   7485     case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
   7486       NumVecs = 3; break;
   7487     case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
   7488       NumVecs = 4; break;
   7489     case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
   7490       NumVecs = 2; IsStore = true; break;
   7491     case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
   7492       NumVecs = 3; IsStore = true; break;
   7493     case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
   7494       NumVecs = 4; IsStore = true; break;
   7495     case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
   7496       NumVecs = 2; break;
   7497     case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
   7498       NumVecs = 3; break;
   7499     case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
   7500       NumVecs = 4; break;
   7501     case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
   7502       NumVecs = 2; IsStore = true; break;
   7503     case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
   7504       NumVecs = 3; IsStore = true; break;
   7505     case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
   7506       NumVecs = 4; IsStore = true; break;
   7507     case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
   7508       NumVecs = 2; IsDupOp = true; break;
   7509     case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
   7510       NumVecs = 3; IsDupOp = true; break;
   7511     case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
   7512       NumVecs = 4; IsDupOp = true; break;
   7513     case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
   7514       NumVecs = 2; IsLaneOp = true; break;
   7515     case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
   7516       NumVecs = 3; IsLaneOp = true; break;
   7517     case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
   7518       NumVecs = 4; IsLaneOp = true; break;
   7519     case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
   7520       NumVecs = 2; IsStore = true; IsLaneOp = true; break;
   7521     case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
   7522       NumVecs = 3; IsStore = true; IsLaneOp = true; break;
   7523     case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
   7524       NumVecs = 4; IsStore = true; IsLaneOp = true; break;
   7525     }
   7526 
   7527     EVT VecTy;
   7528     if (IsStore)
   7529       VecTy = N->getOperand(2).getValueType();
   7530     else
   7531       VecTy = N->getValueType(0);
   7532 
   7533     // If the increment is a constant, it must match the memory ref size.
   7534     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
   7535     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
   7536       uint32_t IncVal = CInc->getZExtValue();
   7537       unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
   7538       if (IsLaneOp || IsDupOp)
   7539         NumBytes /= VecTy.getVectorNumElements();
   7540       if (IncVal != NumBytes)
   7541         continue;
   7542       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
   7543     }
   7544     SmallVector<SDValue, 8> Ops;
   7545     Ops.push_back(N->getOperand(0)); // Incoming chain
   7546     // Load-lane and store operations have a vector list as input.
   7547     if (IsLaneOp || IsStore)
   7548       for (unsigned i = 2; i < AddrOpIdx; ++i)
   7549         Ops.push_back(N->getOperand(i));
   7550     Ops.push_back(Addr); // Base register
   7551     Ops.push_back(Inc);
   7552 
   7553     // Return Types.
   7554     EVT Tys[6];
   7555     unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
   7556     unsigned n;
   7557     for (n = 0; n < NumResultVecs; ++n)
   7558       Tys[n] = VecTy;
   7559     Tys[n++] = MVT::i64;  // Type of write back register
   7560     Tys[n] = MVT::Other;  // Type of the chain
   7561     SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
   7562 
   7563     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
   7564     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
   7565                                            MemInt->getMemoryVT(),
   7566                                            MemInt->getMemOperand());
   7567 
   7568     // Update the uses.
   7569     std::vector<SDValue> NewResults;
   7570     for (unsigned i = 0; i < NumResultVecs; ++i) {
   7571       NewResults.push_back(SDValue(UpdN.getNode(), i));
   7572     }
   7573     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
   7574     DCI.CombineTo(N, NewResults);
   7575     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
   7576 
   7577     break;
   7578   }
   7579   return SDValue();
   7580 }
   7581 
   7582 // Optimize compare with zero and branch.
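        // A rough sketch of the intended result (hypothetical operands):
        //   subs wzr, w0, #0 ; b.eq <dest>   -->   cbz  w0, <dest>
        //   subs wzr, w0, #0 ; b.ne <dest>   -->   cbnz w0, <dest>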
   7583 static SDValue performBRCONDCombine(SDNode *N,
   7584                                     TargetLowering::DAGCombinerInfo &DCI,
   7585                                     SelectionDAG &DAG) {
   7586   SDValue Chain = N->getOperand(0);
   7587   SDValue Dest = N->getOperand(1);
   7588   SDValue CCVal = N->getOperand(2);
   7589   SDValue Cmp = N->getOperand(3);
   7590 
   7591   assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
   7592   unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
   7593   if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
   7594     return SDValue();
   7595 
   7596   unsigned CmpOpc = Cmp.getOpcode();
   7597   if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
   7598     return SDValue();
   7599 
   7600   // Only attempt folding if there is only one use of the flag and no use of the
   7601   // value.
   7602   if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
   7603     return SDValue();
   7604 
   7605   SDValue LHS = Cmp.getOperand(0);
   7606   SDValue RHS = Cmp.getOperand(1);
   7607 
   7608   assert(LHS.getValueType() == RHS.getValueType() &&
   7609          "Expected the value type to be the same for both operands!");
   7610   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
   7611     return SDValue();
   7612 
   7613   if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
   7614     std::swap(LHS, RHS);
   7615 
   7616   if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
   7617     return SDValue();
   7618 
   7619   if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
   7620       LHS.getOpcode() == ISD::SRL)
   7621     return SDValue();
   7622 
   7623   // Fold the compare into the branch instruction.
   7624   SDValue BR;
   7625   if (CC == AArch64CC::EQ)
   7626     BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
   7627   else
   7628     BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
   7629 
   7630   // Do not add new nodes to DAG combiner worklist.
   7631   DCI.CombineTo(N, BR, false);
   7632 
   7633   return SDValue();
   7634 }
   7635 
   7636 // vselect (v1i1 setcc) ->
   7637 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
   7638 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
   7639 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
   7640 // such VSELECT.
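        // For example (a sketch; the element type follows the compared operands):
        //   (vselect (v1i1 (setcc (v1i64 a), (v1i64 b), cc)), t, f)
        //     --> (vselect (v1i64 (setcc (v1i64 a), (v1i64 b), cc)), t, f)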
   7641 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
   7642   SDValue N0 = N->getOperand(0);
   7643   EVT CCVT = N0.getValueType();
   7644 
   7645   if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
   7646       CCVT.getVectorElementType() != MVT::i1)
   7647     return SDValue();
   7648 
   7649   EVT ResVT = N->getValueType(0);
   7650   EVT CmpVT = N0.getOperand(0).getValueType();
   7651   // Only combine when the result type is of the same size as the compared
   7652   // operands.
   7653   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
   7654     return SDValue();
   7655 
   7656   SDValue IfTrue = N->getOperand(1);
   7657   SDValue IfFalse = N->getOperand(2);
   7658   SDValue SetCC =
   7659       DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
   7660                    N0.getOperand(0), N0.getOperand(1),
   7661                    cast<CondCodeSDNode>(N0.getOperand(2))->get());
   7662   return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
   7663                      IfTrue, IfFalse);
   7664 }
   7665 
   7666 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
   7667 /// the compare-mask instructions rather than going via NZCV, even if LHS and
   7668 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
   7669 /// with a vector one followed by a DUP shuffle on the result.
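        /// A hedged illustration of the rewrite (operand names are made up):
        ///   (select (setcc i32 a, b), (v4i32 L), (v4i32 R))
        ///     --> Cmp  = (v4i32 (setcc (scalar_to_vector a), (scalar_to_vector b)))
        ///         Mask = (vector_shuffle Cmp, Cmp, <0,0,0,0>)   ; dup lane 0
        ///         (vselect Mask, L, R)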
   7670 static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
   7671   SDValue N0 = N->getOperand(0);
   7672   EVT ResVT = N->getValueType(0);
   7673 
   7674   if (!N->getOperand(1).getValueType().isVector())
   7675     return SDValue();
   7676 
   7677   if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
   7678     return SDValue();
   7679 
   7680   SDLoc DL(N0);
   7681 
   7682   EVT SrcVT = N0.getOperand(0).getValueType();
   7683   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
   7684                            ResVT.getSizeInBits() / SrcVT.getSizeInBits());
   7685   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
   7686 
   7687   // First perform a vector comparison, where lane 0 is the one we're interested
   7688   // in.
   7689   SDValue LHS =
   7690       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
   7691   SDValue RHS =
   7692       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
   7693   SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
   7694 
   7695   // Now duplicate the comparison mask we want across all other lanes.
   7696   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
   7697   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
   7698   Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
   7699                      Mask);
   7700 
   7701   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
   7702 }
   7703 
   7704 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   7705                                                  DAGCombinerInfo &DCI) const {
   7706   SelectionDAG &DAG = DCI.DAG;
   7707   switch (N->getOpcode()) {
   7708   default:
   7709     break;
   7710   case ISD::ADD:
   7711   case ISD::SUB:
   7712     return performAddSubLongCombine(N, DCI, DAG);
   7713   case ISD::XOR:
   7714     return performXorCombine(N, DAG, DCI, Subtarget);
   7715   case ISD::MUL:
   7716     return performMulCombine(N, DAG, DCI, Subtarget);
   7717   case ISD::SINT_TO_FP:
   7718   case ISD::UINT_TO_FP:
   7719     return performIntToFpCombine(N, DAG);
   7720   case ISD::OR:
   7721     return performORCombine(N, DCI, Subtarget);
   7722   case ISD::INTRINSIC_WO_CHAIN:
   7723     return performIntrinsicCombine(N, DCI, Subtarget);
   7724   case ISD::ANY_EXTEND:
   7725   case ISD::ZERO_EXTEND:
   7726   case ISD::SIGN_EXTEND:
   7727     return performExtendCombine(N, DCI, DAG);
   7728   case ISD::BITCAST:
   7729     return performBitcastCombine(N, DCI, DAG);
   7730   case ISD::CONCAT_VECTORS:
   7731     return performConcatVectorsCombine(N, DCI, DAG);
   7732   case ISD::SELECT:
   7733     return performSelectCombine(N, DAG);
   7734   case ISD::VSELECT:
   7735     return performVSelectCombine(N, DCI.DAG);
   7736   case ISD::STORE:
   7737     return performSTORECombine(N, DCI, DAG, Subtarget);
   7738   case AArch64ISD::BRCOND:
   7739     return performBRCONDCombine(N, DCI, DAG);
   7740   case AArch64ISD::DUP:
   7741     return performPostLD1Combine(N, DCI, false);
   7742   case ISD::INSERT_VECTOR_ELT:
   7743     return performPostLD1Combine(N, DCI, true);
   7744   case ISD::INTRINSIC_VOID:
   7745   case ISD::INTRINSIC_W_CHAIN:
   7746     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
   7747     case Intrinsic::aarch64_neon_ld2:
   7748     case Intrinsic::aarch64_neon_ld3:
   7749     case Intrinsic::aarch64_neon_ld4:
   7750     case Intrinsic::aarch64_neon_ld1x2:
   7751     case Intrinsic::aarch64_neon_ld1x3:
   7752     case Intrinsic::aarch64_neon_ld1x4:
   7753     case Intrinsic::aarch64_neon_ld2lane:
   7754     case Intrinsic::aarch64_neon_ld3lane:
   7755     case Intrinsic::aarch64_neon_ld4lane:
   7756     case Intrinsic::aarch64_neon_ld2r:
   7757     case Intrinsic::aarch64_neon_ld3r:
   7758     case Intrinsic::aarch64_neon_ld4r:
   7759     case Intrinsic::aarch64_neon_st2:
   7760     case Intrinsic::aarch64_neon_st3:
   7761     case Intrinsic::aarch64_neon_st4:
   7762     case Intrinsic::aarch64_neon_st1x2:
   7763     case Intrinsic::aarch64_neon_st1x3:
   7764     case Intrinsic::aarch64_neon_st1x4:
   7765     case Intrinsic::aarch64_neon_st2lane:
   7766     case Intrinsic::aarch64_neon_st3lane:
   7767     case Intrinsic::aarch64_neon_st4lane:
   7768       return performNEONPostLDSTCombine(N, DCI, DAG);
   7769     default:
   7770       break;
   7771     }
   7772   }
   7773   return SDValue();
   7774 }
   7775 
   7776 // Check that the return value is used only as a return value, as otherwise
   7777 // we can't perform a tail-call. In particular, we need to check for
   7778 // target ISD nodes that are returns and any other "odd" constructs
   7779 // that the generic analysis code won't necessarily catch.
   7780 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
   7781                                                SDValue &Chain) const {
   7782   if (N->getNumValues() != 1)
   7783     return false;
   7784   if (!N->hasNUsesOfValue(1, 0))
   7785     return false;
   7786 
   7787   SDValue TCChain = Chain;
   7788   SDNode *Copy = *N->use_begin();
   7789   if (Copy->getOpcode() == ISD::CopyToReg) {
   7790     // If the copy has a glue operand, we conservatively assume it isn't safe to
   7791     // perform a tail call.
   7792     if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
   7793         MVT::Glue)
   7794       return false;
   7795     TCChain = Copy->getOperand(0);
   7796   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   7797     return false;
   7798 
   7799   bool HasRet = false;
   7800   for (SDNode *Node : Copy->uses()) {
   7801     if (Node->getOpcode() != AArch64ISD::RET_FLAG)
   7802       return false;
   7803     HasRet = true;
   7804   }
   7805 
   7806   if (!HasRet)
   7807     return false;
   7808 
   7809   Chain = TCChain;
   7810   return true;
   7811 }
   7812 
   7813 // Return whether an instruction can potentially be optimized to a tail
   7814 // call. This will cause the optimizers to attempt to move, or duplicate,
   7815 // return instructions to help enable tail call optimizations for this
   7816 // instruction.
   7817 bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   7818   if (!CI->isTailCall())
   7819     return false;
   7820 
   7821   return true;
   7822 }
   7823 
   7824 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
   7825                                                    SDValue &Offset,
   7826                                                    ISD::MemIndexedMode &AM,
   7827                                                    bool &IsInc,
   7828                                                    SelectionDAG &DAG) const {
   7829   if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
   7830     return false;
   7831 
   7832   Base = Op->getOperand(0);
   7833   // All of the indexed addressing mode instructions take a signed
   7834   // 9-bit immediate offset.
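          // Illustrative only (registers hypothetical): this admits forms such as
          //   ldr x1, [x0, #16]!    ; pre-indexed
          //   ldr x1, [x0], #-32    ; post-indexed
          // as long as the offset fits in the signed 9-bit range checked below.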
   7835   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
   7836     int64_t RHSC = (int64_t)RHS->getZExtValue();
   7837     if (RHSC >= 256 || RHSC <= -256)
   7838       return false;
   7839     IsInc = (Op->getOpcode() == ISD::ADD);
   7840     Offset = Op->getOperand(1);
   7841     return true;
   7842   }
   7843   return false;
   7844 }
   7845 
   7846 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
   7847                                                       SDValue &Offset,
   7848                                                       ISD::MemIndexedMode &AM,
   7849                                                       SelectionDAG &DAG) const {
   7850   EVT VT;
   7851   SDValue Ptr;
   7852   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
   7853     VT = LD->getMemoryVT();
   7854     Ptr = LD->getBasePtr();
   7855   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
   7856     VT = ST->getMemoryVT();
   7857     Ptr = ST->getBasePtr();
   7858   } else
   7859     return false;
   7860 
   7861   bool IsInc;
   7862   if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
   7863     return false;
   7864   AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
   7865   return true;
   7866 }
   7867 
   7868 bool AArch64TargetLowering::getPostIndexedAddressParts(
   7869     SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
   7870     ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
   7871   EVT VT;
   7872   SDValue Ptr;
   7873   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
   7874     VT = LD->getMemoryVT();
   7875     Ptr = LD->getBasePtr();
   7876   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
   7877     VT = ST->getMemoryVT();
   7878     Ptr = ST->getBasePtr();
   7879   } else
   7880     return false;
   7881 
   7882   bool IsInc;
   7883   if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
   7884     return false;
   7885   // Post-indexing updates the base, so it's not a valid transform
   7886   // if that's not the same as the load's pointer.
   7887   if (Ptr != Base)
   7888     return false;
   7889   AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
   7890   return true;
   7891 }
   7892 
   7893 void AArch64TargetLowering::ReplaceNodeResults(
   7894     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   7895   switch (N->getOpcode()) {
   7896   default:
   7897     llvm_unreachable("Don't know how to custom expand this");
   7898   case ISD::FP_TO_UINT:
   7899   case ISD::FP_TO_SINT:
   7900     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
   7901     // Let normal code take care of it by not adding anything to Results.
   7902     return;
   7903   }
   7904 }
   7905 
   7906 bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
   7907   // Loads and stores less than 128 bits are already atomic; ones above that
   7908   // are doomed anyway, so defer to the default libcall and blame the OS when
   7909   // things go wrong:
   7910   if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
   7911     return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
   7912   else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
   7913     return LI->getType()->getPrimitiveSizeInBits() == 128;
   7914 
   7915   // For the real atomic operations, we have ldxr/stxr up to 128 bits.
   7916   return Inst->getType()->getPrimitiveSizeInBits() <= 128;
   7917 }
   7918 
   7919 TargetLoweringBase::LegalizeTypeAction
   7920 AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
   7921   MVT SVT = VT.getSimpleVT();
   7922   // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
   7923   // v4i16, v2i32 (and likewise v1f32) instead of promoting.
   7924   if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
   7925       || SVT == MVT::v1f32)
   7926     return TypeWidenVector;
   7927 
   7928   return TargetLoweringBase::getPreferredVectorAction(VT);
   7929 }
   7930 
   7931 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
   7932                                              AtomicOrdering Ord) const {
   7933   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   7934   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
   7935   bool IsAcquire =
   7936       Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
   7937 
   7938   // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
   7939   // intrinsic must return {i64, i64} and we have to recombine them into a
   7940   // single i128 here.
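          // A rough sketch of the IR emitted here for an acquire i128 load (value
          // names are hypothetical):
          //   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
          //   %lo   = zext i64 (extractvalue %lohi, 0) to i128
          //   %hi   = zext i64 (extractvalue %lohi, 1) to i128
          //   %val  = or i128 %lo, (shl i128 %hi, 64)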
   7941   if (ValTy->getPrimitiveSizeInBits() == 128) {
   7942     Intrinsic::ID Int =
   7943         IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
   7944     Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
   7945 
   7946     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
   7947     Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
   7948 
   7949     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
   7950     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
   7951     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
   7952     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
   7953     return Builder.CreateOr(
   7954         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
   7955   }
   7956 
   7957   Type *Tys[] = { Addr->getType() };
   7958   Intrinsic::ID Int =
   7959       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
   7960   Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
   7961 
   7962   return Builder.CreateTruncOrBitCast(
   7963       Builder.CreateCall(Ldxr, Addr),
   7964       cast<PointerType>(Addr->getType())->getElementType());
   7965 }
   7966 
   7967 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
   7968                                                    Value *Val, Value *Addr,
   7969                                                    AtomicOrdering Ord) const {
   7970   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   7971   bool IsRelease =
   7972       Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
   7973 
   7974   // Since the intrinsics must have legal type, the i128 intrinsics take two
   7975   // parameters: "i64, i64". We must marshal Val into the appropriate form
   7976   // before the call.
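          // Roughly (illustrative value names), a release i128 store becomes:
          //   %lo = trunc i128 %val to i64
          //   %hi = trunc i128 (lshr i128 %val, 64) to i64
          //   %ok = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, i8* %addr)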
   7977   if (Val->getType()->getPrimitiveSizeInBits() == 128) {
   7978     Intrinsic::ID Int =
   7979         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
   7980     Function *Stxr = Intrinsic::getDeclaration(M, Int);
   7981     Type *Int64Ty = Type::getInt64Ty(M->getContext());
   7982 
   7983     Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
   7984     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
   7985     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
   7986     return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
   7987   }
   7988 
   7989   Intrinsic::ID Int =
   7990       IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
   7991   Type *Tys[] = { Addr->getType() };
   7992   Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
   7993 
   7994   return Builder.CreateCall2(
   7995       Stxr, Builder.CreateZExtOrBitCast(
   7996                 Val, Stxr->getFunctionType()->getParamType(0)),
   7997       Addr);
   7998 }
   7999