1 //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines the interfaces that ARM uses to lower LLVM code into a 11 // selection DAG. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "ARMISelLowering.h" 16 #include "ARMCallingConv.h" 17 #include "ARMConstantPoolValue.h" 18 #include "ARMMachineFunctionInfo.h" 19 #include "ARMPerfectShuffle.h" 20 #include "ARMSubtarget.h" 21 #include "ARMTargetMachine.h" 22 #include "ARMTargetObjectFile.h" 23 #include "MCTargetDesc/ARMAddressingModes.h" 24 #include "llvm/ADT/Statistic.h" 25 #include "llvm/ADT/StringExtras.h" 26 #include "llvm/ADT/StringSwitch.h" 27 #include "llvm/CodeGen/CallingConvLower.h" 28 #include "llvm/CodeGen/IntrinsicLowering.h" 29 #include "llvm/CodeGen/MachineBasicBlock.h" 30 #include "llvm/CodeGen/MachineFrameInfo.h" 31 #include "llvm/CodeGen/MachineFunction.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineJumpTableInfo.h" 34 #include "llvm/CodeGen/MachineModuleInfo.h" 35 #include "llvm/CodeGen/MachineRegisterInfo.h" 36 #include "llvm/CodeGen/SelectionDAG.h" 37 #include "llvm/IR/CallingConv.h" 38 #include "llvm/IR/Constants.h" 39 #include "llvm/IR/Function.h" 40 #include "llvm/IR/GlobalValue.h" 41 #include "llvm/IR/IRBuilder.h" 42 #include "llvm/IR/Instruction.h" 43 #include "llvm/IR/Instructions.h" 44 #include "llvm/IR/IntrinsicInst.h" 45 #include "llvm/IR/Intrinsics.h" 46 #include "llvm/IR/Type.h" 47 #include "llvm/MC/MCSectionMachO.h" 48 #include "llvm/Support/CommandLine.h" 49 #include "llvm/Support/Debug.h" 50 #include "llvm/Support/ErrorHandling.h" 51 #include "llvm/Support/MathExtras.h" 52 #include "llvm/Support/raw_ostream.h" 53 #include "llvm/Target/TargetOptions.h" 54 #include <utility> 55 using namespace llvm; 56 57 #define DEBUG_TYPE "arm-isel" 58 59 STATISTIC(NumTailCalls, "Number of tail calls"); 60 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 61 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 62 63 static cl::opt<bool> 64 ARMInterworking("arm-interworking", cl::Hidden, 65 cl::desc("Enable / disable ARM interworking (for debugging only)"), 66 cl::init(true)); 67 68 namespace { 69 class ARMCCState : public CCState { 70 public: 71 ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, 72 SmallVectorImpl<CCValAssign> &locs, LLVMContext &C, 73 ParmContext PC) 74 : CCState(CC, isVarArg, MF, locs, C) { 75 assert(((PC == Call) || (PC == Prologue)) && 76 "ARMCCState users must specify whether their context is call" 77 "or prologue generation."); 78 CallOrPrologue = PC; 79 } 80 }; 81 } 82 83 // The APCS parameter registers. 
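// Under both APCS and the AAPCS base standard the first four 32-bit argument
// words are passed in r0-r3 and anything further goes on the stack, so these
// are the registers the varargs/byval handling below has to save.
// Illustrative example (not part of this file):
//
//   int f(int a, int b, int c, int d, int e);
//   //     r0     r1     r2     r3     [sp]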
84 static const MCPhysReg GPRArgRegs[] = { 85 ARM::R0, ARM::R1, ARM::R2, ARM::R3 86 }; 87 88 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, 89 MVT PromotedBitwiseVT) { 90 if (VT != PromotedLdStVT) { 91 setOperationAction(ISD::LOAD, VT, Promote); 92 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); 93 94 setOperationAction(ISD::STORE, VT, Promote); 95 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); 96 } 97 98 MVT ElemTy = VT.getVectorElementType(); 99 if (ElemTy != MVT::i64 && ElemTy != MVT::f64) 100 setOperationAction(ISD::SETCC, VT, Custom); 101 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 102 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 103 if (ElemTy == MVT::i32) { 104 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 105 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 106 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 107 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 108 } else { 109 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 110 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 111 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 112 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 113 } 114 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 115 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 116 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 117 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 118 setOperationAction(ISD::SELECT, VT, Expand); 119 setOperationAction(ISD::SELECT_CC, VT, Expand); 120 setOperationAction(ISD::VSELECT, VT, Expand); 121 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 122 if (VT.isInteger()) { 123 setOperationAction(ISD::SHL, VT, Custom); 124 setOperationAction(ISD::SRA, VT, Custom); 125 setOperationAction(ISD::SRL, VT, Custom); 126 } 127 128 // Promote all bit-wise operations. 129 if (VT.isInteger() && VT != PromotedBitwiseVT) { 130 setOperationAction(ISD::AND, VT, Promote); 131 AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); 132 setOperationAction(ISD::OR, VT, Promote); 133 AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); 134 setOperationAction(ISD::XOR, VT, Promote); 135 AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); 136 } 137 138 // Neon does not support vector divide/remainder operations. 139 setOperationAction(ISD::SDIV, VT, Expand); 140 setOperationAction(ISD::UDIV, VT, Expand); 141 setOperationAction(ISD::FDIV, VT, Expand); 142 setOperationAction(ISD::SREM, VT, Expand); 143 setOperationAction(ISD::UREM, VT, Expand); 144 setOperationAction(ISD::FREM, VT, Expand); 145 146 if (!VT.isFloatingPoint() && 147 VT != MVT::v2i64 && VT != MVT::v1i64) 148 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 149 setOperationAction(Opcode, VT, Legal); 150 } 151 152 void ARMTargetLowering::addDRTypeForNEON(MVT VT) { 153 addRegisterClass(VT, &ARM::DPRRegClass); 154 addTypeForNEON(VT, MVT::f64, MVT::v2i32); 155 } 156 157 void ARMTargetLowering::addQRTypeForNEON(MVT VT) { 158 addRegisterClass(VT, &ARM::DPairRegClass); 159 addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); 160 } 161 162 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, 163 const ARMSubtarget &STI) 164 : TargetLowering(TM), Subtarget(&STI) { 165 RegInfo = Subtarget->getRegisterInfo(); 166 Itins = Subtarget->getInstrItineraryData(); 167 168 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 169 170 if (Subtarget->isTargetMachO()) { 171 // Uses VFP for Thumb libfuncs if available. 
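    // These "*vfp" helpers are hard-float variants of the usual soft-float
    // routines (they pass and return values in VFP registers). For the
    // comparison entries, setCmpLibcallCC records how the helper's integer
    // result must be tested; roughly, at the C level (illustrative only):
    //
    //   extern "C" int __eqsf2vfp(float, float);   // returns nonzero if equal
    //   bool oeq = __eqsf2vfp(a, b) != 0;          // i.e. ISD::SETNE vs. zero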
172 if (Subtarget->isThumb() && Subtarget->hasVFP2() && 173 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 174 static const struct { 175 const RTLIB::Libcall Op; 176 const char * const Name; 177 const ISD::CondCode Cond; 178 } LibraryCalls[] = { 179 // Single-precision floating-point arithmetic. 180 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 181 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 182 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 183 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 184 185 // Double-precision floating-point arithmetic. 186 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 187 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 188 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 189 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 190 191 // Single-precision comparisons. 192 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 193 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 194 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 195 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 196 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 197 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 198 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 199 { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, 200 201 // Double-precision comparisons. 202 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 203 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 204 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 205 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 206 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 207 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 208 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 209 { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, 210 211 // Floating-point to integer conversions. 212 // i64 conversions are done via library routines even when generating VFP 213 // instructions, so use the same ones. 214 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 215 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 216 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 217 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 218 219 // Conversions between floating types. 220 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 221 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 222 223 // Integer to floating-point conversions. 224 // i64 conversions are done via library routines even when generating VFP 225 // instructions, so use the same ones. 226 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 227 // e.g., __floatunsidf vs. __floatunssidfvfp. 228 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 229 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 230 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 231 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 232 }; 233 234 for (const auto &LC : LibraryCalls) { 235 setLibcallName(LC.Op, LC.Name); 236 if (LC.Cond != ISD::SETCC_INVALID) 237 setCmpLibcallCC(LC.Op, LC.Cond); 238 } 239 } 240 241 // Set the correct calling convention for ARMv7k WatchOS. It's just 242 // AAPCS_VFP for functions as simple as libcalls. 243 if (Subtarget->isTargetWatchABI()) { 244 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) 245 setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP); 246 } 247 } 248 249 // These libcalls are not available in 32-bit. 
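  // Their default names are the 128-bit shift helpers (__ashlti3, __lshrti3,
  // __ashrti3), which 32-bit ARM runtimes do not provide, so the entries are
  // cleared rather than risking calls that could never be resolved.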
250 setLibcallName(RTLIB::SHL_I128, nullptr); 251 setLibcallName(RTLIB::SRL_I128, nullptr); 252 setLibcallName(RTLIB::SRA_I128, nullptr); 253 254 // RTLIB 255 if (Subtarget->isAAPCS_ABI() && 256 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || 257 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { 258 static const struct { 259 const RTLIB::Libcall Op; 260 const char * const Name; 261 const CallingConv::ID CC; 262 const ISD::CondCode Cond; 263 } LibraryCalls[] = { 264 // Double-precision floating-point arithmetic helper functions 265 // RTABI chapter 4.1.2, Table 2 266 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 267 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 268 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 269 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 270 271 // Double-precision floating-point comparison helper functions 272 // RTABI chapter 4.1.2, Table 3 273 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 274 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 275 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 276 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 277 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 278 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 279 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 280 { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, 281 282 // Single-precision floating-point arithmetic helper functions 283 // RTABI chapter 4.1.2, Table 4 284 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 285 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 286 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 287 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 288 289 // Single-precision floating-point comparison helper functions 290 // RTABI chapter 4.1.2, Table 5 291 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 292 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 293 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 294 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 295 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 296 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 297 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 298 { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, 299 300 // Floating-point to integer conversions. 
301 // RTABI chapter 4.1.2, Table 6 302 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 303 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 304 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 305 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 306 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 307 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 308 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 309 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 310 311 // Conversions between floating types. 312 // RTABI chapter 4.1.2, Table 7 313 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 314 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 315 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 316 317 // Integer to floating-point conversions. 318 // RTABI chapter 4.1.2, Table 8 319 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 320 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 321 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 322 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 323 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 324 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 325 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 326 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 327 328 // Long long helper functions 329 // RTABI chapter 4.2, Table 9 330 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 331 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 332 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 333 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 334 335 // Integer division functions 336 // RTABI chapter 4.3.1 337 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 338 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 339 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 340 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 341 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 342 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 343 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 344 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 345 }; 346 347 for (const auto &LC : LibraryCalls) { 348 setLibcallName(LC.Op, LC.Name); 349 setLibcallCallingConv(LC.Op, LC.CC); 350 if (LC.Cond != ISD::SETCC_INVALID) 351 setCmpLibcallCC(LC.Op, LC.Cond); 352 } 353 354 // EABI dependent RTLIB 355 if (TM.Options.EABIVersion == EABI::EABI4 || 356 TM.Options.EABIVersion == EABI::EABI5) { 357 static const struct { 358 const RTLIB::Libcall Op; 359 const char *const Name; 360 
const CallingConv::ID CC; 361 const ISD::CondCode Cond; 362 } MemOpsLibraryCalls[] = { 363 // Memory operations 364 // RTABI chapter 4.3.4 365 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 366 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 367 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 368 }; 369 370 for (const auto &LC : MemOpsLibraryCalls) { 371 setLibcallName(LC.Op, LC.Name); 372 setLibcallCallingConv(LC.Op, LC.CC); 373 if (LC.Cond != ISD::SETCC_INVALID) 374 setCmpLibcallCC(LC.Op, LC.Cond); 375 } 376 } 377 } 378 379 if (Subtarget->isTargetWindows()) { 380 static const struct { 381 const RTLIB::Libcall Op; 382 const char * const Name; 383 const CallingConv::ID CC; 384 } LibraryCalls[] = { 385 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, 386 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, 387 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, 388 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, 389 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, 390 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, 391 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, 392 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, 393 }; 394 395 for (const auto &LC : LibraryCalls) { 396 setLibcallName(LC.Op, LC.Name); 397 setLibcallCallingConv(LC.Op, LC.CC); 398 } 399 } 400 401 // Use divmod compiler-rt calls for iOS 5.0 and later. 402 if (Subtarget->isTargetWatchOS() || 403 (Subtarget->isTargetIOS() && 404 !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { 405 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); 406 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); 407 } 408 409 // The half <-> float conversion functions are always soft-float on 410 // non-watchos platforms, but are needed for some targets which use a 411 // hard-float calling convention by default. 412 if (!Subtarget->isTargetWatchABI()) { 413 if (Subtarget->isAAPCS_ABI()) { 414 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); 415 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); 416 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); 417 } else { 418 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); 419 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); 420 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); 421 } 422 } 423 424 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have 425 // a __gnu_ prefix (which is the default). 
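  // (The defaults are the __gnu_-prefixed names such as __gnu_f2h_ieee and
  // __gnu_h2f_ieee.) Roughly, the RTABI spellings used below look like this
  // at the C level (illustrative signatures):
  //
  //   extern "C" unsigned short __aeabi_f2h(float);          // f32 -> f16 bits
  //   extern "C" float          __aeabi_h2f(unsigned short); // f16 bits -> f32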
426 if (Subtarget->isTargetAEABI()) { 427 setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); 428 setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); 429 setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); 430 } 431 432 if (Subtarget->isThumb1Only()) 433 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 434 else 435 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 436 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && 437 !Subtarget->isThumb1Only()) { 438 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 439 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 440 } 441 442 for (MVT VT : MVT::vector_valuetypes()) { 443 for (MVT InnerVT : MVT::vector_valuetypes()) { 444 setTruncStoreAction(VT, InnerVT, Expand); 445 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 446 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 447 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 448 } 449 450 setOperationAction(ISD::MULHS, VT, Expand); 451 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 452 setOperationAction(ISD::MULHU, VT, Expand); 453 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 454 455 setOperationAction(ISD::BSWAP, VT, Expand); 456 } 457 458 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 459 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 460 461 setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 462 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 463 464 if (Subtarget->hasNEON()) { 465 addDRTypeForNEON(MVT::v2f32); 466 addDRTypeForNEON(MVT::v8i8); 467 addDRTypeForNEON(MVT::v4i16); 468 addDRTypeForNEON(MVT::v2i32); 469 addDRTypeForNEON(MVT::v1i64); 470 471 addQRTypeForNEON(MVT::v4f32); 472 addQRTypeForNEON(MVT::v2f64); 473 addQRTypeForNEON(MVT::v16i8); 474 addQRTypeForNEON(MVT::v8i16); 475 addQRTypeForNEON(MVT::v4i32); 476 addQRTypeForNEON(MVT::v2i64); 477 478 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 479 // neither Neon nor VFP support any arithmetic operations on it. 480 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively 481 // supported for v4f32. 482 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 483 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 484 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 485 // FIXME: Code duplication: FDIV and FREM are expanded always, see 486 // ARMTargetLowering::addTypeForNEON method for details. 487 setOperationAction(ISD::FDIV, MVT::v2f64, Expand); 488 setOperationAction(ISD::FREM, MVT::v2f64, Expand); 489 // FIXME: Create unittest. 490 // In another words, find a way when "copysign" appears in DAG with vector 491 // operands. 492 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); 493 // FIXME: Code duplication: SETCC has custom operation action, see 494 // ARMTargetLowering::addTypeForNEON method for details. 495 setOperationAction(ISD::SETCC, MVT::v2f64, Expand); 496 // FIXME: Create unittest for FNEG and for FABS. 
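    // Expanding these f64 vector operations scalarizes them; the arithmetic
    // pieces then use plain double-precision VFP instructions (or libcalls
    // where even scalar f64 is unsupported). Illustrative, using the
    // Clang/GCC vector extension:
    //
    //   typedef double d2 __attribute__((vector_size(16)));
    //   d2 add(d2 a, d2 b) { return a + b; }   // roughly two scalar VADD.F64s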
497 setOperationAction(ISD::FNEG, MVT::v2f64, Expand); 498 setOperationAction(ISD::FABS, MVT::v2f64, Expand); 499 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); 500 setOperationAction(ISD::FSIN, MVT::v2f64, Expand); 501 setOperationAction(ISD::FCOS, MVT::v2f64, Expand); 502 setOperationAction(ISD::FPOWI, MVT::v2f64, Expand); 503 setOperationAction(ISD::FPOW, MVT::v2f64, Expand); 504 setOperationAction(ISD::FLOG, MVT::v2f64, Expand); 505 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); 506 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); 507 setOperationAction(ISD::FEXP, MVT::v2f64, Expand); 508 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); 509 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. 510 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); 511 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); 512 setOperationAction(ISD::FRINT, MVT::v2f64, Expand); 513 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); 514 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); 515 setOperationAction(ISD::FMA, MVT::v2f64, Expand); 516 517 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); 518 setOperationAction(ISD::FSIN, MVT::v4f32, Expand); 519 setOperationAction(ISD::FCOS, MVT::v4f32, Expand); 520 setOperationAction(ISD::FPOWI, MVT::v4f32, Expand); 521 setOperationAction(ISD::FPOW, MVT::v4f32, Expand); 522 setOperationAction(ISD::FLOG, MVT::v4f32, Expand); 523 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); 524 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); 525 setOperationAction(ISD::FEXP, MVT::v4f32, Expand); 526 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); 527 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); 528 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); 529 setOperationAction(ISD::FRINT, MVT::v4f32, Expand); 530 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); 531 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); 532 533 // Mark v2f32 intrinsics. 534 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); 535 setOperationAction(ISD::FSIN, MVT::v2f32, Expand); 536 setOperationAction(ISD::FCOS, MVT::v2f32, Expand); 537 setOperationAction(ISD::FPOWI, MVT::v2f32, Expand); 538 setOperationAction(ISD::FPOW, MVT::v2f32, Expand); 539 setOperationAction(ISD::FLOG, MVT::v2f32, Expand); 540 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); 541 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); 542 setOperationAction(ISD::FEXP, MVT::v2f32, Expand); 543 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); 544 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); 545 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); 546 setOperationAction(ISD::FRINT, MVT::v2f32, Expand); 547 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); 548 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); 549 550 // Neon does not support some operations on v1i64 and v2i64 types. 551 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 552 // Custom handling for some quad-vector types to detect VMULL. 
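    // For example, when a loop like the following is vectorized, the
    // sign-extending widening multiply can be matched to a single VMULL
    // (vmull.s16) instead of lengthening moves followed by a full-width
    // vector multiply; that is why MUL is marked Custom on the wider types
    // below (illustrative only):
    //
    //   void mul4(const short *a, const short *b, int *r) {
    //     for (int i = 0; i != 4; ++i)
    //       r[i] = (int)a[i] * (int)b[i];
    //   }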
553 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 554 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 555 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 556 // Custom handling for some vector types to avoid expensive expansions 557 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 558 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 559 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 560 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 561 setOperationAction(ISD::SETCC, MVT::v1i64, Expand); 562 setOperationAction(ISD::SETCC, MVT::v2i64, Expand); 563 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with 564 // a destination type that is wider than the source, and nor does 565 // it have a FP_TO_[SU]INT instruction with a narrower destination than 566 // source. 567 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 568 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 569 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); 570 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); 571 572 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 573 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); 574 575 // NEON does not have single instruction CTPOP for vectors with element 576 // types wider than 8-bits. However, custom lowering can leverage the 577 // v8i8/v16i8 vcnt instruction. 578 setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); 579 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 580 setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); 581 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); 582 setOperationAction(ISD::CTPOP, MVT::v1i64, Expand); 583 setOperationAction(ISD::CTPOP, MVT::v2i64, Expand); 584 585 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 586 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 587 588 // NEON does not have single instruction CTTZ for vectors. 589 setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); 590 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); 591 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); 592 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 593 594 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); 595 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); 596 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); 597 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); 598 599 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); 600 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); 601 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); 602 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); 603 604 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); 605 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); 606 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 607 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 608 609 // NEON only has FMA instructions as of VFP4. 
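    // (VFMA/VFMS arrive with VFPv4/NEONv2.) Without them a fused multiply-add
    // cannot be selected directly, so ISD::FMA is expanded; scalar forms fall
    // back to the fma/fmaf libcalls to preserve the single rounding.
    // Illustrative:
    //
    //   float r = fmaf(a, b, c);  // one vfma.f32 with VFP4, a libcall otherwise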
610 if (!Subtarget->hasVFP4()) { 611 setOperationAction(ISD::FMA, MVT::v2f32, Expand); 612 setOperationAction(ISD::FMA, MVT::v4f32, Expand); 613 } 614 615 setTargetDAGCombine(ISD::INTRINSIC_VOID); 616 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 617 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 618 setTargetDAGCombine(ISD::SHL); 619 setTargetDAGCombine(ISD::SRL); 620 setTargetDAGCombine(ISD::SRA); 621 setTargetDAGCombine(ISD::SIGN_EXTEND); 622 setTargetDAGCombine(ISD::ZERO_EXTEND); 623 setTargetDAGCombine(ISD::ANY_EXTEND); 624 setTargetDAGCombine(ISD::BUILD_VECTOR); 625 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 626 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 627 setTargetDAGCombine(ISD::STORE); 628 setTargetDAGCombine(ISD::FP_TO_SINT); 629 setTargetDAGCombine(ISD::FP_TO_UINT); 630 setTargetDAGCombine(ISD::FDIV); 631 setTargetDAGCombine(ISD::LOAD); 632 633 // It is legal to extload from v4i8 to v4i16 or v4i32. 634 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, 635 MVT::v2i32}) { 636 for (MVT VT : MVT::integer_vector_valuetypes()) { 637 setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); 638 setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); 639 setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); 640 } 641 } 642 } 643 644 // ARM and Thumb2 support UMLAL/SMLAL. 645 if (!Subtarget->isThumb1Only()) 646 setTargetDAGCombine(ISD::ADDC); 647 648 if (Subtarget->isFPOnlySP()) { 649 // When targeting a floating-point unit with only single-precision 650 // operations, f64 is legal for the few double-precision instructions which 651 // are present However, no double-precision operations other than moves, 652 // loads and stores are provided by the hardware. 653 setOperationAction(ISD::FADD, MVT::f64, Expand); 654 setOperationAction(ISD::FSUB, MVT::f64, Expand); 655 setOperationAction(ISD::FMUL, MVT::f64, Expand); 656 setOperationAction(ISD::FMA, MVT::f64, Expand); 657 setOperationAction(ISD::FDIV, MVT::f64, Expand); 658 setOperationAction(ISD::FREM, MVT::f64, Expand); 659 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 660 setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); 661 setOperationAction(ISD::FNEG, MVT::f64, Expand); 662 setOperationAction(ISD::FABS, MVT::f64, Expand); 663 setOperationAction(ISD::FSQRT, MVT::f64, Expand); 664 setOperationAction(ISD::FSIN, MVT::f64, Expand); 665 setOperationAction(ISD::FCOS, MVT::f64, Expand); 666 setOperationAction(ISD::FPOWI, MVT::f64, Expand); 667 setOperationAction(ISD::FPOW, MVT::f64, Expand); 668 setOperationAction(ISD::FLOG, MVT::f64, Expand); 669 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 670 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 671 setOperationAction(ISD::FEXP, MVT::f64, Expand); 672 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 673 setOperationAction(ISD::FCEIL, MVT::f64, Expand); 674 setOperationAction(ISD::FTRUNC, MVT::f64, Expand); 675 setOperationAction(ISD::FRINT, MVT::f64, Expand); 676 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); 677 setOperationAction(ISD::FFLOOR, MVT::f64, Expand); 678 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 679 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 680 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 681 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 682 setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); 683 setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); 684 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 685 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); 686 } 687 688 
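  // In concrete terms, on a single-precision-only FPU (FPv4-SP/FPv5-SP cores)
  // the block above keeps f64 legal only so that D-register moves, loads and
  // stores still work; all f64 arithmetic is expanded, typically into the
  // AEABI/libgcc helpers. Illustrative:
  //
  //   double sum(double a, double b) { return a + b; } // -> __aeabi_dadd (or __adddf3)
  //   double get(const double *p)    { return *p; }    // -> a single 64-bit load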
computeRegisterProperties(Subtarget->getRegisterInfo()); 689 690 // ARM does not have floating-point extending loads. 691 for (MVT VT : MVT::fp_valuetypes()) { 692 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 693 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 694 } 695 696 // ... or truncating stores 697 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 698 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 699 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 700 701 // ARM does not have i1 sign extending load. 702 for (MVT VT : MVT::integer_valuetypes()) 703 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 704 705 // ARM supports all 4 flavors of integer indexed load / store. 706 if (!Subtarget->isThumb1Only()) { 707 for (unsigned im = (unsigned)ISD::PRE_INC; 708 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 709 setIndexedLoadAction(im, MVT::i1, Legal); 710 setIndexedLoadAction(im, MVT::i8, Legal); 711 setIndexedLoadAction(im, MVT::i16, Legal); 712 setIndexedLoadAction(im, MVT::i32, Legal); 713 setIndexedStoreAction(im, MVT::i1, Legal); 714 setIndexedStoreAction(im, MVT::i8, Legal); 715 setIndexedStoreAction(im, MVT::i16, Legal); 716 setIndexedStoreAction(im, MVT::i32, Legal); 717 } 718 } 719 720 setOperationAction(ISD::SADDO, MVT::i32, Custom); 721 setOperationAction(ISD::UADDO, MVT::i32, Custom); 722 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 723 setOperationAction(ISD::USUBO, MVT::i32, Custom); 724 725 // i64 operation support. 726 setOperationAction(ISD::MUL, MVT::i64, Expand); 727 setOperationAction(ISD::MULHU, MVT::i32, Expand); 728 if (Subtarget->isThumb1Only()) { 729 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 730 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 731 } 732 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 733 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 734 setOperationAction(ISD::MULHS, MVT::i32, Expand); 735 736 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 737 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 738 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 739 setOperationAction(ISD::SRL, MVT::i64, Custom); 740 setOperationAction(ISD::SRA, MVT::i64, Custom); 741 742 if (!Subtarget->isThumb1Only()) { 743 // FIXME: We should do this for Thumb1 as well. 744 setOperationAction(ISD::ADDC, MVT::i32, Custom); 745 setOperationAction(ISD::ADDE, MVT::i32, Custom); 746 setOperationAction(ISD::SUBC, MVT::i32, Custom); 747 setOperationAction(ISD::SUBE, MVT::i32, Custom); 748 } 749 750 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 751 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 752 753 // ARM does not have ROTL. 754 setOperationAction(ISD::ROTL, MVT::i32, Expand); 755 for (MVT VT : MVT::vector_valuetypes()) { 756 setOperationAction(ISD::ROTL, VT, Expand); 757 setOperationAction(ISD::ROTR, VT, Expand); 758 } 759 setOperationAction(ISD::CTTZ, MVT::i32, Custom); 760 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 761 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) 762 setOperationAction(ISD::CTLZ, MVT::i32, Expand); 763 764 // @llvm.readcyclecounter requires the Performance Monitors extension. 765 // Default to the 0 expansion on unsupported platforms. 766 // FIXME: Technically there are older ARM CPUs that have 767 // implementation-specific ways of obtaining this information. 768 if (Subtarget->hasPerfMon()) 769 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); 770 771 // Only ARMv6 has BSWAP. 
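  // (ARMv6 added the REV family.) On older cores the Expand below turns a
  // 32-bit byte swap into a shift/AND/OR sequence. Illustrative:
  //
  //   unsigned swap(unsigned x) { return __builtin_bswap32(x); }
  //   // ARMv6+: a single "rev r0, r0"; pre-v6: composed from shifts and ORs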
772 if (!Subtarget->hasV6Ops()) 773 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 774 775 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide() 776 : Subtarget->hasDivideInARMMode(); 777 if (!hasDivide) { 778 // These are expanded into libcalls if the cpu doesn't have HW divider. 779 setOperationAction(ISD::SDIV, MVT::i32, LibCall); 780 setOperationAction(ISD::UDIV, MVT::i32, LibCall); 781 } 782 783 if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) { 784 setOperationAction(ISD::SDIV, MVT::i32, Custom); 785 setOperationAction(ISD::UDIV, MVT::i32, Custom); 786 787 setOperationAction(ISD::SDIV, MVT::i64, Custom); 788 setOperationAction(ISD::UDIV, MVT::i64, Custom); 789 } 790 791 setOperationAction(ISD::SREM, MVT::i32, Expand); 792 setOperationAction(ISD::UREM, MVT::i32, Expand); 793 // Register based DivRem for AEABI (RTABI 4.2) 794 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 795 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI()) { 796 setOperationAction(ISD::SREM, MVT::i64, Custom); 797 setOperationAction(ISD::UREM, MVT::i64, Custom); 798 799 setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); 800 setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); 801 setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); 802 setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod"); 803 setLibcallName(RTLIB::UDIVREM_I8, "__aeabi_uidivmod"); 804 setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod"); 805 setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod"); 806 setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod"); 807 808 setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS); 809 setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS); 810 setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS); 811 setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS); 812 setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS); 813 setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS); 814 setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS); 815 setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS); 816 817 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 818 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 819 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 820 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 821 } else { 822 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 823 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 824 } 825 826 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 827 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 828 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 829 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 830 831 setOperationAction(ISD::TRAP, MVT::Other, Legal); 832 833 // Use the default implementation. 834 setOperationAction(ISD::VASTART, MVT::Other, Custom); 835 setOperationAction(ISD::VAARG, MVT::Other, Expand); 836 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 837 setOperationAction(ISD::VAEND, MVT::Other, Expand); 838 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 839 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 840 841 if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) 842 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 843 else 844 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 845 846 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 847 // the default expansion. 
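  // That default expansion lowers atomic operations to the __sync_* helper
  // calls, which the platform runtime must then provide. Illustrative:
  //
  //   void bump(volatile int *p) { __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST); }
  //   // without ldrex/strex this becomes a call to __sync_fetch_and_add_4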
848 InsertFencesForAtomic = false; 849 if (Subtarget->hasAnyDataBarrier() && 850 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 851 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 852 // to ldrex/strex loops already. 853 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 854 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 855 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 856 857 // On v8, we have particularly efficient implementations of atomic fences 858 // if they can be combined with nearby atomic loads and stores. 859 if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) { 860 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 861 InsertFencesForAtomic = true; 862 } 863 } else { 864 // If there's anything we can use as a barrier, go through custom lowering 865 // for ATOMIC_FENCE. 866 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 867 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 868 869 // Set them all for expansion, which will force libcalls. 870 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 871 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 872 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 873 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 874 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 875 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 876 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 877 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 878 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 879 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 880 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 881 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 882 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 883 // Unordered/Monotonic case. 884 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 885 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 886 } 887 888 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 889 890 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 891 if (!Subtarget->hasV6Ops()) { 892 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 893 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 894 } 895 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 896 897 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && 898 !Subtarget->isThumb1Only()) { 899 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 900 // iff target supports vfp2. 901 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 902 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 903 } 904 905 // We want to custom lower some of our intrinsics. 
906 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 907 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 908 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 909 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 910 if (Subtarget->useSjLjEH()) 911 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 912 913 setOperationAction(ISD::SETCC, MVT::i32, Expand); 914 setOperationAction(ISD::SETCC, MVT::f32, Expand); 915 setOperationAction(ISD::SETCC, MVT::f64, Expand); 916 setOperationAction(ISD::SELECT, MVT::i32, Custom); 917 setOperationAction(ISD::SELECT, MVT::f32, Custom); 918 setOperationAction(ISD::SELECT, MVT::f64, Custom); 919 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 920 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 921 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 922 923 // Thumb-1 cannot currently select ARMISD::SUBE. 924 if (!Subtarget->isThumb1Only()) 925 setOperationAction(ISD::SETCCE, MVT::i32, Custom); 926 927 setOperationAction(ISD::BRCOND, MVT::Other, Expand); 928 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 929 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 930 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 931 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 932 933 // We don't support sin/cos/fmod/copysign/pow 934 setOperationAction(ISD::FSIN, MVT::f64, Expand); 935 setOperationAction(ISD::FSIN, MVT::f32, Expand); 936 setOperationAction(ISD::FCOS, MVT::f32, Expand); 937 setOperationAction(ISD::FCOS, MVT::f64, Expand); 938 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 939 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 940 setOperationAction(ISD::FREM, MVT::f64, Expand); 941 setOperationAction(ISD::FREM, MVT::f32, Expand); 942 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && 943 !Subtarget->isThumb1Only()) { 944 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 945 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 946 } 947 setOperationAction(ISD::FPOW, MVT::f64, Expand); 948 setOperationAction(ISD::FPOW, MVT::f32, Expand); 949 950 if (!Subtarget->hasVFP4()) { 951 setOperationAction(ISD::FMA, MVT::f64, Expand); 952 setOperationAction(ISD::FMA, MVT::f32, Expand); 953 } 954 955 // Various VFP goodness 956 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 957 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 958 if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) { 959 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 960 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 961 } 962 963 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 964 if (!Subtarget->hasFP16()) { 965 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 966 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 967 } 968 } 969 970 // Combine sin / cos into one node or libcall if possible. 971 if (Subtarget->hasSinCos()) { 972 setLibcallName(RTLIB::SINCOS_F32, "sincosf"); 973 setLibcallName(RTLIB::SINCOS_F64, "sincos"); 974 if (Subtarget->isTargetWatchABI()) { 975 setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP); 976 setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP); 977 } 978 if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) { 979 // For iOS, we don't want to the normal expansion of a libcall to 980 // sincos. We want to issue a libcall to __sincos_stret. 
981 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 982 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 983 } 984 } 985 986 // FP-ARMv8 implements a lot of rounding-like FP operations. 987 if (Subtarget->hasFPARMv8()) { 988 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 989 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 990 setOperationAction(ISD::FROUND, MVT::f32, Legal); 991 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 992 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 993 setOperationAction(ISD::FRINT, MVT::f32, Legal); 994 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 995 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 996 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 997 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 998 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 999 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1000 1001 if (!Subtarget->isFPOnlySP()) { 1002 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1003 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1004 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1005 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1006 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1007 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1008 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1009 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1010 } 1011 } 1012 1013 if (Subtarget->hasNEON()) { 1014 // vmin and vmax aren't available in a scalar form, so we use 1015 // a NEON instruction with an undef lane instead. 1016 setOperationAction(ISD::FMINNAN, MVT::f32, Legal); 1017 setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); 1018 setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); 1019 setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); 1020 setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); 1021 setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); 1022 } 1023 1024 // We have target-specific dag combine patterns for the following nodes: 1025 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1026 setTargetDAGCombine(ISD::ADD); 1027 setTargetDAGCombine(ISD::SUB); 1028 setTargetDAGCombine(ISD::MUL); 1029 setTargetDAGCombine(ISD::AND); 1030 setTargetDAGCombine(ISD::OR); 1031 setTargetDAGCombine(ISD::XOR); 1032 1033 if (Subtarget->hasV6Ops()) 1034 setTargetDAGCombine(ISD::SRL); 1035 1036 setStackPointerRegisterToSaveRestore(ARM::SP); 1037 1038 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1039 !Subtarget->hasVFP2()) 1040 setSchedulingPreference(Sched::RegPressure); 1041 else 1042 setSchedulingPreference(Sched::Hybrid); 1043 1044 //// temporary - rewrite interface to use type 1045 MaxStoresPerMemset = 8; 1046 MaxStoresPerMemsetOptSize = 4; 1047 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1048 MaxStoresPerMemcpyOptSize = 2; 1049 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1050 MaxStoresPerMemmoveOptSize = 2; 1051 1052 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1053 // are at least 4 bytes aligned. 1054 setMinStackArgumentAlignment(4); 1055 1056 // Prefer likely predicted branches to selects on out-of-order cores. 1057 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1058 1059 setMinFunctionAlignment(Subtarget->isThumb() ? 
1 : 2);
}

bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}

// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross-class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
1094 if (Subtarget->useNEONForSinglePrecisionFP()) 1095 Cost = 2; 1096 break; 1097 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1098 case MVT::v4f32: case MVT::v2f64: 1099 RRC = &ARM::DPRRegClass; 1100 Cost = 2; 1101 break; 1102 case MVT::v4i64: 1103 RRC = &ARM::DPRRegClass; 1104 Cost = 4; 1105 break; 1106 case MVT::v8i64: 1107 RRC = &ARM::DPRRegClass; 1108 Cost = 8; 1109 break; 1110 } 1111 return std::make_pair(RRC, Cost); 1112 } 1113 1114 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { 1115 switch ((ARMISD::NodeType)Opcode) { 1116 case ARMISD::FIRST_NUMBER: break; 1117 case ARMISD::Wrapper: return "ARMISD::Wrapper"; 1118 case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; 1119 case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; 1120 case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; 1121 case ARMISD::CALL: return "ARMISD::CALL"; 1122 case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; 1123 case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; 1124 case ARMISD::BRCOND: return "ARMISD::BRCOND"; 1125 case ARMISD::BR_JT: return "ARMISD::BR_JT"; 1126 case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; 1127 case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; 1128 case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; 1129 case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; 1130 case ARMISD::CMP: return "ARMISD::CMP"; 1131 case ARMISD::CMN: return "ARMISD::CMN"; 1132 case ARMISD::CMPZ: return "ARMISD::CMPZ"; 1133 case ARMISD::CMPFP: return "ARMISD::CMPFP"; 1134 case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; 1135 case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; 1136 case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; 1137 1138 case ARMISD::CMOV: return "ARMISD::CMOV"; 1139 1140 case ARMISD::SSAT: return "ARMISD::SSAT"; 1141 1142 case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; 1143 case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; 1144 case ARMISD::RRX: return "ARMISD::RRX"; 1145 1146 case ARMISD::ADDC: return "ARMISD::ADDC"; 1147 case ARMISD::ADDE: return "ARMISD::ADDE"; 1148 case ARMISD::SUBC: return "ARMISD::SUBC"; 1149 case ARMISD::SUBE: return "ARMISD::SUBE"; 1150 1151 case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; 1152 case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; 1153 1154 case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; 1155 case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; 1156 case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; 1157 1158 case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; 1159 1160 case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; 1161 1162 case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; 1163 1164 case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; 1165 1166 case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; 1167 1168 case ARMISD::WIN__CHKSTK: return "ARMISD:::WIN__CHKSTK"; 1169 case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; 1170 1171 case ARMISD::VCEQ: return "ARMISD::VCEQ"; 1172 case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; 1173 case ARMISD::VCGE: return "ARMISD::VCGE"; 1174 case ARMISD::VCGEZ: return "ARMISD::VCGEZ"; 1175 case ARMISD::VCLEZ: return "ARMISD::VCLEZ"; 1176 case ARMISD::VCGEU: return "ARMISD::VCGEU"; 1177 case ARMISD::VCGT: return "ARMISD::VCGT"; 1178 case ARMISD::VCGTZ: return "ARMISD::VCGTZ"; 1179 case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; 1180 case ARMISD::VCGTU: return "ARMISD::VCGTU"; 1181 case ARMISD::VTST: return "ARMISD::VTST"; 1182 1183 case ARMISD::VSHL: return "ARMISD::VSHL"; 1184 case ARMISD::VSHRs: 
return "ARMISD::VSHRs"; 1185 case ARMISD::VSHRu: return "ARMISD::VSHRu"; 1186 case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; 1187 case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; 1188 case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; 1189 case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; 1190 case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; 1191 case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; 1192 case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; 1193 case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; 1194 case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; 1195 case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; 1196 case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; 1197 case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; 1198 case ARMISD::VSLI: return "ARMISD::VSLI"; 1199 case ARMISD::VSRI: return "ARMISD::VSRI"; 1200 case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; 1201 case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; 1202 case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; 1203 case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; 1204 case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; 1205 case ARMISD::VDUP: return "ARMISD::VDUP"; 1206 case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; 1207 case ARMISD::VEXT: return "ARMISD::VEXT"; 1208 case ARMISD::VREV64: return "ARMISD::VREV64"; 1209 case ARMISD::VREV32: return "ARMISD::VREV32"; 1210 case ARMISD::VREV16: return "ARMISD::VREV16"; 1211 case ARMISD::VZIP: return "ARMISD::VZIP"; 1212 case ARMISD::VUZP: return "ARMISD::VUZP"; 1213 case ARMISD::VTRN: return "ARMISD::VTRN"; 1214 case ARMISD::VTBL1: return "ARMISD::VTBL1"; 1215 case ARMISD::VTBL2: return "ARMISD::VTBL2"; 1216 case ARMISD::VMULLs: return "ARMISD::VMULLs"; 1217 case ARMISD::VMULLu: return "ARMISD::VMULLu"; 1218 case ARMISD::UMAAL: return "ARMISD::UMAAL"; 1219 case ARMISD::UMLAL: return "ARMISD::UMLAL"; 1220 case ARMISD::SMLAL: return "ARMISD::SMLAL"; 1221 case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; 1222 case ARMISD::BFI: return "ARMISD::BFI"; 1223 case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; 1224 case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; 1225 case ARMISD::VBSL: return "ARMISD::VBSL"; 1226 case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; 1227 case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; 1228 case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; 1229 case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; 1230 case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; 1231 case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; 1232 case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; 1233 case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; 1234 case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; 1235 case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; 1236 case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; 1237 case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; 1238 case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; 1239 case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; 1240 case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; 1241 case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; 1242 case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; 1243 case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; 1244 case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; 1245 case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; 1246 case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; 1247 } 1248 return nullptr; 1249 } 1250 1251 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1252 EVT VT) const { 1253 if (!VT.isVector()) 1254 return getPointerTy(DL); 1255 return 
      VT.changeVectorElementTypeToInteger();
}

/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive D registers.
  if (Subtarget->hasNEON()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}

// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               unsigned &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  return true;
}

// Create a fast isel object.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return ARM::createFastISel(funcInfo, libInfo);
}

Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  unsigned NumVals = N->getNumValues();
  if (!NumVals)
    return Sched::RegPressure;

  for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
    if (VT == MVT::Glue || VT == MVT::Other)
      continue;
    if (VT.isFloatingPoint() || VT.isVector())
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

  if (MCID.getNumDefs() == 0)
    return Sched::RegPressure;
  if (!Itins->isEmpty() &&
      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
    return Sched::ILP;

  return Sched::RegPressure;
}

//===----------------------------------------------------------------------===//
//                                 Lowering Code
//===----------------------------------------------------------------------===//

/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}

/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
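/// Several FP conditions have no single ARM condition-code equivalent, so a
/// second condition is returned in CondCode2 and must be OR'd with the first.
/// For example, SETONE (ordered and not equal) is checked as "MI or GT" after
/// the compare; CondCode2 == AL means only one check is needed.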
1346 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1347 ARMCC::CondCodes &CondCode2) { 1348 CondCode2 = ARMCC::AL; 1349 switch (CC) { 1350 default: llvm_unreachable("Unknown FP condition!"); 1351 case ISD::SETEQ: 1352 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 1353 case ISD::SETGT: 1354 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1355 case ISD::SETGE: 1356 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1357 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1358 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1359 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 1360 case ISD::SETO: CondCode = ARMCC::VC; break; 1361 case ISD::SETUO: CondCode = ARMCC::VS; break; 1362 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 1363 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1364 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1365 case ISD::SETLT: 1366 case ISD::SETULT: CondCode = ARMCC::LT; break; 1367 case ISD::SETLE: 1368 case ISD::SETULE: CondCode = ARMCC::LE; break; 1369 case ISD::SETNE: 1370 case ISD::SETUNE: CondCode = ARMCC::NE; break; 1371 } 1372 } 1373 1374 //===----------------------------------------------------------------------===// 1375 // Calling Convention Implementation 1376 //===----------------------------------------------------------------------===// 1377 1378 #include "ARMGenCallingConv.inc" 1379 1380 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1381 /// account presence of floating point hardware and calling convention 1382 /// limitations, such as support for variadic functions. 1383 CallingConv::ID 1384 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1385 bool isVarArg) const { 1386 switch (CC) { 1387 default: 1388 llvm_unreachable("Unsupported calling convention"); 1389 case CallingConv::ARM_AAPCS: 1390 case CallingConv::ARM_APCS: 1391 case CallingConv::GHC: 1392 return CC; 1393 case CallingConv::PreserveMost: 1394 return CallingConv::PreserveMost; 1395 case CallingConv::ARM_AAPCS_VFP: 1396 case CallingConv::Swift: 1397 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1398 case CallingConv::C: 1399 if (!Subtarget->isAAPCS_ABI()) 1400 return CallingConv::ARM_APCS; 1401 else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && 1402 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1403 !isVarArg) 1404 return CallingConv::ARM_AAPCS_VFP; 1405 else 1406 return CallingConv::ARM_AAPCS; 1407 case CallingConv::Fast: 1408 case CallingConv::CXX_FAST_TLS: 1409 if (!Subtarget->isAAPCS_ABI()) { 1410 if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1411 return CallingConv::Fast; 1412 return CallingConv::ARM_APCS; 1413 } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1414 return CallingConv::ARM_AAPCS_VFP; 1415 else 1416 return CallingConv::ARM_AAPCS; 1417 } 1418 } 1419 1420 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1421 /// CallingConvention. 1422 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 1423 bool Return, 1424 bool isVarArg) const { 1425 switch (getEffectiveCallingConv(CC, isVarArg)) { 1426 default: 1427 llvm_unreachable("Unsupported calling convention"); 1428 case CallingConv::ARM_APCS: 1429 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 1430 case CallingConv::ARM_AAPCS: 1431 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1432 case CallingConv::ARM_AAPCS_VFP: 1433 return (Return ? 
RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 1434 case CallingConv::Fast: 1435 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 1436 case CallingConv::GHC: 1437 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 1438 case CallingConv::PreserveMost: 1439 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1440 } 1441 } 1442 1443 /// LowerCallResult - Lower the result values of a call into the 1444 /// appropriate copies out of appropriate physical registers. 1445 SDValue ARMTargetLowering::LowerCallResult( 1446 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 1447 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1448 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 1449 SDValue ThisVal) const { 1450 1451 // Assign locations to each value returned by this call. 1452 SmallVector<CCValAssign, 16> RVLocs; 1453 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1454 *DAG.getContext(), Call); 1455 CCInfo.AnalyzeCallResult(Ins, 1456 CCAssignFnForNode(CallConv, /* Return*/ true, 1457 isVarArg)); 1458 1459 // Copy all of the result registers out of their specified physreg. 1460 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1461 CCValAssign VA = RVLocs[i]; 1462 1463 // Pass 'this' value directly from the argument to return value, to avoid 1464 // reg unit interference 1465 if (i == 0 && isThisReturn) { 1466 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 1467 "unexpected return calling convention register assignment"); 1468 InVals.push_back(ThisVal); 1469 continue; 1470 } 1471 1472 SDValue Val; 1473 if (VA.needsCustom()) { 1474 // Handle f64 or half of a v2f64. 1475 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1476 InFlag); 1477 Chain = Lo.getValue(1); 1478 InFlag = Lo.getValue(2); 1479 VA = RVLocs[++i]; // skip ahead to next loc 1480 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1481 InFlag); 1482 Chain = Hi.getValue(1); 1483 InFlag = Hi.getValue(2); 1484 if (!Subtarget->isLittle()) 1485 std::swap (Lo, Hi); 1486 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1487 1488 if (VA.getLocVT() == MVT::v2f64) { 1489 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 1490 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1491 DAG.getConstant(0, dl, MVT::i32)); 1492 1493 VA = RVLocs[++i]; // skip ahead to next loc 1494 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1495 Chain = Lo.getValue(1); 1496 InFlag = Lo.getValue(2); 1497 VA = RVLocs[++i]; // skip ahead to next loc 1498 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1499 Chain = Hi.getValue(1); 1500 InFlag = Hi.getValue(2); 1501 if (!Subtarget->isLittle()) 1502 std::swap (Lo, Hi); 1503 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1504 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1505 DAG.getConstant(1, dl, MVT::i32)); 1506 } 1507 } else { 1508 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 1509 InFlag); 1510 Chain = Val.getValue(1); 1511 InFlag = Val.getValue(2); 1512 } 1513 1514 switch (VA.getLocInfo()) { 1515 default: llvm_unreachable("Unknown loc info!"); 1516 case CCValAssign::Full: break; 1517 case CCValAssign::BCvt: 1518 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 1519 break; 1520 } 1521 1522 InVals.push_back(Val); 1523 } 1524 1525 return Chain; 1526 } 1527 1528 /// LowerMemOpCallTo - Store the argument to the stack. 
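/// The store address is the stack pointer plus the argument's assigned
/// LocMemOffset, so StackPtr must already hold a copy of SP.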
1529 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 1530 SDValue Arg, const SDLoc &dl, 1531 SelectionDAG &DAG, 1532 const CCValAssign &VA, 1533 ISD::ArgFlagsTy Flags) const { 1534 unsigned LocMemOffset = VA.getLocMemOffset(); 1535 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1536 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 1537 StackPtr, PtrOff); 1538 return DAG.getStore( 1539 Chain, dl, Arg, PtrOff, 1540 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), 1541 false, false, 0); 1542 } 1543 1544 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 1545 SDValue Chain, SDValue &Arg, 1546 RegsToPassVector &RegsToPass, 1547 CCValAssign &VA, CCValAssign &NextVA, 1548 SDValue &StackPtr, 1549 SmallVectorImpl<SDValue> &MemOpChains, 1550 ISD::ArgFlagsTy Flags) const { 1551 1552 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 1553 DAG.getVTList(MVT::i32, MVT::i32), Arg); 1554 unsigned id = Subtarget->isLittle() ? 0 : 1; 1555 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 1556 1557 if (NextVA.isRegLoc()) 1558 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 1559 else { 1560 assert(NextVA.isMemLoc()); 1561 if (!StackPtr.getNode()) 1562 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 1563 getPointerTy(DAG.getDataLayout())); 1564 1565 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 1566 dl, DAG, NextVA, 1567 Flags)); 1568 } 1569 } 1570 1571 /// LowerCall - Lowering a call into a callseq_start <- 1572 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 1573 /// nodes. 1574 SDValue 1575 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1576 SmallVectorImpl<SDValue> &InVals) const { 1577 SelectionDAG &DAG = CLI.DAG; 1578 SDLoc &dl = CLI.DL; 1579 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1580 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1581 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1582 SDValue Chain = CLI.Chain; 1583 SDValue Callee = CLI.Callee; 1584 bool &isTailCall = CLI.IsTailCall; 1585 CallingConv::ID CallConv = CLI.CallConv; 1586 bool doesNotRet = CLI.DoesNotReturn; 1587 bool isVarArg = CLI.IsVarArg; 1588 1589 MachineFunction &MF = DAG.getMachineFunction(); 1590 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 1591 bool isThisReturn = false; 1592 bool isSibCall = false; 1593 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); 1594 1595 // Disable tail calls if they're not supported. 1596 if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") 1597 isTailCall = false; 1598 1599 if (isTailCall) { 1600 // Check if it's really possible to do a tail call. 1601 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1602 isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(), 1603 Outs, OutVals, Ins, DAG); 1604 if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) 1605 report_fatal_error("failed to perform tail call elimination on a call " 1606 "site marked musttail"); 1607 // We don't support GuaranteedTailCallOpt for ARM, only automatically 1608 // detected sibcalls. 1609 if (isTailCall) { 1610 ++NumTailCalls; 1611 isSibCall = true; 1612 } 1613 } 1614 1615 // Analyze operands of the call, assigning locations to each operand. 
1616 SmallVector<CCValAssign, 16> ArgLocs; 1617 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1618 *DAG.getContext(), Call); 1619 CCInfo.AnalyzeCallOperands(Outs, 1620 CCAssignFnForNode(CallConv, /* Return*/ false, 1621 isVarArg)); 1622 1623 // Get a count of how many bytes are to be pushed on the stack. 1624 unsigned NumBytes = CCInfo.getNextStackOffset(); 1625 1626 // For tail calls, memory operands are available in our caller's stack. 1627 if (isSibCall) 1628 NumBytes = 0; 1629 1630 // Adjust the stack pointer for the new arguments... 1631 // These operations are automatically eliminated by the prolog/epilog pass 1632 if (!isSibCall) 1633 Chain = DAG.getCALLSEQ_START(Chain, 1634 DAG.getIntPtrConstant(NumBytes, dl, true), dl); 1635 1636 SDValue StackPtr = 1637 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 1638 1639 RegsToPassVector RegsToPass; 1640 SmallVector<SDValue, 8> MemOpChains; 1641 1642 // Walk the register/memloc assignments, inserting copies/loads. In the case 1643 // of tail call optimization, arguments are handled later. 1644 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 1645 i != e; 1646 ++i, ++realArgIdx) { 1647 CCValAssign &VA = ArgLocs[i]; 1648 SDValue Arg = OutVals[realArgIdx]; 1649 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 1650 bool isByVal = Flags.isByVal(); 1651 1652 // Promote the value if needed. 1653 switch (VA.getLocInfo()) { 1654 default: llvm_unreachable("Unknown loc info!"); 1655 case CCValAssign::Full: break; 1656 case CCValAssign::SExt: 1657 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 1658 break; 1659 case CCValAssign::ZExt: 1660 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 1661 break; 1662 case CCValAssign::AExt: 1663 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1664 break; 1665 case CCValAssign::BCvt: 1666 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1667 break; 1668 } 1669 1670 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 1671 if (VA.needsCustom()) { 1672 if (VA.getLocVT() == MVT::v2f64) { 1673 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1674 DAG.getConstant(0, dl, MVT::i32)); 1675 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1676 DAG.getConstant(1, dl, MVT::i32)); 1677 1678 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 1679 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1680 1681 VA = ArgLocs[++i]; // skip ahead to next loc 1682 if (VA.isRegLoc()) { 1683 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 1684 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1685 } else { 1686 assert(VA.isMemLoc()); 1687 1688 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 1689 dl, DAG, VA, Flags)); 1690 } 1691 } else { 1692 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 1693 StackPtr, MemOpChains, Flags); 1694 } 1695 } else if (VA.isRegLoc()) { 1696 if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) { 1697 assert(VA.getLocVT() == MVT::i32 && 1698 "unexpected calling convention register assignment"); 1699 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 1700 "unexpected use of 'returned'"); 1701 isThisReturn = true; 1702 } 1703 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1704 } else if (isByVal) { 1705 assert(VA.isMemLoc()); 1706 unsigned offset = 0; 1707 1708 // True if this byval aggregate will be split between registers 1709 // and memory. 
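      // For example (purely illustrative): a 20-byte byval whose first free
      // register is R2 gets its first 8 bytes loaded into R2-R3 here, and the
      // remaining 12 bytes are copied onto the stack below via
      // ARMISD::COPY_STRUCT_BYVAL.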
      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();

      if (CurByValIdx < ByValArgsCount) {

        unsigned RegBegin, RegEnd;
        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);

        EVT PtrVT =
            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
        unsigned int i, j;
        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                     MachinePointerInfo(),
                                     false, false, false,
                                     DAG.InferPtrAlignment(AddArg));
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(j, Load));
        }

        // If the parameter size exceeds the register area, the "offset" value
        // helps us compute the stack slot for the remaining part properly.
        offset = RegEnd - RegBegin;

        CCInfo.nextInRegsParam();
      }

      if (Flags.getByValSize() > 4*offset) {
        auto PtrVT = getPointerTy(DAG.getDataLayout());
        unsigned LocMemOffset = VA.getLocMemOffset();
        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
        SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
        SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
                                           MVT::i32);
        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
                                            MVT::i32);

        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
                                          Ops));
      }
    } else if (!isSibCall) {
      assert(VA.isMemLoc());

      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.

    // Do not flag preceding copytoreg stuff together with the following stuff.
1789 InFlag = SDValue(); 1790 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1791 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1792 RegsToPass[i].second, InFlag); 1793 InFlag = Chain.getValue(1); 1794 } 1795 InFlag = SDValue(); 1796 } 1797 1798 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 1799 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 1800 // node so that legalize doesn't hack it. 1801 bool isDirect = false; 1802 1803 const TargetMachine &TM = getTargetMachine(); 1804 const Module *Mod = MF.getFunction()->getParent(); 1805 const GlobalValue *GV = nullptr; 1806 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 1807 GV = G->getGlobal(); 1808 bool isStub = 1809 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 1810 1811 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 1812 bool isLocalARMFunc = false; 1813 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 1814 auto PtrVt = getPointerTy(DAG.getDataLayout()); 1815 1816 if (Subtarget->genLongCalls()) { 1817 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 1818 "long-calls codegen is not position independent!"); 1819 // Handle a global address or an external symbol. If it's not one of 1820 // those, the target's already in a register, so we don't need to do 1821 // anything extra. 1822 if (isa<GlobalAddressSDNode>(Callee)) { 1823 // Create a constant pool entry for the callee address 1824 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1825 ARMConstantPoolValue *CPV = 1826 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 1827 1828 // Get the address of the callee into a register 1829 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1830 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1831 Callee = DAG.getLoad( 1832 PtrVt, dl, DAG.getEntryNode(), CPAddr, 1833 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 1834 false, false, 0); 1835 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 1836 const char *Sym = S->getSymbol(); 1837 1838 // Create a constant pool entry for the callee address 1839 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1840 ARMConstantPoolValue *CPV = 1841 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 1842 ARMPCLabelIndex, 0); 1843 // Get the address of the callee into a register 1844 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1845 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1846 Callee = DAG.getLoad( 1847 PtrVt, dl, DAG.getEntryNode(), CPAddr, 1848 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 1849 false, false, 0); 1850 } 1851 } else if (isa<GlobalAddressSDNode>(Callee)) { 1852 isDirect = true; 1853 bool isDef = GV->isStrongDefinitionForLinker(); 1854 1855 // ARM call to a local ARM function is predicable. 1856 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 1857 // tBX takes a register source operand. 
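    // (Roughly: pre-v5T Thumb-1 has no BLX, so for MachO stubs the callee
    // address is loaded from its non-lazy pointer and called indirectly.)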
1858 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 1859 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 1860 Callee = DAG.getNode( 1861 ARMISD::WrapperPIC, dl, PtrVt, 1862 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 1863 Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, 1864 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 1865 false, false, true, 0); 1866 } else if (Subtarget->isTargetCOFF()) { 1867 assert(Subtarget->isTargetWindows() && 1868 "Windows is the only supported COFF target"); 1869 unsigned TargetFlags = GV->hasDLLImportStorageClass() 1870 ? ARMII::MO_DLLIMPORT 1871 : ARMII::MO_NO_FLAG; 1872 Callee = 1873 DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); 1874 if (GV->hasDLLImportStorageClass()) 1875 Callee = 1876 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 1877 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 1878 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 1879 false, false, false, 0); 1880 } else { 1881 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 1882 } 1883 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1884 isDirect = true; 1885 // tBX takes a register source operand. 1886 const char *Sym = S->getSymbol(); 1887 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 1888 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1889 ARMConstantPoolValue *CPV = 1890 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 1891 ARMPCLabelIndex, 4); 1892 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1893 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1894 Callee = DAG.getLoad( 1895 PtrVt, dl, DAG.getEntryNode(), CPAddr, 1896 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 1897 false, false, 0); 1898 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 1899 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 1900 } else { 1901 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 1902 } 1903 } 1904 1905 // FIXME: handle tail calls differently. 1906 unsigned CallOpc; 1907 if (Subtarget->isThumb()) { 1908 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 1909 CallOpc = ARMISD::CALL_NOLINK; 1910 else 1911 CallOpc = ARMISD::CALL; 1912 } else { 1913 if (!isDirect && !Subtarget->hasV5TOps()) 1914 CallOpc = ARMISD::CALL_NOLINK; 1915 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 1916 // Emit regular call when code size is the priority 1917 !MF.getFunction()->optForMinSize()) 1918 // "mov lr, pc; b _foo" to avoid confusing the RSP 1919 CallOpc = ARMISD::CALL_NOLINK; 1920 else 1921 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 1922 } 1923 1924 std::vector<SDValue> Ops; 1925 Ops.push_back(Chain); 1926 Ops.push_back(Callee); 1927 1928 // Add argument registers to the end of the list so that they are known live 1929 // into the call. 1930 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 1931 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 1932 RegsToPass[i].second.getValueType())); 1933 1934 // Add a register mask operand representing the call-preserved registers. 
  if (!isTailCall) {
    const uint32_t *Mask;
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    if (isThisReturn) {
      // For 'this' returns, use the R0-preserving mask if applicable
      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
      if (!Mask) {
        // Set isThisReturn to false if the calling convention is not one that
        // allows 'returned' to be modeled in this way, so LowerCallResult does
        // not try to pass 'this' straight through
        isThisReturn = false;
        Mask = ARI->getCallPreservedMask(MF, CallConv);
      }
    } else
      Mask = ARI->getCallPreservedMask(MF, CallConv);

    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
  }

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  if (isTailCall) {
    MF.getFrameInfo()->setHasTailCall();
    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, isThisReturn,
                         isThisReturn ? OutVals[0] : SDValue());
}

/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack. Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  assert((State->getCallOrPrologue() == Prologue ||
          State->getCallOrPrologue() == Call) &&
         "unhandled ParmContext");

  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Align = std::max(Align, 4U);

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  unsigned AlignInRegs = Align / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and the parameter size is greater than the
  // size of all remaining GPR regs. In that case we can't split the parameter,
  // we must send it to the stack. We also must set NCRN to R4, so we waste all
  // remaining registers.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // The first register for the byval parameter is the first register that
  // wasn't allocated before this method call, so it would be "reg".
  // If the parameter is small enough to be saved in the range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs;
  // otherwise the parameter is split between registers and the stack, and the
  // end register would be r4 in this case.
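  // For example (illustrative only): an 8-byte byval with 8-byte alignment
  // when R1 is the next free register wastes R1, occupies R2-R3, and leaves
  // ByValRegEnd == R4 with nothing left for memory (Size becomes 0 below).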
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note, the first register was already allocated at the beginning of this
  // function; allocate the remaining registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI->isFixedObjectIndex(FI))
    return false;
  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                     CallingConv::ID CalleeCC,
                                     bool isVarArg,
                                     bool isCalleeStructRet,
                                     bool isCallerStructRet,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                     SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Do not sibcall optimize vararg calls unless the call site is not passing
  // any arguments.
  if (isVarArg && !Outs.empty())
    return false;

  // Exception-handling functions need a special set of instructions to
  // indicate a return to the hardware. Tail-calling another function would
  // probably break this.
2112 if (CallerF->hasFnAttribute("interrupt")) 2113 return false; 2114 2115 // Also avoid sibcall optimization if either caller or callee uses struct 2116 // return semantics. 2117 if (isCalleeStructRet || isCallerStructRet) 2118 return false; 2119 2120 // Externally-defined functions with weak linkage should not be 2121 // tail-called on ARM when the OS does not support dynamic 2122 // pre-emption of symbols, as the AAELF spec requires normal calls 2123 // to undefined weak functions to be replaced with a NOP or jump to the 2124 // next instruction. The behaviour of branch instructions in this 2125 // situation (as used for tail calls) is implementation-defined, so we 2126 // cannot rely on the linker replacing the tail call with a return. 2127 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2128 const GlobalValue *GV = G->getGlobal(); 2129 const Triple &TT = getTargetMachine().getTargetTriple(); 2130 if (GV->hasExternalWeakLinkage() && 2131 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2132 return false; 2133 } 2134 2135 // Check that the call results are passed in the same way. 2136 LLVMContext &C = *DAG.getContext(); 2137 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2138 CCAssignFnForNode(CalleeCC, true, isVarArg), 2139 CCAssignFnForNode(CallerCC, true, isVarArg))) 2140 return false; 2141 // The callee has to preserve all registers the caller needs to preserve. 2142 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2143 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2144 if (CalleeCC != CallerCC) { 2145 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2146 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2147 return false; 2148 } 2149 2150 // If Caller's vararg or byval argument has been split between registers and 2151 // stack, do not perform tail call, since part of the argument is in caller's 2152 // local frame. 2153 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2154 if (AFI_Caller->getArgRegsSaveSize()) 2155 return false; 2156 2157 // If the callee takes no arguments then go on to check the results of the 2158 // call. 2159 if (!Outs.empty()) { 2160 // Check if stack adjustment is needed. For now, do not do this if any 2161 // argument is passed on the stack. 2162 SmallVector<CCValAssign, 16> ArgLocs; 2163 ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call); 2164 CCInfo.AnalyzeCallOperands(Outs, 2165 CCAssignFnForNode(CalleeCC, false, isVarArg)); 2166 if (CCInfo.getNextStackOffset()) { 2167 // Check if the arguments are already laid out in the right way as 2168 // the caller's fixed stack objects. 2169 MachineFrameInfo *MFI = MF.getFrameInfo(); 2170 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2171 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2172 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2173 i != e; 2174 ++i, ++realArgIdx) { 2175 CCValAssign &VA = ArgLocs[i]; 2176 EVT RegVT = VA.getLocVT(); 2177 SDValue Arg = OutVals[realArgIdx]; 2178 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2179 if (VA.getLocInfo() == CCValAssign::Indirect) 2180 return false; 2181 if (VA.needsCustom()) { 2182 // f64 and vector types are split into multiple registers or 2183 // register/stack-slot combinations. The types will not match 2184 // the registers; give up on memory f64 refs until we figure 2185 // out what to do about this. 
2186 if (!VA.isRegLoc()) 2187 return false; 2188 if (!ArgLocs[++i].isRegLoc()) 2189 return false; 2190 if (RegVT == MVT::v2f64) { 2191 if (!ArgLocs[++i].isRegLoc()) 2192 return false; 2193 if (!ArgLocs[++i].isRegLoc()) 2194 return false; 2195 } 2196 } else if (!VA.isRegLoc()) { 2197 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2198 MFI, MRI, TII)) 2199 return false; 2200 } 2201 } 2202 } 2203 2204 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2205 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2206 return false; 2207 } 2208 2209 return true; 2210 } 2211 2212 bool 2213 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2214 MachineFunction &MF, bool isVarArg, 2215 const SmallVectorImpl<ISD::OutputArg> &Outs, 2216 LLVMContext &Context) const { 2217 SmallVector<CCValAssign, 16> RVLocs; 2218 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2219 return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true, 2220 isVarArg)); 2221 } 2222 2223 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2224 const SDLoc &DL, SelectionDAG &DAG) { 2225 const MachineFunction &MF = DAG.getMachineFunction(); 2226 const Function *F = MF.getFunction(); 2227 2228 StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString(); 2229 2230 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2231 // version of the "preferred return address". These offsets affect the return 2232 // instruction if this is a return from PL1 without hypervisor extensions. 2233 // IRQ/FIQ: +4 "subs pc, lr, #4" 2234 // SWI: 0 "subs pc, lr, #0" 2235 // ABORT: +4 "subs pc, lr, #4" 2236 // UNDEF: +4/+2 "subs pc, lr, #0" 2237 // UNDEF varies depending on where the exception came from ARM or Thumb 2238 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 2239 2240 int64_t LROffset; 2241 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2242 IntKind == "ABORT") 2243 LROffset = 4; 2244 else if (IntKind == "SWI" || IntKind == "UNDEF") 2245 LROffset = 0; 2246 else 2247 report_fatal_error("Unsupported interrupt attribute. If present, value " 2248 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2249 2250 RetOps.insert(RetOps.begin() + 1, 2251 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2252 2253 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2254 } 2255 2256 SDValue 2257 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2258 bool isVarArg, 2259 const SmallVectorImpl<ISD::OutputArg> &Outs, 2260 const SmallVectorImpl<SDValue> &OutVals, 2261 const SDLoc &dl, SelectionDAG &DAG) const { 2262 2263 // CCValAssign - represent the assignment of the return value to a location. 2264 SmallVector<CCValAssign, 16> RVLocs; 2265 2266 // CCState - Info about the registers and stack slots. 2267 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2268 *DAG.getContext(), Call); 2269 2270 // Analyze outgoing return values. 2271 CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, 2272 isVarArg)); 2273 2274 SDValue Flag; 2275 SmallVector<SDValue, 4> RetOps; 2276 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2277 bool isLittleEndian = Subtarget->isLittle(); 2278 2279 MachineFunction &MF = DAG.getMachineFunction(); 2280 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2281 AFI->setReturnRegsCount(RVLocs.size()); 2282 2283 // Copy the result values into the output registers. 
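  // For example, an f64 value returned in GPRs is split by VMOVRRD into two
  // i32 halves (swapped on big-endian targets); a v2f64 return is handled as
  // two such f64 halves.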
2284 for (unsigned i = 0, realRVLocIdx = 0; 2285 i != RVLocs.size(); 2286 ++i, ++realRVLocIdx) { 2287 CCValAssign &VA = RVLocs[i]; 2288 assert(VA.isRegLoc() && "Can only return in registers!"); 2289 2290 SDValue Arg = OutVals[realRVLocIdx]; 2291 2292 switch (VA.getLocInfo()) { 2293 default: llvm_unreachable("Unknown loc info!"); 2294 case CCValAssign::Full: break; 2295 case CCValAssign::BCvt: 2296 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2297 break; 2298 } 2299 2300 if (VA.needsCustom()) { 2301 if (VA.getLocVT() == MVT::v2f64) { 2302 // Extract the first half and return it in two registers. 2303 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2304 DAG.getConstant(0, dl, MVT::i32)); 2305 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2306 DAG.getVTList(MVT::i32, MVT::i32), Half); 2307 2308 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2309 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2310 Flag); 2311 Flag = Chain.getValue(1); 2312 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2313 VA = RVLocs[++i]; // skip ahead to next loc 2314 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2315 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2316 Flag); 2317 Flag = Chain.getValue(1); 2318 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2319 VA = RVLocs[++i]; // skip ahead to next loc 2320 2321 // Extract the 2nd half and fall through to handle it as an f64 value. 2322 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2323 DAG.getConstant(1, dl, MVT::i32)); 2324 } 2325 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2326 // available. 2327 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2328 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2329 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2330 fmrrd.getValue(isLittleEndian ? 0 : 1), 2331 Flag); 2332 Flag = Chain.getValue(1); 2333 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2334 VA = RVLocs[++i]; // skip ahead to next loc 2335 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2336 fmrrd.getValue(isLittleEndian ? 1 : 0), 2337 Flag); 2338 } else 2339 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2340 2341 // Guarantee that all emitted copies are 2342 // stuck together, avoiding something bad. 2343 Flag = Chain.getValue(1); 2344 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2345 } 2346 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2347 const MCPhysReg *I = 2348 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2349 if (I) { 2350 for (; *I; ++I) { 2351 if (ARM::GPRRegClass.contains(*I)) 2352 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 2353 else if (ARM::DPRRegClass.contains(*I)) 2354 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 2355 else 2356 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2357 } 2358 } 2359 2360 // Update chain and glue. 2361 RetOps[0] = Chain; 2362 if (Flag.getNode()) 2363 RetOps.push_back(Flag); 2364 2365 // CPUs which aren't M-class use a special sequence to return from 2366 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2367 // though we use "subs pc, lr, #N"). 2368 // 2369 // M-class CPUs actually use a normal return sequence with a special 2370 // (hardware-provided) value in LR, so the normal code path works. 
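  // For example, a non-M-class IRQ handler returns with "subs pc, lr, #4",
  // using the LR offset chosen in LowerInterruptReturn above.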
2371 if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") && 2372 !Subtarget->isMClass()) { 2373 if (Subtarget->isThumb1Only()) 2374 report_fatal_error("interrupt attribute is not supported in Thumb1"); 2375 return LowerInterruptReturn(RetOps, dl, DAG); 2376 } 2377 2378 return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); 2379 } 2380 2381 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2382 if (N->getNumValues() != 1) 2383 return false; 2384 if (!N->hasNUsesOfValue(1, 0)) 2385 return false; 2386 2387 SDValue TCChain = Chain; 2388 SDNode *Copy = *N->use_begin(); 2389 if (Copy->getOpcode() == ISD::CopyToReg) { 2390 // If the copy has a glue operand, we conservatively assume it isn't safe to 2391 // perform a tail call. 2392 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2393 return false; 2394 TCChain = Copy->getOperand(0); 2395 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 2396 SDNode *VMov = Copy; 2397 // f64 returned in a pair of GPRs. 2398 SmallPtrSet<SDNode*, 2> Copies; 2399 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2400 UI != UE; ++UI) { 2401 if (UI->getOpcode() != ISD::CopyToReg) 2402 return false; 2403 Copies.insert(*UI); 2404 } 2405 if (Copies.size() > 2) 2406 return false; 2407 2408 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2409 UI != UE; ++UI) { 2410 SDValue UseChain = UI->getOperand(0); 2411 if (Copies.count(UseChain.getNode())) 2412 // Second CopyToReg 2413 Copy = *UI; 2414 else { 2415 // We are at the top of this chain. 2416 // If the copy has a glue operand, we conservatively assume it 2417 // isn't safe to perform a tail call. 2418 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 2419 return false; 2420 // First CopyToReg 2421 TCChain = UseChain; 2422 } 2423 } 2424 } else if (Copy->getOpcode() == ISD::BITCAST) { 2425 // f32 returned in a single GPR. 2426 if (!Copy->hasOneUse()) 2427 return false; 2428 Copy = *Copy->use_begin(); 2429 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 2430 return false; 2431 // If the copy has a glue operand, we conservatively assume it isn't safe to 2432 // perform a tail call. 2433 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2434 return false; 2435 TCChain = Copy->getOperand(0); 2436 } else { 2437 return false; 2438 } 2439 2440 bool HasRet = false; 2441 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2442 UI != UE; ++UI) { 2443 if (UI->getOpcode() != ARMISD::RET_FLAG && 2444 UI->getOpcode() != ARMISD::INTRET_FLAG) 2445 return false; 2446 HasRet = true; 2447 } 2448 2449 if (!HasRet) 2450 return false; 2451 2452 Chain = TCChain; 2453 return true; 2454 } 2455 2456 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 2457 if (!Subtarget->supportsTailCall()) 2458 return false; 2459 2460 auto Attr = 2461 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); 2462 if (!CI->isTailCall() || Attr.getValueAsString() == "true") 2463 return false; 2464 2465 return true; 2466 } 2467 2468 // Trying to write a 64 bit value so need to split into two 32 bit values first, 2469 // and pass the lower and high parts through. 2470 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 2471 SDLoc DL(Op); 2472 SDValue WriteValue = Op->getOperand(2); 2473 2474 // This function is only supposed to be called for i64 type argument. 
2475 assert(WriteValue.getValueType() == MVT::i64 2476 && "LowerWRITE_REGISTER called for non-i64 type argument."); 2477 2478 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2479 DAG.getConstant(0, DL, MVT::i32)); 2480 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2481 DAG.getConstant(1, DL, MVT::i32)); 2482 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 2483 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 2484 } 2485 2486 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 2487 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 2488 // one of the above mentioned nodes. It has to be wrapped because otherwise 2489 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 2490 // be used to form addressing mode. These wrapped nodes will be selected 2491 // into MOVi. 2492 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 2493 EVT PtrVT = Op.getValueType(); 2494 // FIXME there is no actual debug info here 2495 SDLoc dl(Op); 2496 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 2497 SDValue Res; 2498 if (CP->isMachineConstantPoolEntry()) 2499 Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, 2500 CP->getAlignment()); 2501 else 2502 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, 2503 CP->getAlignment()); 2504 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 2505 } 2506 2507 unsigned ARMTargetLowering::getJumpTableEncoding() const { 2508 return MachineJumpTableInfo::EK_Inline; 2509 } 2510 2511 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 2512 SelectionDAG &DAG) const { 2513 MachineFunction &MF = DAG.getMachineFunction(); 2514 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2515 unsigned ARMPCLabelIndex = 0; 2516 SDLoc DL(Op); 2517 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2518 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 2519 SDValue CPAddr; 2520 bool IsPositionIndependent = isPositionIndependent(); 2521 if (!IsPositionIndependent) { 2522 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); 2523 } else { 2524 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 2525 ARMPCLabelIndex = AFI->createPICLabelUId(); 2526 ARMConstantPoolValue *CPV = 2527 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 2528 ARMCP::CPBlockAddress, PCAdj); 2529 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2530 } 2531 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 2532 SDValue Result = 2533 DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, 2534 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 2535 false, false, false, 0); 2536 if (!IsPositionIndependent) 2537 return Result; 2538 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 2539 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 2540 } 2541 2542 /// \brief Convert a TLS address reference into the correct sequence of loads 2543 /// and calls to compute the variable's address for Darwin, and return an 2544 /// SDValue containing the final node. 2545 2546 /// Darwin only has one TLS scheme which must be capable of dealing with the 2547 /// fully general situation, in the worst case. This means: 2548 /// + "extern __thread" declaration. 2549 /// + Defined in a possibly unknown dynamic library. 
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is where
  // the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet =
      DAG.getLoad(MVT::i32, DL, Chain, DescAddr,
                  MachinePointerInfo::getGOT(DAG.getMachineFunction()),
                  false, true, true, 4);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo *MFI = F.getFrameInfo();
  MFI->setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
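  // The CopyToReg below produces glue that the call consumes, keeping the
  // descriptor address live in R0 across the call node.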
2600 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 2601 Chain = 2602 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 2603 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 2604 DAG.getRegisterMask(Mask), Chain.getValue(1)); 2605 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 2606 } 2607 2608 SDValue 2609 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 2610 SelectionDAG &DAG) const { 2611 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 2612 2613 SDValue Chain = DAG.getEntryNode(); 2614 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2615 SDLoc DL(Op); 2616 2617 // Load the current TEB (thread environment block) 2618 SDValue Ops[] = {Chain, 2619 DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), 2620 DAG.getConstant(15, DL, MVT::i32), 2621 DAG.getConstant(0, DL, MVT::i32), 2622 DAG.getConstant(13, DL, MVT::i32), 2623 DAG.getConstant(0, DL, MVT::i32), 2624 DAG.getConstant(2, DL, MVT::i32)}; 2625 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 2626 DAG.getVTList(MVT::i32, MVT::Other), Ops); 2627 2628 SDValue TEB = CurrentTEB.getValue(0); 2629 Chain = CurrentTEB.getValue(1); 2630 2631 // Load the ThreadLocalStoragePointer from the TEB 2632 // A pointer to the TLS array is located at offset 0x2c from the TEB. 2633 SDValue TLSArray = 2634 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 2635 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo(), 2636 false, false, false, 0); 2637 2638 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 2639 // offset into the TLSArray. 2640 2641 // Load the TLS index from the C runtime 2642 SDValue TLSIndex = 2643 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 2644 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 2645 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo(), 2646 false, false, false, 0); 2647 2648 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 2649 DAG.getConstant(2, DL, MVT::i32)); 2650 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 2651 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 2652 MachinePointerInfo(), false, false, false, 0); 2653 2654 // Get the offset of the start of the .tls section (section base) 2655 const auto *GA = cast<GlobalAddressSDNode>(Op); 2656 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 2657 SDValue Offset = 2658 DAG.getLoad(PtrVT, DL, Chain, 2659 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 2660 DAG.getTargetConstantPool(CPV, PtrVT, 4)), 2661 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 2662 false, false, false, 0); 2663 2664 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 2665 } 2666 2667 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 2668 SDValue 2669 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 2670 SelectionDAG &DAG) const { 2671 SDLoc dl(GA); 2672 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2673 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 2674 MachineFunction &MF = DAG.getMachineFunction(); 2675 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2676 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2677 ARMConstantPoolValue *CPV = 2678 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2679 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 2680 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2681 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 2682 Argument = 2683 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, 2684 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 2685 false, false, false, 0); 2686 SDValue Chain = Argument.getValue(1); 2687 2688 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2689 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 2690 2691 // call __tls_get_addr. 2692 ArgListTy Args; 2693 ArgListEntry Entry; 2694 Entry.Node = Argument; 2695 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 2696 Args.push_back(Entry); 2697 2698 // FIXME: is there useful debug info available here? 2699 TargetLowering::CallLoweringInfo CLI(DAG); 2700 CLI.setDebugLoc(dl).setChain(Chain) 2701 .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 2702 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 2703 2704 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2705 return CallResult.first; 2706 } 2707 2708 // Lower ISD::GlobalTLSAddress using the "initial exec" or 2709 // "local exec" model. 2710 SDValue 2711 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 2712 SelectionDAG &DAG, 2713 TLSModel::Model model) const { 2714 const GlobalValue *GV = GA->getGlobal(); 2715 SDLoc dl(GA); 2716 SDValue Offset; 2717 SDValue Chain = DAG.getEntryNode(); 2718 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2719 // Get the Thread Pointer 2720 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 2721 2722 if (model == TLSModel::InitialExec) { 2723 MachineFunction &MF = DAG.getMachineFunction(); 2724 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2725 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2726 // Initial exec model. 2727 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 2728 ARMConstantPoolValue *CPV = 2729 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2730 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 2731 true); 2732 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2733 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 2734 Offset = DAG.getLoad( 2735 PtrVT, dl, Chain, Offset, 2736 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2737 false, false, 0); 2738 Chain = Offset.getValue(1); 2739 2740 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2741 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 2742 2743 Offset = DAG.getLoad( 2744 PtrVT, dl, Chain, Offset, 2745 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2746 false, false, 0); 2747 } else { 2748 // local exec model 2749 assert(model == TLSModel::LocalExec); 2750 ARMConstantPoolValue *CPV = 2751 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 2752 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2753 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 2754 Offset = DAG.getLoad( 2755 PtrVT, dl, Chain, Offset, 2756 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2757 false, false, 0); 2758 } 2759 2760 // The address of the thread local variable is the add of the thread 2761 // pointer with the offset of the variable. 2762 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 2763 } 2764 2765 SDValue 2766 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 2767 if (Subtarget->isTargetDarwin()) 2768 return LowerGlobalTLSAddressDarwin(Op, DAG); 2769 2770 if (Subtarget->isTargetWindows()) 2771 return LowerGlobalTLSAddressWindows(Op, DAG); 2772 2773 // TODO: implement the "local dynamic" model 2774 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 2775 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2776 if (DAG.getTarget().Options.EmulatedTLS) 2777 return LowerToTLSEmulatedModel(GA, DAG); 2778 2779 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 2780 2781 switch (model) { 2782 case TLSModel::GeneralDynamic: 2783 case TLSModel::LocalDynamic: 2784 return LowerToTLSGeneralDynamicModel(GA, DAG); 2785 case TLSModel::InitialExec: 2786 case TLSModel::LocalExec: 2787 return LowerToTLSExecModels(GA, DAG, model); 2788 } 2789 llvm_unreachable("bogus TLS model"); 2790 } 2791 2792 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 2793 SelectionDAG &DAG) const { 2794 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2795 SDLoc dl(Op); 2796 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 2797 const TargetMachine &TM = getTargetMachine(); 2798 if (isPositionIndependent()) { 2799 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 2800 2801 MachineFunction &MF = DAG.getMachineFunction(); 2802 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2803 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2804 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2805 SDLoc dl(Op); 2806 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 2807 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( 2808 GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, 2809 UseGOT_PREL ? 
ARMCP::GOT_PREL : ARMCP::no_modifier, 2810 /*AddCurrentAddress=*/UseGOT_PREL); 2811 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2812 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2813 SDValue Result = DAG.getLoad( 2814 PtrVT, dl, DAG.getEntryNode(), CPAddr, 2815 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2816 false, false, 0); 2817 SDValue Chain = Result.getValue(1); 2818 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2819 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 2820 if (UseGOT_PREL) 2821 Result = DAG.getLoad(PtrVT, dl, Chain, Result, 2822 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2823 false, false, false, 0); 2824 return Result; 2825 } 2826 2827 // If we have T2 ops, we can materialize the address directly via movt/movw 2828 // pair. This is always cheaper. 2829 if (Subtarget->useMovt(DAG.getMachineFunction())) { 2830 ++NumMovwMovt; 2831 // FIXME: Once remat is capable of dealing with instructions with register 2832 // operands, expand this into two nodes. 2833 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 2834 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 2835 } else { 2836 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 2837 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2838 return DAG.getLoad( 2839 PtrVT, dl, DAG.getEntryNode(), CPAddr, 2840 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2841 false, false, 0); 2842 } 2843 } 2844 2845 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 2846 SelectionDAG &DAG) const { 2847 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2848 SDLoc dl(Op); 2849 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 2850 2851 if (Subtarget->useMovt(DAG.getMachineFunction())) 2852 ++NumMovwMovt; 2853 2854 // FIXME: Once remat is capable of dealing with instructions with register 2855 // operands, expand this into multiple nodes 2856 unsigned Wrapper = 2857 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; 2858 2859 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 2860 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 2861 2862 if (Subtarget->isGVIndirectSymbol(GV)) 2863 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 2864 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2865 false, false, false, 0); 2866 return Result; 2867 } 2868 2869 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 2870 SelectionDAG &DAG) const { 2871 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 2872 assert(Subtarget->useMovt(DAG.getMachineFunction()) && 2873 "Windows on ARM expects to use movw/movt"); 2874 2875 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 2876 const ARMII::TOF TargetFlags = 2877 (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); 2878 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2879 SDValue Result; 2880 SDLoc DL(Op); 2881 2882 ++NumMovwMovt; 2883 2884 // FIXME: Once remat is capable of dealing with instructions with register 2885 // operands, expand this into two nodes. 
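  // For reference, the materialization is roughly:
  //   movw r0, :lower16:sym
  //   movt r0, :upper16:sym
  // For a dllimport global the movw/movt pair instead builds the address of
  // the import-table slot (conventionally __imp_<sym>), and the extra load
  // below fetches the real address out of that slot.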
2886 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 2887 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, 2888 TargetFlags)); 2889 if (GV->hasDLLImportStorageClass()) 2890 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 2891 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2892 false, false, false, 0); 2893 return Result; 2894 } 2895 2896 SDValue 2897 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 2898 SDLoc dl(Op); 2899 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 2900 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 2901 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 2902 Op.getOperand(1), Val); 2903 } 2904 2905 SDValue 2906 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 2907 SDLoc dl(Op); 2908 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 2909 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 2910 } 2911 2912 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 2913 SelectionDAG &DAG) const { 2914 SDLoc dl(Op); 2915 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 2916 Op.getOperand(0)); 2917 } 2918 2919 SDValue 2920 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 2921 const ARMSubtarget *Subtarget) const { 2922 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2923 SDLoc dl(Op); 2924 switch (IntNo) { 2925 default: return SDValue(); // Don't custom lower most intrinsics. 2926 case Intrinsic::arm_rbit: { 2927 assert(Op.getOperand(1).getValueType() == MVT::i32 && 2928 "RBIT intrinsic must have i32 type!"); 2929 return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); 2930 } 2931 case Intrinsic::thread_pointer: { 2932 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2933 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 2934 } 2935 case Intrinsic::eh_sjlj_lsda: { 2936 MachineFunction &MF = DAG.getMachineFunction(); 2937 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2938 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2939 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2940 SDValue CPAddr; 2941 bool IsPositionIndependent = isPositionIndependent(); 2942 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 2943 ARMConstantPoolValue *CPV = 2944 ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex, 2945 ARMCP::CPLSDA, PCAdj); 2946 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2947 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2948 SDValue Result = DAG.getLoad( 2949 PtrVT, dl, DAG.getEntryNode(), CPAddr, 2950 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2951 false, false, 0); 2952 2953 if (IsPositionIndependent) { 2954 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2955 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 2956 } 2957 return Result; 2958 } 2959 case Intrinsic::arm_neon_vmulls: 2960 case Intrinsic::arm_neon_vmullu: { 2961 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 2962 ? ARMISD::VMULLs : ARMISD::VMULLu; 2963 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2964 Op.getOperand(1), Op.getOperand(2)); 2965 } 2966 case Intrinsic::arm_neon_vminnm: 2967 case Intrinsic::arm_neon_vmaxnm: { 2968 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 2969 ? 
ISD::FMINNUM : ISD::FMAXNUM; 2970 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2971 Op.getOperand(1), Op.getOperand(2)); 2972 } 2973 case Intrinsic::arm_neon_vminu: 2974 case Intrinsic::arm_neon_vmaxu: { 2975 if (Op.getValueType().isFloatingPoint()) 2976 return SDValue(); 2977 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 2978 ? ISD::UMIN : ISD::UMAX; 2979 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2980 Op.getOperand(1), Op.getOperand(2)); 2981 } 2982 case Intrinsic::arm_neon_vmins: 2983 case Intrinsic::arm_neon_vmaxs: { 2984 // v{min,max}s is overloaded between signed integers and floats. 2985 if (!Op.getValueType().isFloatingPoint()) { 2986 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 2987 ? ISD::SMIN : ISD::SMAX; 2988 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2989 Op.getOperand(1), Op.getOperand(2)); 2990 } 2991 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 2992 ? ISD::FMINNAN : ISD::FMAXNAN; 2993 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2994 Op.getOperand(1), Op.getOperand(2)); 2995 } 2996 } 2997 } 2998 2999 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 3000 const ARMSubtarget *Subtarget) { 3001 // FIXME: handle "fence singlethread" more efficiently. 3002 SDLoc dl(Op); 3003 if (!Subtarget->hasDataBarrier()) { 3004 // Some ARMv6 cpus can support data barriers with an mcr instruction. 3005 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 3006 // here. 3007 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 3008 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 3009 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 3010 DAG.getConstant(0, dl, MVT::i32)); 3011 } 3012 3013 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 3014 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 3015 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 3016 if (Subtarget->isMClass()) { 3017 // Only a full system barrier exists in the M-class architectures. 3018 Domain = ARM_MB::SY; 3019 } else if (Subtarget->preferISHSTBarriers() && 3020 Ord == AtomicOrdering::Release) { 3021 // Swift happens to implement ISHST barriers in a way that's compatible with 3022 // Release semantics but weaker than ISH so we'd be fools not to use 3023 // it. Beware: other processors probably don't! 3024 Domain = ARM_MB::ISHST; 3025 } 3026 3027 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 3028 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 3029 DAG.getConstant(Domain, dl, MVT::i32)); 3030 } 3031 3032 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 3033 const ARMSubtarget *Subtarget) { 3034 // ARM pre v5TE and Thumb1 does not have preload instructions. 3035 if (!(Subtarget->isThumb2() || 3036 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 3037 // Just preserve the chain. 3038 return Op.getOperand(0); 3039 3040 SDLoc dl(Op); 3041 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 3042 if (!isRead && 3043 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 3044 // ARMv7 with MP extension has PLDW. 3045 return Op.getOperand(0); 3046 3047 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3048 if (Subtarget->isThumb()) { 3049 // Invert the bits. 
3050 isRead = ~isRead & 1; 3051 isData = ~isData & 1; 3052 } 3053 3054 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 3055 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 3056 DAG.getConstant(isData, dl, MVT::i32)); 3057 } 3058 3059 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 3060 MachineFunction &MF = DAG.getMachineFunction(); 3061 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 3062 3063 // vastart just stores the address of the VarArgsFrameIndex slot into the 3064 // memory location argument. 3065 SDLoc dl(Op); 3066 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 3067 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3068 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3069 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 3070 MachinePointerInfo(SV), false, false, 0); 3071 } 3072 3073 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 3074 CCValAssign &NextVA, 3075 SDValue &Root, 3076 SelectionDAG &DAG, 3077 const SDLoc &dl) const { 3078 MachineFunction &MF = DAG.getMachineFunction(); 3079 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3080 3081 const TargetRegisterClass *RC; 3082 if (AFI->isThumb1OnlyFunction()) 3083 RC = &ARM::tGPRRegClass; 3084 else 3085 RC = &ARM::GPRRegClass; 3086 3087 // Transform the arguments stored in physical registers into virtual ones. 3088 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3089 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3090 3091 SDValue ArgValue2; 3092 if (NextVA.isMemLoc()) { 3093 MachineFrameInfo *MFI = MF.getFrameInfo(); 3094 int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); 3095 3096 // Create load node to retrieve arguments from the stack. 3097 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3098 ArgValue2 = DAG.getLoad( 3099 MVT::i32, dl, Root, FIN, 3100 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, 3101 false, false, 0); 3102 } else { 3103 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 3104 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3105 } 3106 if (!Subtarget->isLittle()) 3107 std::swap (ArgValue, ArgValue2); 3108 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 3109 } 3110 3111 // The remaining GPRs hold either the beginning of variable-argument 3112 // data, or the beginning of an aggregate passed by value (usually 3113 // byval). Either way, we allocate stack slots adjacent to the data 3114 // provided by our caller, and store the unallocated registers there. 3115 // If this is a variadic function, the va_list pointer will begin with 3116 // these values; otherwise, this reassembles a (byval) structure that 3117 // was split between registers and memory. 3118 // Return: The frame index registers were stored into. 3119 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 3120 const SDLoc &dl, SDValue &Chain, 3121 const Value *OrigArg, 3122 unsigned InRegsParamRecordIdx, 3123 int ArgOffset, unsigned ArgSize) const { 3124 // Currently, two use-cases possible: 3125 // Case #1. Non-var-args function, and we meet first byval parameter. 3126 // Setup first unallocated register as first byval register; 3127 // eat all remained registers 3128 // (these two actions are performed by HandleByVal method). 3129 // Then, here, we initialize stack frame with 3130 // "store-reg" instructions. 3131 // Case #2. 
Var-args function, that doesn't contain byval parameters. 3132 // The same: eat all remained unallocated registers, 3133 // initialize stack frame. 3134 3135 MachineFunction &MF = DAG.getMachineFunction(); 3136 MachineFrameInfo *MFI = MF.getFrameInfo(); 3137 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3138 unsigned RBegin, REnd; 3139 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 3140 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 3141 } else { 3142 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3143 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 3144 REnd = ARM::R4; 3145 } 3146 3147 if (REnd != RBegin) 3148 ArgOffset = -4 * (ARM::R4 - RBegin); 3149 3150 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3151 int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false); 3152 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 3153 3154 SmallVector<SDValue, 4> MemOps; 3155 const TargetRegisterClass *RC = 3156 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 3157 3158 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 3159 unsigned VReg = MF.addLiveIn(Reg, RC); 3160 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 3161 SDValue Store = 3162 DAG.getStore(Val.getValue(1), dl, Val, FIN, 3163 MachinePointerInfo(OrigArg, 4 * i), false, false, 0); 3164 MemOps.push_back(Store); 3165 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 3166 } 3167 3168 if (!MemOps.empty()) 3169 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3170 return FrameIndex; 3171 } 3172 3173 // Setup stack frame, the va_list pointer will start from. 3174 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 3175 const SDLoc &dl, SDValue &Chain, 3176 unsigned ArgOffset, 3177 unsigned TotalArgRegsSaveSize, 3178 bool ForceMutable) const { 3179 MachineFunction &MF = DAG.getMachineFunction(); 3180 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3181 3182 // Try to store any remaining integer argument regs 3183 // to their spots on the stack so that they may be loaded by dereferencing 3184 // the result of va_next. 3185 // If there is no regs to be stored, just point address after last 3186 // argument passed via stack. 3187 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 3188 CCInfo.getInRegsParamsCount(), 3189 CCInfo.getNextStackOffset(), 4); 3190 AFI->setVarArgsFrameIndex(FrameIndex); 3191 } 3192 3193 SDValue ARMTargetLowering::LowerFormalArguments( 3194 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3195 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3196 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3197 MachineFunction &MF = DAG.getMachineFunction(); 3198 MachineFrameInfo *MFI = MF.getFrameInfo(); 3199 3200 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3201 3202 // Assign locations to all of the incoming arguments. 3203 SmallVector<CCValAssign, 16> ArgLocs; 3204 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3205 *DAG.getContext(), Prologue); 3206 CCInfo.AnalyzeFormalArguments(Ins, 3207 CCAssignFnForNode(CallConv, /* Return*/ false, 3208 isVarArg)); 3209 3210 SmallVector<SDValue, 16> ArgValues; 3211 SDValue ArgValue; 3212 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 3213 unsigned CurArgIdx = 0; 3214 3215 // Initially ArgRegsSaveSize is zero. 3216 // Then we increase this value each time we meet byval parameter. 
3217 // We also increase this value in case of varargs function. 3218 AFI->setArgRegsSaveSize(0); 3219 3220 // Calculate the amount of stack space that we need to allocate to store 3221 // byval and variadic arguments that are passed in registers. 3222 // We need to know this before we allocate the first byval or variadic 3223 // argument, as they will be allocated a stack slot below the CFA (Canonical 3224 // Frame Address, the stack pointer at entry to the function). 3225 unsigned ArgRegBegin = ARM::R4; 3226 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3227 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 3228 break; 3229 3230 CCValAssign &VA = ArgLocs[i]; 3231 unsigned Index = VA.getValNo(); 3232 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 3233 if (!Flags.isByVal()) 3234 continue; 3235 3236 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 3237 unsigned RBegin, REnd; 3238 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 3239 ArgRegBegin = std::min(ArgRegBegin, RBegin); 3240 3241 CCInfo.nextInRegsParam(); 3242 } 3243 CCInfo.rewindByValRegsInfo(); 3244 3245 int lastInsIndex = -1; 3246 if (isVarArg && MFI->hasVAStart()) { 3247 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3248 if (RegIdx != array_lengthof(GPRArgRegs)) 3249 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 3250 } 3251 3252 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 3253 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 3254 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3255 3256 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3257 CCValAssign &VA = ArgLocs[i]; 3258 if (Ins[VA.getValNo()].isOrigArg()) { 3259 std::advance(CurOrigArg, 3260 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 3261 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 3262 } 3263 // Arguments stored in registers. 3264 if (VA.isRegLoc()) { 3265 EVT RegVT = VA.getLocVT(); 3266 3267 if (VA.needsCustom()) { 3268 // f64 and vector types are split up into multiple registers or 3269 // combinations of registers and stack slots. 3270 if (VA.getLocVT() == MVT::v2f64) { 3271 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 3272 Chain, DAG, dl); 3273 VA = ArgLocs[++i]; // skip ahead to next loc 3274 SDValue ArgValue2; 3275 if (VA.isMemLoc()) { 3276 int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); 3277 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3278 ArgValue2 = DAG.getLoad( 3279 MVT::f64, dl, Chain, FIN, 3280 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3281 false, false, false, 0); 3282 } else { 3283 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 3284 Chain, DAG, dl); 3285 } 3286 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 3287 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3288 ArgValue, ArgValue1, 3289 DAG.getIntPtrConstant(0, dl)); 3290 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3291 ArgValue, ArgValue2, 3292 DAG.getIntPtrConstant(1, dl)); 3293 } else 3294 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 3295 3296 } else { 3297 const TargetRegisterClass *RC; 3298 3299 if (RegVT == MVT::f32) 3300 RC = &ARM::SPRRegClass; 3301 else if (RegVT == MVT::f64) 3302 RC = &ARM::DPRRegClass; 3303 else if (RegVT == MVT::v2f64) 3304 RC = &ARM::QPRRegClass; 3305 else if (RegVT == MVT::i32) 3306 RC = AFI->isThumb1OnlyFunction() ? 
&ARM::tGPRRegClass 3307 : &ARM::GPRRegClass; 3308 else 3309 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 3310 3311 // Transform the arguments in physical registers into virtual ones. 3312 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3313 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 3314 } 3315 3316 // If this is an 8 or 16-bit value, it is really passed promoted 3317 // to 32 bits. Insert an assert[sz]ext to capture this, then 3318 // truncate to the right size. 3319 switch (VA.getLocInfo()) { 3320 default: llvm_unreachable("Unknown loc info!"); 3321 case CCValAssign::Full: break; 3322 case CCValAssign::BCvt: 3323 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 3324 break; 3325 case CCValAssign::SExt: 3326 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 3327 DAG.getValueType(VA.getValVT())); 3328 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3329 break; 3330 case CCValAssign::ZExt: 3331 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 3332 DAG.getValueType(VA.getValVT())); 3333 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3334 break; 3335 } 3336 3337 InVals.push_back(ArgValue); 3338 3339 } else { // VA.isRegLoc() 3340 3341 // sanity check 3342 assert(VA.isMemLoc()); 3343 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 3344 3345 int index = VA.getValNo(); 3346 3347 // Some Ins[] entries become multiple ArgLoc[] entries. 3348 // Process them only once. 3349 if (index != lastInsIndex) 3350 { 3351 ISD::ArgFlagsTy Flags = Ins[index].Flags; 3352 // FIXME: For now, all byval parameter objects are marked mutable. 3353 // This can be changed with more analysis. 3354 // In case of tail call optimization mark all arguments mutable. 3355 // Since they could be overwritten by lowering of arguments in case of 3356 // a tail call. 3357 if (Flags.isByVal()) { 3358 assert(Ins[index].isOrigArg() && 3359 "Byval arguments cannot be implicit"); 3360 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 3361 3362 int FrameIndex = StoreByValRegs( 3363 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 3364 VA.getLocMemOffset(), Flags.getByValSize()); 3365 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 3366 CCInfo.nextInRegsParam(); 3367 } else { 3368 unsigned FIOffset = VA.getLocMemOffset(); 3369 int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 3370 FIOffset, true); 3371 3372 // Create load nodes to retrieve arguments from the stack. 3373 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3374 InVals.push_back(DAG.getLoad( 3375 VA.getValVT(), dl, Chain, FIN, 3376 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3377 false, false, false, 0)); 3378 } 3379 lastInsIndex = index; 3380 } 3381 } 3382 } 3383 3384 // varargs 3385 if (isVarArg && MFI->hasVAStart()) 3386 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 3387 CCInfo.getNextStackOffset(), 3388 TotalArgRegsSaveSize); 3389 3390 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 3391 3392 return Chain; 3393 } 3394 3395 /// isFloatingPointZero - Return true if this is +0.0. 3396 static bool isFloatingPointZero(SDValue Op) { 3397 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 3398 return CFP->getValueAPF().isPosZero(); 3399 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 3400 // Maybe this has already been legalized into the constant pool? 
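    // That is, match a load whose address is roughly
    //   (load (ARMISD::Wrapper (targetconstantpool <+0.0>)))
    // and peek through the wrapper at the pooled ConstantFP itself.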
3401 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 3402 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 3403 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 3404 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 3405 return CFP->getValueAPF().isPosZero(); 3406 } 3407 } else if (Op->getOpcode() == ISD::BITCAST && 3408 Op->getValueType(0) == MVT::f64) { 3409 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 3410 // created by LowerConstantFP(). 3411 SDValue BitcastOp = Op->getOperand(0); 3412 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 3413 isNullConstant(BitcastOp->getOperand(0))) 3414 return true; 3415 } 3416 return false; 3417 } 3418 3419 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 3420 /// the given operands. 3421 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 3422 SDValue &ARMcc, SelectionDAG &DAG, 3423 const SDLoc &dl) const { 3424 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 3425 unsigned C = RHSC->getZExtValue(); 3426 if (!isLegalICmpImmediate(C)) { 3427 // Constant does not fit, try adjusting it by one? 3428 switch (CC) { 3429 default: break; 3430 case ISD::SETLT: 3431 case ISD::SETGE: 3432 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 3433 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 3434 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3435 } 3436 break; 3437 case ISD::SETULT: 3438 case ISD::SETUGE: 3439 if (C != 0 && isLegalICmpImmediate(C-1)) { 3440 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 3441 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3442 } 3443 break; 3444 case ISD::SETLE: 3445 case ISD::SETGT: 3446 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 3447 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 3448 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 3449 } 3450 break; 3451 case ISD::SETULE: 3452 case ISD::SETUGT: 3453 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 3454 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 3455 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 3456 } 3457 break; 3458 } 3459 } 3460 } 3461 3462 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 3463 ARMISD::NodeType CompareType; 3464 switch (CondCode) { 3465 default: 3466 CompareType = ARMISD::CMP; 3467 break; 3468 case ARMCC::EQ: 3469 case ARMCC::NE: 3470 // Uses only Z Flag 3471 CompareType = ARMISD::CMPZ; 3472 break; 3473 } 3474 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 3475 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 3476 } 3477 3478 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 3479 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 3480 SelectionDAG &DAG, const SDLoc &dl) const { 3481 assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); 3482 SDValue Cmp; 3483 if (!isFloatingPointZero(RHS)) 3484 Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); 3485 else 3486 Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); 3487 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 3488 } 3489 3490 /// duplicateCmp - Glue values can have only one use, so this function 3491 /// duplicates a comparison node. 
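/// Glue results cannot be shared between users, so when the same flags are
/// needed twice (e.g. for both halves of an f64 CMOV in getCMOV below), an
/// identical CMP/CMPZ or CMPFP(+FMSTAT) chain is rebuilt rather than the
/// existing node being reused.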
3492 SDValue 3493 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 3494 unsigned Opc = Cmp.getOpcode(); 3495 SDLoc DL(Cmp); 3496 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 3497 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 3498 3499 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 3500 Cmp = Cmp.getOperand(0); 3501 Opc = Cmp.getOpcode(); 3502 if (Opc == ARMISD::CMPFP) 3503 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 3504 else { 3505 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 3506 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 3507 } 3508 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 3509 } 3510 3511 std::pair<SDValue, SDValue> 3512 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 3513 SDValue &ARMcc) const { 3514 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 3515 3516 SDValue Value, OverflowCmp; 3517 SDValue LHS = Op.getOperand(0); 3518 SDValue RHS = Op.getOperand(1); 3519 SDLoc dl(Op); 3520 3521 // FIXME: We are currently always generating CMPs because we don't support 3522 // generating CMN through the backend. This is not as good as the natural 3523 // CMP case because it causes a register dependency and cannot be folded 3524 // later. 3525 3526 switch (Op.getOpcode()) { 3527 default: 3528 llvm_unreachable("Unknown overflow instruction!"); 3529 case ISD::SADDO: 3530 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3531 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 3532 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3533 break; 3534 case ISD::UADDO: 3535 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3536 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 3537 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3538 break; 3539 case ISD::SSUBO: 3540 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3541 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3542 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3543 break; 3544 case ISD::USUBO: 3545 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3546 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3547 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3548 break; 3549 } // switch (...) 3550 3551 return std::make_pair(Value, OverflowCmp); 3552 } 3553 3554 3555 SDValue 3556 ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 3557 // Let legalize expand this if it isn't a legal type yet. 3558 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 3559 return SDValue(); 3560 3561 SDValue Value, OverflowCmp; 3562 SDValue ARMcc; 3563 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 3564 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3565 SDLoc dl(Op); 3566 // We use 0 and 1 as false and true values. 
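  // As a rough sketch, (saddo x, y) becomes something like
  //   value    = (add x, y)
  //   overflow = (ARMISD::CMOV 1, 0, VC, CPSR, (ARMISD::CMP value, x))
  // i.e. the CMOV keeps 1 unless the "no overflow" condition chosen by
  // getARMXALUOOp holds, in which case it yields 0.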
3567 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3568 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3569 EVT VT = Op.getValueType(); 3570 3571 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 3572 ARMcc, CCR, OverflowCmp); 3573 3574 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 3575 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 3576 } 3577 3578 3579 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 3580 SDValue Cond = Op.getOperand(0); 3581 SDValue SelectTrue = Op.getOperand(1); 3582 SDValue SelectFalse = Op.getOperand(2); 3583 SDLoc dl(Op); 3584 unsigned Opc = Cond.getOpcode(); 3585 3586 if (Cond.getResNo() == 1 && 3587 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 3588 Opc == ISD::USUBO)) { 3589 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 3590 return SDValue(); 3591 3592 SDValue Value, OverflowCmp; 3593 SDValue ARMcc; 3594 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 3595 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3596 EVT VT = Op.getValueType(); 3597 3598 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 3599 OverflowCmp, DAG); 3600 } 3601 3602 // Convert: 3603 // 3604 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 3605 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 3606 // 3607 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 3608 const ConstantSDNode *CMOVTrue = 3609 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 3610 const ConstantSDNode *CMOVFalse = 3611 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 3612 3613 if (CMOVTrue && CMOVFalse) { 3614 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 3615 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 3616 3617 SDValue True; 3618 SDValue False; 3619 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 3620 True = SelectTrue; 3621 False = SelectFalse; 3622 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 3623 True = SelectFalse; 3624 False = SelectTrue; 3625 } 3626 3627 if (True.getNode() && False.getNode()) { 3628 EVT VT = Op.getValueType(); 3629 SDValue ARMcc = Cond.getOperand(2); 3630 SDValue CCR = Cond.getOperand(3); 3631 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 3632 assert(True.getValueType() == VT); 3633 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 3634 } 3635 } 3636 } 3637 3638 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 3639 // undefined bits before doing a full-word comparison with zero. 3640 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 3641 DAG.getConstant(1, dl, Cond.getValueType())); 3642 3643 return DAG.getSelectCC(dl, Cond, 3644 DAG.getConstant(0, dl, Cond.getValueType()), 3645 SelectTrue, SelectFalse, ISD::SETNE); 3646 } 3647 3648 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 3649 bool &swpCmpOps, bool &swpVselOps) { 3650 // Start by selecting the GE condition code for opcodes that return true for 3651 // 'equality' 3652 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 3653 CC == ISD::SETULE) 3654 CondCode = ARMCC::GE; 3655 3656 // and GT for opcodes that return false for 'equality'. 3657 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 3658 CC == ISD::SETULT) 3659 CondCode = ARMCC::GT; 3660 3661 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 3662 // to swap the compare operands. 
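  // For example, (a setolt b) is handled as (b setogt a): the compare operands
  // are swapped and the GT condition code is used, which preserves the meaning
  // of the original comparison.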
3663 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 3664 CC == ISD::SETULT) 3665 swpCmpOps = true; 3666 3667 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 3668 // If we have an unordered opcode, we need to swap the operands to the VSEL 3669 // instruction (effectively negating the condition). 3670 // 3671 // This also has the effect of swapping which one of 'less' or 'greater' 3672 // returns true, so we also swap the compare operands. It also switches 3673 // whether we return true for 'equality', so we compensate by picking the 3674 // opposite condition code to our original choice. 3675 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 3676 CC == ISD::SETUGT) { 3677 swpCmpOps = !swpCmpOps; 3678 swpVselOps = !swpVselOps; 3679 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 3680 } 3681 3682 // 'ordered' is 'anything but unordered', so use the VS condition code and 3683 // swap the VSEL operands. 3684 if (CC == ISD::SETO) { 3685 CondCode = ARMCC::VS; 3686 swpVselOps = true; 3687 } 3688 3689 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 3690 // code and swap the VSEL operands. 3691 if (CC == ISD::SETUNE) { 3692 CondCode = ARMCC::EQ; 3693 swpVselOps = true; 3694 } 3695 } 3696 3697 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 3698 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 3699 SDValue Cmp, SelectionDAG &DAG) const { 3700 if (Subtarget->isFPOnlySP() && VT == MVT::f64) { 3701 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 3702 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 3703 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 3704 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 3705 3706 SDValue TrueLow = TrueVal.getValue(0); 3707 SDValue TrueHigh = TrueVal.getValue(1); 3708 SDValue FalseLow = FalseVal.getValue(0); 3709 SDValue FalseHigh = FalseVal.getValue(1); 3710 3711 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 3712 ARMcc, CCR, Cmp); 3713 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 3714 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 3715 3716 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 3717 } else { 3718 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 3719 Cmp); 3720 } 3721 } 3722 3723 static bool isGTorGE(ISD::CondCode CC) { 3724 return CC == ISD::SETGT || CC == ISD::SETGE; 3725 } 3726 3727 static bool isLTorLE(ISD::CondCode CC) { 3728 return CC == ISD::SETLT || CC == ISD::SETLE; 3729 } 3730 3731 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 3732 // All of these conditions (and their <= and >= counterparts) will do: 3733 // x < k ? k : x 3734 // x > k ? x : k 3735 // k < x ? x : k 3736 // k > x ? k : x 3737 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 3738 const SDValue TrueVal, const SDValue FalseVal, 3739 const ISD::CondCode CC, const SDValue K) { 3740 return (isGTorGE(CC) && 3741 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 3742 (isLTorLE(CC) && 3743 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 3744 } 3745 3746 // Similar to isLowerSaturate(), but checks for upper-saturating conditions. 
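// All of these conditions (and their <= and >= counterparts) will do:
//   x > k ? k : x
//   x < k ? x : k
//   k > x ? x : k
//   k < x ? k : x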
3747 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, 3748 const SDValue TrueVal, const SDValue FalseVal, 3749 const ISD::CondCode CC, const SDValue K) { 3750 return (isGTorGE(CC) && 3751 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || 3752 (isLTorLE(CC) && 3753 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); 3754 } 3755 3756 // Check if two chained conditionals could be converted into SSAT. 3757 // 3758 // SSAT can replace a set of two conditional selectors that bound a number to an 3759 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 3760 // 3761 // x < -k ? -k : (x > k ? k : x) 3762 // x < -k ? -k : (x < k ? x : k) 3763 // x > -k ? (x > k ? k : x) : -k 3764 // x < k ? (x < -k ? -k : x) : k 3765 // etc. 3766 // 3767 // It returns true if the conversion can be done, false otherwise. 3768 // Additionally, the variable is returned in parameter V and the constant in K. 3769 static bool isSaturatingConditional(const SDValue &Op, SDValue &V, 3770 uint64_t &K) { 3771 3772 SDValue LHS1 = Op.getOperand(0); 3773 SDValue RHS1 = Op.getOperand(1); 3774 SDValue TrueVal1 = Op.getOperand(2); 3775 SDValue FalseVal1 = Op.getOperand(3); 3776 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 3777 3778 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; 3779 if (Op2.getOpcode() != ISD::SELECT_CC) 3780 return false; 3781 3782 SDValue LHS2 = Op2.getOperand(0); 3783 SDValue RHS2 = Op2.getOperand(1); 3784 SDValue TrueVal2 = Op2.getOperand(2); 3785 SDValue FalseVal2 = Op2.getOperand(3); 3786 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 3787 3788 // Find out which are the constants and which are the variables 3789 // in each conditional 3790 SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) 3791 ? &RHS1 3792 : NULL; 3793 SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) 3794 ? &RHS2 3795 : NULL; 3796 SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; 3797 SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; 3798 SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; 3799 SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; 3800 3801 // We must detect cases where the original operations worked with 16- or 3802 // 8-bit values. In such case, V2Tmp != V2 because the comparison operations 3803 // must work with sign-extended values but the select operations return 3804 // the original non-extended value. 3805 SDValue V2TmpReg = V2Tmp; 3806 if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) 3807 V2TmpReg = V2Tmp->getOperand(0); 3808 3809 // Check that the registers and the constants have the correct values 3810 // in both conditionals 3811 if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || 3812 V2TmpReg != V2) 3813 return false; 3814 3815 // Figure out which conditional is saturating the lower/upper bound. 3816 const SDValue *LowerCheckOp = 3817 isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 3818 ? &Op 3819 : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2 3820 : NULL; 3821 const SDValue *UpperCheckOp = 3822 isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 3823 ? &Op 3824 : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? 
&Op2 3825 : NULL; 3826 3827 if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) 3828 return false; 3829 3830 // Check that the constant in the lower-bound check is 3831 // the opposite of the constant in the upper-bound check 3832 // in 1's complement. 3833 int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); 3834 int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); 3835 int64_t PosVal = std::max(Val1, Val2); 3836 3837 if (((Val1 > Val2 && UpperCheckOp == &Op) || 3838 (Val1 < Val2 && UpperCheckOp == &Op2)) && 3839 Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) { 3840 3841 V = V2; 3842 K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive 3843 return true; 3844 } 3845 3846 return false; 3847 } 3848 3849 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 3850 3851 EVT VT = Op.getValueType(); 3852 SDLoc dl(Op); 3853 3854 // Try to convert two saturating conditional selects into a single SSAT 3855 SDValue SatValue; 3856 uint64_t SatConstant; 3857 if (isSaturatingConditional(Op, SatValue, SatConstant)) 3858 return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, 3859 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 3860 3861 SDValue LHS = Op.getOperand(0); 3862 SDValue RHS = Op.getOperand(1); 3863 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 3864 SDValue TrueVal = Op.getOperand(2); 3865 SDValue FalseVal = Op.getOperand(3); 3866 3867 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 3868 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 3869 dl); 3870 3871 // If softenSetCCOperands only returned one value, we should compare it to 3872 // zero. 3873 if (!RHS.getNode()) { 3874 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 3875 CC = ISD::SETNE; 3876 } 3877 } 3878 3879 if (LHS.getValueType() == MVT::i32) { 3880 // Try to generate VSEL on ARMv8. 3881 // The VSEL instruction can't use all the usual ARM condition 3882 // codes: it only has two bits to select the condition code, so it's 3883 // constrained to use only GE, GT, VS and EQ. 3884 // 3885 // To implement all the various ISD::SETXXX opcodes, we sometimes need to 3886 // swap the operands of the previous compare instruction (effectively 3887 // inverting the compare condition, swapping 'less' and 'greater') and 3888 // sometimes need to swap the operands to the VSEL (which inverts the 3889 // condition in the sense of firing whenever the previous condition didn't) 3890 if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || 3891 TrueVal.getValueType() == MVT::f64)) { 3892 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 3893 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || 3894 CondCode == ARMCC::VC || CondCode == ARMCC::NE) { 3895 CC = ISD::getSetCCInverse(CC, true); 3896 std::swap(TrueVal, FalseVal); 3897 } 3898 } 3899 3900 SDValue ARMcc; 3901 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3902 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 3903 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 3904 } 3905 3906 ARMCC::CondCodes CondCode, CondCode2; 3907 FPCCToARMCC(CC, CondCode, CondCode2); 3908 3909 // Try to generate VMAXNM/VMINNM on ARMv8. 
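  // checkVSELConstraints below canonicalizes the condition to one of GE, GT,
  // VS or EQ (swapping the compare and/or select operands as needed) so that
  // the CMOV built afterwards can be turned into VSEL / VMAXNM / VMINNM.
  // For instance, (select_cc x, y, x, y, setogt) - essentially an fmax
  // idiom - already satisfies these constraints and passes through unchanged.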
3910 if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || 3911 TrueVal.getValueType() == MVT::f64)) { 3912 bool swpCmpOps = false; 3913 bool swpVselOps = false; 3914 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 3915 3916 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 3917 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 3918 if (swpCmpOps) 3919 std::swap(LHS, RHS); 3920 if (swpVselOps) 3921 std::swap(TrueVal, FalseVal); 3922 } 3923 } 3924 3925 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 3926 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 3927 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3928 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 3929 if (CondCode2 != ARMCC::AL) { 3930 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 3931 // FIXME: Needs another CMP because flag can have but one use. 3932 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 3933 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 3934 } 3935 return Result; 3936 } 3937 3938 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 3939 /// to morph to an integer compare sequence. 3940 static bool canChangeToInt(SDValue Op, bool &SeenZero, 3941 const ARMSubtarget *Subtarget) { 3942 SDNode *N = Op.getNode(); 3943 if (!N->hasOneUse()) 3944 // Otherwise it requires moving the value from fp to integer registers. 3945 return false; 3946 if (!N->getNumValues()) 3947 return false; 3948 EVT VT = Op.getValueType(); 3949 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 3950 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 3951 // vmrs are very slow, e.g. cortex-a8. 3952 return false; 3953 3954 if (isFloatingPointZero(Op)) { 3955 SeenZero = true; 3956 return true; 3957 } 3958 return ISD::isNormalLoad(N); 3959 } 3960 3961 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 3962 if (isFloatingPointZero(Op)) 3963 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 3964 3965 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 3966 return DAG.getLoad(MVT::i32, SDLoc(Op), 3967 Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), 3968 Ld->isVolatile(), Ld->isNonTemporal(), 3969 Ld->isInvariant(), Ld->getAlignment()); 3970 3971 llvm_unreachable("Unknown VFP cmp argument!"); 3972 } 3973 3974 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 3975 SDValue &RetVal1, SDValue &RetVal2) { 3976 SDLoc dl(Op); 3977 3978 if (isFloatingPointZero(Op)) { 3979 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 3980 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 3981 return; 3982 } 3983 3984 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 3985 SDValue Ptr = Ld->getBasePtr(); 3986 RetVal1 = DAG.getLoad(MVT::i32, dl, 3987 Ld->getChain(), Ptr, 3988 Ld->getPointerInfo(), 3989 Ld->isVolatile(), Ld->isNonTemporal(), 3990 Ld->isInvariant(), Ld->getAlignment()); 3991 3992 EVT PtrType = Ptr.getValueType(); 3993 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 3994 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 3995 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 3996 RetVal2 = DAG.getLoad(MVT::i32, dl, 3997 Ld->getChain(), NewPtr, 3998 Ld->getPointerInfo().getWithOffset(4), 3999 Ld->isVolatile(), Ld->isNonTemporal(), 4000 Ld->isInvariant(), NewAlign); 4001 return; 4002 } 4003 4004 llvm_unreachable("Unknown VFP cmp argument!"); 4005 } 4006 4007 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 4008 /// f32 and even f64 comparisons to integer ones. 
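/// For example, an (f32 a) == 0.0f branch can be rewritten to test
/// ((bitcast a to i32) & 0x7fffffff) == 0, which masks off the sign bit so
/// that -0.0 still compares equal; the f64 variant compares the two i32
/// halves via ARMISD::BCC_i64. Only EQ/NE-style comparisons where one operand
/// is a floating-point zero are handled.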
4009 SDValue 4010 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 4011 SDValue Chain = Op.getOperand(0); 4012 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4013 SDValue LHS = Op.getOperand(2); 4014 SDValue RHS = Op.getOperand(3); 4015 SDValue Dest = Op.getOperand(4); 4016 SDLoc dl(Op); 4017 4018 bool LHSSeenZero = false; 4019 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 4020 bool RHSSeenZero = false; 4021 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 4022 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 4023 // If unsafe fp math optimization is enabled and there are no other uses of 4024 // the CMP operands, and the condition code is EQ or NE, we can optimize it 4025 // to an integer comparison. 4026 if (CC == ISD::SETOEQ) 4027 CC = ISD::SETEQ; 4028 else if (CC == ISD::SETUNE) 4029 CC = ISD::SETNE; 4030 4031 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 4032 SDValue ARMcc; 4033 if (LHS.getValueType() == MVT::f32) { 4034 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 4035 bitcastf32Toi32(LHS, DAG), Mask); 4036 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 4037 bitcastf32Toi32(RHS, DAG), Mask); 4038 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4039 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4040 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 4041 Chain, Dest, ARMcc, CCR, Cmp); 4042 } 4043 4044 SDValue LHS1, LHS2; 4045 SDValue RHS1, RHS2; 4046 expandf64Toi32(LHS, DAG, LHS1, LHS2); 4047 expandf64Toi32(RHS, DAG, RHS1, RHS2); 4048 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 4049 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 4050 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4051 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4052 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 4053 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 4054 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 4055 } 4056 4057 return SDValue(); 4058 } 4059 4060 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 4061 SDValue Chain = Op.getOperand(0); 4062 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4063 SDValue LHS = Op.getOperand(2); 4064 SDValue RHS = Op.getOperand(3); 4065 SDValue Dest = Op.getOperand(4); 4066 SDLoc dl(Op); 4067 4068 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 4069 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 4070 dl); 4071 4072 // If softenSetCCOperands only returned one value, we should compare it to 4073 // zero. 
4074 if (!RHS.getNode()) { 4075 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4076 CC = ISD::SETNE; 4077 } 4078 } 4079 4080 if (LHS.getValueType() == MVT::i32) { 4081 SDValue ARMcc; 4082 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4083 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4084 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 4085 Chain, Dest, ARMcc, CCR, Cmp); 4086 } 4087 4088 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 4089 4090 if (getTargetMachine().Options.UnsafeFPMath && 4091 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 4092 CC == ISD::SETNE || CC == ISD::SETUNE)) { 4093 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 4094 return Result; 4095 } 4096 4097 ARMCC::CondCodes CondCode, CondCode2; 4098 FPCCToARMCC(CC, CondCode, CondCode2); 4099 4100 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4101 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 4102 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4103 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 4104 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 4105 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 4106 if (CondCode2 != ARMCC::AL) { 4107 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 4108 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 4109 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 4110 } 4111 return Res; 4112 } 4113 4114 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 4115 SDValue Chain = Op.getOperand(0); 4116 SDValue Table = Op.getOperand(1); 4117 SDValue Index = Op.getOperand(2); 4118 SDLoc dl(Op); 4119 4120 EVT PTy = getPointerTy(DAG.getDataLayout()); 4121 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 4122 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 4123 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 4124 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 4125 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); 4126 if (Subtarget->isThumb2()) { 4127 // Thumb2 uses a two-level jump. That is, it jumps into the jump table 4128 // which does another jump to the destination. This also makes it easier 4129 // to translate it to TBB / TBH later. 4130 // FIXME: This might not work if the function is extremely large. 
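    // Sketch of the resulting two-level jump: BR2_JT branches into the inline
    // jump table at (table base + 4 * index), and each table entry is itself
    // a branch to the final destination; later passes can compress this into
    // TBB / TBH.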
4131 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 4132 Addr, Op.getOperand(2), JTI); 4133 } 4134 if (isPositionIndependent()) { 4135 Addr = 4136 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 4137 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), 4138 false, false, false, 0); 4139 Chain = Addr.getValue(1); 4140 Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); 4141 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 4142 } else { 4143 Addr = 4144 DAG.getLoad(PTy, dl, Chain, Addr, 4145 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), 4146 false, false, false, 0); 4147 Chain = Addr.getValue(1); 4148 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 4149 } 4150 } 4151 4152 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 4153 EVT VT = Op.getValueType(); 4154 SDLoc dl(Op); 4155 4156 if (Op.getValueType().getVectorElementType() == MVT::i32) { 4157 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 4158 return Op; 4159 return DAG.UnrollVectorOp(Op.getNode()); 4160 } 4161 4162 assert(Op.getOperand(0).getValueType() == MVT::v4f32 && 4163 "Invalid type for custom lowering!"); 4164 if (VT != MVT::v4i16) 4165 return DAG.UnrollVectorOp(Op.getNode()); 4166 4167 Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); 4168 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 4169 } 4170 4171 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 4172 EVT VT = Op.getValueType(); 4173 if (VT.isVector()) 4174 return LowerVectorFP_TO_INT(Op, DAG); 4175 if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { 4176 RTLIB::Libcall LC; 4177 if (Op.getOpcode() == ISD::FP_TO_SINT) 4178 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), 4179 Op.getValueType()); 4180 else 4181 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), 4182 Op.getValueType()); 4183 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 4184 /*isSigned*/ false, SDLoc(Op)).first; 4185 } 4186 4187 return Op; 4188 } 4189 4190 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4191 EVT VT = Op.getValueType(); 4192 SDLoc dl(Op); 4193 4194 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 4195 if (VT.getVectorElementType() == MVT::f32) 4196 return Op; 4197 return DAG.UnrollVectorOp(Op.getNode()); 4198 } 4199 4200 assert(Op.getOperand(0).getValueType() == MVT::v4i16 && 4201 "Invalid type for custom lowering!"); 4202 if (VT != MVT::v4f32) 4203 return DAG.UnrollVectorOp(Op.getNode()); 4204 4205 unsigned CastOpc; 4206 unsigned Opc; 4207 switch (Op.getOpcode()) { 4208 default: llvm_unreachable("Invalid opcode!"); 4209 case ISD::SINT_TO_FP: 4210 CastOpc = ISD::SIGN_EXTEND; 4211 Opc = ISD::SINT_TO_FP; 4212 break; 4213 case ISD::UINT_TO_FP: 4214 CastOpc = ISD::ZERO_EXTEND; 4215 Opc = ISD::UINT_TO_FP; 4216 break; 4217 } 4218 4219 Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); 4220 return DAG.getNode(Opc, dl, VT, Op); 4221 } 4222 4223 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 4224 EVT VT = Op.getValueType(); 4225 if (VT.isVector()) 4226 return LowerVectorINT_TO_FP(Op, DAG); 4227 if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { 4228 RTLIB::Libcall LC; 4229 if (Op.getOpcode() == ISD::SINT_TO_FP) 4230 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 4231 Op.getValueType()); 4232 else 4233 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 4234 
Op.getValueType()); 4235 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 4236 /*isSigned*/ false, SDLoc(Op)).first; 4237 } 4238 4239 return Op; 4240 } 4241 4242 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 4243 // Implement fcopysign with a fabs and a conditional fneg. 4244 SDValue Tmp0 = Op.getOperand(0); 4245 SDValue Tmp1 = Op.getOperand(1); 4246 SDLoc dl(Op); 4247 EVT VT = Op.getValueType(); 4248 EVT SrcVT = Tmp1.getValueType(); 4249 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 4250 Tmp0.getOpcode() == ARMISD::VMOVDRR; 4251 bool UseNEON = !InGPR && Subtarget->hasNEON(); 4252 4253 if (UseNEON) { 4254 // Use VBSL to copy the sign bit. 4255 unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); 4256 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 4257 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 4258 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 4259 if (VT == MVT::f64) 4260 Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4261 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 4262 DAG.getConstant(32, dl, MVT::i32)); 4263 else /*if (VT == MVT::f32)*/ 4264 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 4265 if (SrcVT == MVT::f32) { 4266 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 4267 if (VT == MVT::f64) 4268 Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4269 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 4270 DAG.getConstant(32, dl, MVT::i32)); 4271 } else if (VT == MVT::f32) 4272 Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, 4273 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 4274 DAG.getConstant(32, dl, MVT::i32)); 4275 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 4276 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 4277 4278 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), 4279 dl, MVT::i32); 4280 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 4281 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 4282 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 4283 4284 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 4285 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 4286 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 4287 if (VT == MVT::f32) { 4288 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 4289 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 4290 DAG.getConstant(0, dl, MVT::i32)); 4291 } else { 4292 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 4293 } 4294 4295 return Res; 4296 } 4297 4298 // Bitcast operand 1 to i32. 4299 if (SrcVT == MVT::f64) 4300 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 4301 Tmp1).getValue(1); 4302 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 4303 4304 // Or in the signbit with integer operations. 4305 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 4306 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 4307 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 4308 if (VT == MVT::f32) { 4309 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 4310 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 4311 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 4312 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 4313 } 4314 4315 // f64: Or the high part with signbit and then combine two parts. 
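  // In scalar terms the integer f32 path above amounts to (illustrative
  // sketch only, assuming the IEEE-754 single-precision layout; the helper
  // name is hypothetical):
  //
  //   uint32_t copysign_bits(uint32_t mag, uint32_t sgn) {
  //     return (mag & 0x7fffffffu) | (sgn & 0x80000000u);
  //   }
  //
  // For f64 only the high word holds the sign bit, so the same masking is
  // applied to the high half below while the low half is passed through.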
4316 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 4317 Tmp0); 4318 SDValue Lo = Tmp0.getValue(0); 4319 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 4320 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 4321 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 4322 } 4323 4324 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 4325 MachineFunction &MF = DAG.getMachineFunction(); 4326 MachineFrameInfo *MFI = MF.getFrameInfo(); 4327 MFI->setReturnAddressIsTaken(true); 4328 4329 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 4330 return SDValue(); 4331 4332 EVT VT = Op.getValueType(); 4333 SDLoc dl(Op); 4334 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4335 if (Depth) { 4336 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 4337 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 4338 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 4339 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 4340 MachinePointerInfo(), false, false, false, 0); 4341 } 4342 4343 // Return LR, which contains the return address. Mark it an implicit live-in. 4344 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 4345 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 4346 } 4347 4348 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 4349 const ARMBaseRegisterInfo &ARI = 4350 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 4351 MachineFunction &MF = DAG.getMachineFunction(); 4352 MachineFrameInfo *MFI = MF.getFrameInfo(); 4353 MFI->setFrameAddressIsTaken(true); 4354 4355 EVT VT = Op.getValueType(); 4356 SDLoc dl(Op); // FIXME probably not meaningful 4357 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4358 unsigned FrameReg = ARI.getFrameRegister(MF); 4359 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 4360 while (Depth--) 4361 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 4362 MachinePointerInfo(), 4363 false, false, false, 0); 4364 return FrameAddr; 4365 } 4366 4367 // FIXME? Maybe this could be a TableGen attribute on some registers and 4368 // this table could be generated automatically from RegInfo. 4369 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, 4370 SelectionDAG &DAG) const { 4371 unsigned Reg = StringSwitch<unsigned>(RegName) 4372 .Case("sp", ARM::SP) 4373 .Default(0); 4374 if (Reg) 4375 return Reg; 4376 report_fatal_error(Twine("Invalid register name \"" 4377 + StringRef(RegName) + "\".")); 4378 } 4379 4380 // Result is 64 bit value so split into two 32 bit values and return as a 4381 // pair of values. 4382 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 4383 SelectionDAG &DAG) { 4384 SDLoc DL(N); 4385 4386 // This function is only supposed to be called for i64 type destination. 4387 assert(N->getValueType(0) == MVT::i64 4388 && "ExpandREAD_REGISTER called for non-i64 type result."); 4389 4390 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 4391 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 4392 N->getOperand(0), 4393 N->getOperand(1)); 4394 4395 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 4396 Read.getValue(1))); 4397 Results.push_back(Read.getOperand(0)); 4398 } 4399 4400 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 
4401 /// When \p DstVT, the destination type of \p BC, is on the vector
4402 /// register bank and the source of bitcast, \p Op, operates on the same bank,
4403 /// it might be possible to combine them, such that everything stays on the
4404 /// vector register bank.
4405 /// \return The node that would replace \p BC, if the combine
4406 /// is possible.
4407 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
4408                                                 SelectionDAG &DAG) {
4409   SDValue Op = BC->getOperand(0);
4410   EVT DstVT = BC->getValueType(0);
4411
4412   // The only vector instruction that can produce a scalar (remember,
4413   // since the bitcast was about to be turned into VMOVDRR, the source
4414   // type is i64) from a vector is EXTRACT_VECTOR_ELT.
4415   // Moreover, we can do this combine only if there is one use.
4416   // Finally, if the destination type is not a vector, there is not
4417   // much point in forcing everything on the vector bank.
4418   if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
4419       !Op.hasOneUse())
4420     return SDValue();
4421
4422   // If the index is not constant, we will introduce an additional
4423   // multiply that will stick.
4424   // Give up in that case.
4425   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
4426   if (!Index)
4427     return SDValue();
4428   unsigned DstNumElt = DstVT.getVectorNumElements();
4429
4430   // Compute the new index.
4431   const APInt &APIntIndex = Index->getAPIntValue();
4432   APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
4433   NewIndex *= APIntIndex;
4434   // Check if the new constant index fits into i32.
4435   if (NewIndex.getBitWidth() > 32)
4436     return SDValue();
4437
4438   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
4439   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
4440   SDLoc dl(Op);
4441   SDValue ExtractSrc = Op.getOperand(0);
4442   EVT VecVT = EVT::getVectorVT(
4443       *DAG.getContext(), DstVT.getScalarType(),
4444       ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
4445   SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
4446   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
4447                      DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
4448 }
4449
4450 /// ExpandBITCAST - If the target supports VFP, this function is called to
4451 /// expand a bit convert where either the source or destination type is i64 to
4452 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
4453 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
4454 /// vectors), since the legalizer won't know what to do with that.
4455 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
4456   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4457   SDLoc dl(N);
4458   SDValue Op = N->getOperand(0);
4459
4460   // This function is only supposed to be called for i64 types, either as the
4461   // source or destination of the bit convert.
4462   EVT SrcVT = Op.getValueType();
4463   EVT DstVT = N->getValueType(0);
4464   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
4465          "ExpandBITCAST called for non-i64 type");
4466
4467   // Turn i64->f64 into VMOVDRR.
4468   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
4469     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
4470     // if we can combine the bitcast with its source.
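    // For example (hypothetical IR), the combine turns
    //   %e = extractelement <2 x i64> %v, i32 1
    //   %b = bitcast i64 %e to <8 x i8>
    // into an EXTRACT_SUBVECTOR of (bitcast <2 x i64> %v to <16 x i8>) at
    // index 8, so the value never makes a round trip through the GPRs.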
4471 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 4472 return Val; 4473 4474 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 4475 DAG.getConstant(0, dl, MVT::i32)); 4476 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 4477 DAG.getConstant(1, dl, MVT::i32)); 4478 return DAG.getNode(ISD::BITCAST, dl, DstVT, 4479 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 4480 } 4481 4482 // Turn f64->i64 into VMOVRRD. 4483 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 4484 SDValue Cvt; 4485 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 4486 SrcVT.getVectorNumElements() > 1) 4487 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 4488 DAG.getVTList(MVT::i32, MVT::i32), 4489 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 4490 else 4491 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 4492 DAG.getVTList(MVT::i32, MVT::i32), Op); 4493 // Merge the pieces into a single i64 value. 4494 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 4495 } 4496 4497 return SDValue(); 4498 } 4499 4500 /// getZeroVector - Returns a vector of specified type with all zero elements. 4501 /// Zero vectors are used to represent vector negation and in those cases 4502 /// will be implemented with the NEON VNEG instruction. However, VNEG does 4503 /// not support i64 elements, so sometimes the zero vectors will need to be 4504 /// explicitly constructed. Regardless, use a canonical VMOV to create the 4505 /// zero vector. 4506 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 4507 assert(VT.isVector() && "Expected a vector type"); 4508 // The canonical modified immediate encoding of a zero vector is....0! 4509 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 4510 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 4511 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 4512 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 4513 } 4514 4515 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 4516 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 4517 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 4518 SelectionDAG &DAG) const { 4519 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4520 EVT VT = Op.getValueType(); 4521 unsigned VTBits = VT.getSizeInBits(); 4522 SDLoc dl(Op); 4523 SDValue ShOpLo = Op.getOperand(0); 4524 SDValue ShOpHi = Op.getOperand(1); 4525 SDValue ShAmt = Op.getOperand(2); 4526 SDValue ARMcc; 4527 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL; 4528 4529 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 4530 4531 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 4532 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 4533 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 4534 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 4535 DAG.getConstant(VTBits, dl, MVT::i32)); 4536 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 4537 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 4538 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 4539 4540 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4541 SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 4542 ISD::SETGE, ARMcc, DAG, dl); 4543 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 4544 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, 4545 CCR, Cmp); 4546 4547 SDValue Ops[2] = { Lo, Hi }; 4548 return DAG.getMergeValues(Ops, dl); 4549 } 4550 4551 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 4552 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 4553 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 4554 SelectionDAG &DAG) const { 4555 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4556 EVT VT = Op.getValueType(); 4557 unsigned VTBits = VT.getSizeInBits(); 4558 SDLoc dl(Op); 4559 SDValue ShOpLo = Op.getOperand(0); 4560 SDValue ShOpHi = Op.getOperand(1); 4561 SDValue ShAmt = Op.getOperand(2); 4562 SDValue ARMcc; 4563 4564 assert(Op.getOpcode() == ISD::SHL_PARTS); 4565 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 4566 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 4567 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 4568 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 4569 DAG.getConstant(VTBits, dl, MVT::i32)); 4570 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 4571 SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 4572 4573 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 4574 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4575 SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 4576 ISD::SETGE, ARMcc, DAG, dl); 4577 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4578 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc, 4579 CCR, Cmp); 4580 4581 SDValue Ops[2] = { Lo, Hi }; 4582 return DAG.getMergeValues(Ops, dl); 4583 } 4584 4585 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 4586 SelectionDAG &DAG) const { 4587 // The rounding mode is in bits 23:22 of the FPSCR. 4588 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 4589 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 4590 // so that the shift + and get folded into a bitfield extract. 
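  // Worked example of the mapping implemented by the formula above:
  //   FPSCR[23:22] = 0 (round to nearest)        -> (0 + 1) & 3 = 1
  //   FPSCR[23:22] = 1 (round towards +infinity) -> (1 + 1) & 3 = 2
  //   FPSCR[23:22] = 2 (round towards -infinity) -> (2 + 1) & 3 = 3
  //   FPSCR[23:22] = 3 (round towards zero)      -> (3 + 1) & 3 = 0
  // which matches the FLT_ROUNDS encoding (0 = toward zero, 1 = to nearest,
  // 2 = toward +infinity, 3 = toward -infinity).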
4591 SDLoc dl(Op); 4592 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, 4593 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, 4594 MVT::i32)); 4595 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 4596 DAG.getConstant(1U << 22, dl, MVT::i32)); 4597 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 4598 DAG.getConstant(22, dl, MVT::i32)); 4599 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 4600 DAG.getConstant(3, dl, MVT::i32)); 4601 } 4602 4603 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 4604 const ARMSubtarget *ST) { 4605 SDLoc dl(N); 4606 EVT VT = N->getValueType(0); 4607 if (VT.isVector()) { 4608 assert(ST->hasNEON()); 4609 4610 // Compute the least significant set bit: LSB = X & -X 4611 SDValue X = N->getOperand(0); 4612 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 4613 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 4614 4615 EVT ElemTy = VT.getVectorElementType(); 4616 4617 if (ElemTy == MVT::i8) { 4618 // Compute with: cttz(x) = ctpop(lsb - 1) 4619 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4620 DAG.getTargetConstant(1, dl, ElemTy)); 4621 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 4622 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 4623 } 4624 4625 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 4626 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 4627 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 4628 unsigned NumBits = ElemTy.getSizeInBits(); 4629 SDValue WidthMinus1 = 4630 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4631 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 4632 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 4633 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 4634 } 4635 4636 // Compute with: cttz(x) = ctpop(lsb - 1) 4637 4638 // Since we can only compute the number of bits in a byte with vcnt.8, we 4639 // have to gather the result with pairwise addition (vpaddl) for i16, i32, 4640 // and i64. 4641 4642 // Compute LSB - 1. 4643 SDValue Bits; 4644 if (ElemTy == MVT::i64) { 4645 // Load constant 0xffff'ffff'ffff'ffff to register. 4646 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4647 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 4648 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 4649 } else { 4650 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4651 DAG.getTargetConstant(1, dl, ElemTy)); 4652 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 4653 } 4654 4655 // Count #bits with vcnt.8. 4656 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 4657 SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); 4658 SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); 4659 4660 // Gather the #bits with vpaddl (pairwise add.) 4661 EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; 4662 SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, 4663 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4664 Cnt8); 4665 if (ElemTy == MVT::i16) 4666 return Cnt16; 4667 4668 EVT VT32Bit = VT.is64BitVector() ? 
MVT::v2i32 : MVT::v4i32; 4669 SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, 4670 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4671 Cnt16); 4672 if (ElemTy == MVT::i32) 4673 return Cnt32; 4674 4675 assert(ElemTy == MVT::i64); 4676 SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 4677 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4678 Cnt32); 4679 return Cnt64; 4680 } 4681 4682 if (!ST->hasV6T2Ops()) 4683 return SDValue(); 4684 4685 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 4686 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 4687 } 4688 4689 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count 4690 /// for each 16-bit element from operand, repeated. The basic idea is to 4691 /// leverage vcnt to get the 8-bit counts, gather and add the results. 4692 /// 4693 /// Trace for v4i16: 4694 /// input = [v0 v1 v2 v3 ] (vi 16-bit element) 4695 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) 4696 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) 4697 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] 4698 /// [b0 b1 b2 b3 b4 b5 b6 b7] 4699 /// +[b1 b0 b3 b2 b5 b4 b7 b6] 4700 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, 4701 /// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) 4702 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { 4703 EVT VT = N->getValueType(0); 4704 SDLoc DL(N); 4705 4706 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 4707 SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); 4708 SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); 4709 SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); 4710 SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); 4711 return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); 4712 } 4713 4714 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the 4715 /// bit-count for each 16-bit element from the operand. We need slightly 4716 /// different sequencing for v4i16 and v8i16 to stay within NEON's available 4717 /// 64/128-bit registers. 4718 /// 4719 /// Trace for v4i16: 4720 /// input = [v0 v1 v2 v3 ] (vi 16-bit element) 4721 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) 4722 /// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] 4723 /// v4i16:Extracted = [k0 k1 k2 k3 ] 4724 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { 4725 EVT VT = N->getValueType(0); 4726 SDLoc DL(N); 4727 4728 SDValue BitCounts = getCTPOP16BitCounts(N, DAG); 4729 if (VT.is64BitVector()) { 4730 SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); 4731 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, 4732 DAG.getIntPtrConstant(0, DL)); 4733 } else { 4734 SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, 4735 BitCounts, DAG.getIntPtrConstant(0, DL)); 4736 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); 4737 } 4738 } 4739 4740 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the 4741 /// bit-count for each 32-bit element from the operand. The idea here is 4742 /// to split the vector into 16-bit elements, leverage the 16-bit count 4743 /// routine, and then combine the results. 
4744 /// 4745 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): 4746 /// input = [v0 v1 ] (vi: 32-bit elements) 4747 /// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) 4748 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) 4749 /// vrev: N0 = [k1 k0 k3 k2 ] 4750 /// [k0 k1 k2 k3 ] 4751 /// N1 =+[k1 k0 k3 k2 ] 4752 /// [k0 k2 k1 k3 ] 4753 /// N2 =+[k1 k3 k0 k2 ] 4754 /// [k0 k2 k1 k3 ] 4755 /// Extended =+[k1 k3 k0 k2 ] 4756 /// [k0 k2 ] 4757 /// Extracted=+[k1 k3 ] 4758 /// 4759 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { 4760 EVT VT = N->getValueType(0); 4761 SDLoc DL(N); 4762 4763 EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; 4764 4765 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); 4766 SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); 4767 SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); 4768 SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); 4769 SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); 4770 4771 if (VT.is64BitVector()) { 4772 SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); 4773 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, 4774 DAG.getIntPtrConstant(0, DL)); 4775 } else { 4776 SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, 4777 DAG.getIntPtrConstant(0, DL)); 4778 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); 4779 } 4780 } 4781 4782 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 4783 const ARMSubtarget *ST) { 4784 EVT VT = N->getValueType(0); 4785 4786 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 4787 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || 4788 VT == MVT::v4i16 || VT == MVT::v8i16) && 4789 "Unexpected type for custom ctpop lowering"); 4790 4791 if (VT.getVectorElementType() == MVT::i32) 4792 return lowerCTPOP32BitElements(N, DAG); 4793 else 4794 return lowerCTPOP16BitElements(N, DAG); 4795 } 4796 4797 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 4798 const ARMSubtarget *ST) { 4799 EVT VT = N->getValueType(0); 4800 SDLoc dl(N); 4801 4802 if (!VT.isVector()) 4803 return SDValue(); 4804 4805 // Lower vector shifts on NEON to use VSHL. 4806 assert(ST->hasNEON() && "unexpected vector shift"); 4807 4808 // Left shifts translate directly to the vshiftu intrinsic. 4809 if (N->getOpcode() == ISD::SHL) 4810 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 4811 DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, 4812 MVT::i32), 4813 N->getOperand(0), N->getOperand(1)); 4814 4815 assert((N->getOpcode() == ISD::SRA || 4816 N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); 4817 4818 // NEON uses the same intrinsics for both left and right shifts. For 4819 // right shifts, the shift amounts are negative, so negate the vector of 4820 // shift amounts. 4821 EVT ShiftVT = N->getOperand(1).getValueType(); 4822 SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, 4823 getZeroVector(ShiftVT, DAG, dl), 4824 N->getOperand(1)); 4825 Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? 
4826 Intrinsic::arm_neon_vshifts : 4827 Intrinsic::arm_neon_vshiftu); 4828 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 4829 DAG.getConstant(vshiftInt, dl, MVT::i32), 4830 N->getOperand(0), NegatedCount); 4831 } 4832 4833 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 4834 const ARMSubtarget *ST) { 4835 EVT VT = N->getValueType(0); 4836 SDLoc dl(N); 4837 4838 // We can get here for a node like i32 = ISD::SHL i32, i64 4839 if (VT != MVT::i64) 4840 return SDValue(); 4841 4842 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && 4843 "Unknown shift to lower!"); 4844 4845 // We only lower SRA, SRL of 1 here, all others use generic lowering. 4846 if (!isOneConstant(N->getOperand(1))) 4847 return SDValue(); 4848 4849 // If we are in thumb mode, we don't have RRX. 4850 if (ST->isThumb1Only()) return SDValue(); 4851 4852 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 4853 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 4854 DAG.getConstant(0, dl, MVT::i32)); 4855 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 4856 DAG.getConstant(1, dl, MVT::i32)); 4857 4858 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 4859 // captures the result into a carry flag. 4860 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 4861 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 4862 4863 // The low part is an ARMISD::RRX operand, which shifts the carry in. 4864 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 4865 4866 // Merge the pieces into a single i64 value. 4867 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 4868 } 4869 4870 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 4871 SDValue TmpOp0, TmpOp1; 4872 bool Invert = false; 4873 bool Swap = false; 4874 unsigned Opc = 0; 4875 4876 SDValue Op0 = Op.getOperand(0); 4877 SDValue Op1 = Op.getOperand(1); 4878 SDValue CC = Op.getOperand(2); 4879 EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 4880 EVT VT = Op.getValueType(); 4881 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 4882 SDLoc dl(Op); 4883 4884 if (CmpVT.getVectorElementType() == MVT::i64) 4885 // 64-bit comparisons are not legal. We've marked SETCC as non-Custom, 4886 // but it's possible that our operands are 64-bit but our result is 32-bit. 4887 // Bail in this case. 4888 return SDValue(); 4889 4890 if (Op1.getValueType().isFloatingPoint()) { 4891 switch (SetCCOpcode) { 4892 default: llvm_unreachable("Illegal FP comparison"); 4893 case ISD::SETUNE: 4894 case ISD::SETNE: Invert = true; // Fallthrough 4895 case ISD::SETOEQ: 4896 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 4897 case ISD::SETOLT: 4898 case ISD::SETLT: Swap = true; // Fallthrough 4899 case ISD::SETOGT: 4900 case ISD::SETGT: Opc = ARMISD::VCGT; break; 4901 case ISD::SETOLE: 4902 case ISD::SETLE: Swap = true; // Fallthrough 4903 case ISD::SETOGE: 4904 case ISD::SETGE: Opc = ARMISD::VCGE; break; 4905 case ISD::SETUGE: Swap = true; // Fallthrough 4906 case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; 4907 case ISD::SETUGT: Swap = true; // Fallthrough 4908 case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; 4909 case ISD::SETUEQ: Invert = true; // Fallthrough 4910 case ISD::SETONE: 4911 // Expand this to (OLT | OGT). 
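      // Both VCGT results are all-zero lanes when either input is a NaN, so
      // the OR below is false for unordered operands, which is exactly ONE
      // (ordered and not equal).  For SETUEQ the Invert flag set above then
      // yields NOT(OLT | OGT), i.e. unordered-or-equal.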
4912 TmpOp0 = Op0; 4913 TmpOp1 = Op1; 4914 Opc = ISD::OR; 4915 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 4916 Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); 4917 break; 4918 case ISD::SETUO: Invert = true; // Fallthrough 4919 case ISD::SETO: 4920 // Expand this to (OLT | OGE). 4921 TmpOp0 = Op0; 4922 TmpOp1 = Op1; 4923 Opc = ISD::OR; 4924 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 4925 Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); 4926 break; 4927 } 4928 } else { 4929 // Integer comparisons. 4930 switch (SetCCOpcode) { 4931 default: llvm_unreachable("Illegal integer comparison"); 4932 case ISD::SETNE: Invert = true; 4933 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 4934 case ISD::SETLT: Swap = true; 4935 case ISD::SETGT: Opc = ARMISD::VCGT; break; 4936 case ISD::SETLE: Swap = true; 4937 case ISD::SETGE: Opc = ARMISD::VCGE; break; 4938 case ISD::SETULT: Swap = true; 4939 case ISD::SETUGT: Opc = ARMISD::VCGTU; break; 4940 case ISD::SETULE: Swap = true; 4941 case ISD::SETUGE: Opc = ARMISD::VCGEU; break; 4942 } 4943 4944 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 4945 if (Opc == ARMISD::VCEQ) { 4946 4947 SDValue AndOp; 4948 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 4949 AndOp = Op0; 4950 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 4951 AndOp = Op1; 4952 4953 // Ignore bitconvert. 4954 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 4955 AndOp = AndOp.getOperand(0); 4956 4957 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 4958 Opc = ARMISD::VTST; 4959 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 4960 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 4961 Invert = !Invert; 4962 } 4963 } 4964 } 4965 4966 if (Swap) 4967 std::swap(Op0, Op1); 4968 4969 // If one of the operands is a constant vector zero, attempt to fold the 4970 // comparison to a specialized compare-against-zero form. 
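  // For example, an SGE compare of x against an all-zeros vector becomes
  // VCGEZ x, while comparing zero SGE x (i.e. x <= 0) is rewritten to
  // VCLEZ x by the opcode adjustment below.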
4971 SDValue SingleOp; 4972 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 4973 SingleOp = Op0; 4974 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 4975 if (Opc == ARMISD::VCGE) 4976 Opc = ARMISD::VCLEZ; 4977 else if (Opc == ARMISD::VCGT) 4978 Opc = ARMISD::VCLTZ; 4979 SingleOp = Op1; 4980 } 4981 4982 SDValue Result; 4983 if (SingleOp.getNode()) { 4984 switch (Opc) { 4985 case ARMISD::VCEQ: 4986 Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; 4987 case ARMISD::VCGE: 4988 Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; 4989 case ARMISD::VCLEZ: 4990 Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; 4991 case ARMISD::VCGT: 4992 Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; 4993 case ARMISD::VCLTZ: 4994 Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; 4995 default: 4996 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 4997 } 4998 } else { 4999 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5000 } 5001 5002 Result = DAG.getSExtOrTrunc(Result, dl, VT); 5003 5004 if (Invert) 5005 Result = DAG.getNOT(dl, Result, VT); 5006 5007 return Result; 5008 } 5009 5010 static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) { 5011 SDValue LHS = Op.getOperand(0); 5012 SDValue RHS = Op.getOperand(1); 5013 SDValue Carry = Op.getOperand(2); 5014 SDValue Cond = Op.getOperand(3); 5015 SDLoc DL(Op); 5016 5017 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); 5018 5019 assert(Carry.getOpcode() != ISD::CARRY_FALSE); 5020 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 5021 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 5022 5023 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 5024 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 5025 SDValue ARMcc = DAG.getConstant( 5026 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 5027 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5028 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 5029 Cmp.getValue(1), SDValue()); 5030 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 5031 CCR, Chain.getValue(1)); 5032 } 5033 5034 /// isNEONModifiedImm - Check if the specified splat value corresponds to a 5035 /// valid vector constant for a NEON instruction with a "modified immediate" 5036 /// operand (e.g., VMOV). If so, return the encoded value. 5037 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 5038 unsigned SplatBitSize, SelectionDAG &DAG, 5039 const SDLoc &dl, EVT &VT, bool is128Bits, 5040 NEONModImmType type) { 5041 unsigned OpCmode, Imm; 5042 5043 // SplatBitSize is set to the smallest size that splats the vector, so a 5044 // zero vector will always have SplatBitSize == 8. However, NEON modified 5045 // immediate instructions others than VMOV do not support the 8-bit encoding 5046 // of a zero vector, and the default encoding of zero is supposed to be the 5047 // 32-bit version. 5048 if (SplatBits == 0) 5049 SplatBitSize = 32; 5050 5051 switch (SplatBitSize) { 5052 case 8: 5053 if (type != VMOVModImm) 5054 return SDValue(); 5055 // Any 1-byte value is OK. Op=0, Cmode=1110. 5056 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 5057 OpCmode = 0xe; 5058 Imm = SplatBits; 5059 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 5060 break; 5061 5062 case 16: 5063 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 5064 VT = is128Bits ? 
MVT::v8i16 : MVT::v4i16; 5065 if ((SplatBits & ~0xff) == 0) { 5066 // Value = 0x00nn: Op=x, Cmode=100x. 5067 OpCmode = 0x8; 5068 Imm = SplatBits; 5069 break; 5070 } 5071 if ((SplatBits & ~0xff00) == 0) { 5072 // Value = 0xnn00: Op=x, Cmode=101x. 5073 OpCmode = 0xa; 5074 Imm = SplatBits >> 8; 5075 break; 5076 } 5077 return SDValue(); 5078 5079 case 32: 5080 // NEON's 32-bit VMOV supports splat values where: 5081 // * only one byte is nonzero, or 5082 // * the least significant byte is 0xff and the second byte is nonzero, or 5083 // * the least significant 2 bytes are 0xff and the third is nonzero. 5084 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 5085 if ((SplatBits & ~0xff) == 0) { 5086 // Value = 0x000000nn: Op=x, Cmode=000x. 5087 OpCmode = 0; 5088 Imm = SplatBits; 5089 break; 5090 } 5091 if ((SplatBits & ~0xff00) == 0) { 5092 // Value = 0x0000nn00: Op=x, Cmode=001x. 5093 OpCmode = 0x2; 5094 Imm = SplatBits >> 8; 5095 break; 5096 } 5097 if ((SplatBits & ~0xff0000) == 0) { 5098 // Value = 0x00nn0000: Op=x, Cmode=010x. 5099 OpCmode = 0x4; 5100 Imm = SplatBits >> 16; 5101 break; 5102 } 5103 if ((SplatBits & ~0xff000000) == 0) { 5104 // Value = 0xnn000000: Op=x, Cmode=011x. 5105 OpCmode = 0x6; 5106 Imm = SplatBits >> 24; 5107 break; 5108 } 5109 5110 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 5111 if (type == OtherModImm) return SDValue(); 5112 5113 if ((SplatBits & ~0xffff) == 0 && 5114 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 5115 // Value = 0x0000nnff: Op=x, Cmode=1100. 5116 OpCmode = 0xc; 5117 Imm = SplatBits >> 8; 5118 break; 5119 } 5120 5121 if ((SplatBits & ~0xffffff) == 0 && 5122 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 5123 // Value = 0x00nnffff: Op=x, Cmode=1101. 5124 OpCmode = 0xd; 5125 Imm = SplatBits >> 16; 5126 break; 5127 } 5128 5129 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 5130 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 5131 // VMOV.I32. A (very) minor optimization would be to replicate the value 5132 // and fall through here to test for a valid 64-bit splat. But, then the 5133 // caller would also need to check and handle the change in size. 5134 return SDValue(); 5135 5136 case 64: { 5137 if (type != VMOVModImm) 5138 return SDValue(); 5139 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 5140 uint64_t BitMask = 0xff; 5141 uint64_t Val = 0; 5142 unsigned ImmMask = 1; 5143 Imm = 0; 5144 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 5145 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 5146 Val |= BitMask; 5147 Imm |= ImmMask; 5148 } else if ((SplatBits & BitMask) != 0) { 5149 return SDValue(); 5150 } 5151 BitMask <<= 8; 5152 ImmMask <<= 1; 5153 } 5154 5155 if (DAG.getDataLayout().isBigEndian()) 5156 // swap higher and lower 32 bit word 5157 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 5158 5159 // Op=1, Cmode=1110. 5160 OpCmode = 0x1e; 5161 VT = is128Bits ? 
MVT::v2i64 : MVT::v1i64; 5162 break; 5163 } 5164 5165 default: 5166 llvm_unreachable("unexpected size for isNEONModifiedImm"); 5167 } 5168 5169 unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); 5170 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 5171 } 5172 5173 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 5174 const ARMSubtarget *ST) const { 5175 if (!ST->hasVFP3()) 5176 return SDValue(); 5177 5178 bool IsDouble = Op.getValueType() == MVT::f64; 5179 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 5180 5181 // Use the default (constant pool) lowering for double constants when we have 5182 // an SP-only FPU 5183 if (IsDouble && Subtarget->isFPOnlySP()) 5184 return SDValue(); 5185 5186 // Try splatting with a VMOV.f32... 5187 const APFloat &FPVal = CFP->getValueAPF(); 5188 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 5189 5190 if (ImmVal != -1) { 5191 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 5192 // We have code in place to select a valid ConstantFP already, no need to 5193 // do any mangling. 5194 return Op; 5195 } 5196 5197 // It's a float and we are trying to use NEON operations where 5198 // possible. Lower it to a splat followed by an extract. 5199 SDLoc DL(Op); 5200 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 5201 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 5202 NewVal); 5203 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 5204 DAG.getConstant(0, DL, MVT::i32)); 5205 } 5206 5207 // The rest of our options are NEON only, make sure that's allowed before 5208 // proceeding.. 5209 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 5210 return SDValue(); 5211 5212 EVT VMovVT; 5213 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 5214 5215 // It wouldn't really be worth bothering for doubles except for one very 5216 // important value, which does happen to match: 0.0. So make sure we don't do 5217 // anything stupid. 5218 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 5219 return SDValue(); 5220 5221 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 5222 SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 5223 VMovVT, false, VMOVModImm); 5224 if (NewVal != SDValue()) { 5225 SDLoc DL(Op); 5226 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 5227 NewVal); 5228 if (IsDouble) 5229 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 5230 5231 // It's a float: cast and extract a vector element. 5232 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 5233 VecConstant); 5234 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 5235 DAG.getConstant(0, DL, MVT::i32)); 5236 } 5237 5238 // Finally, try a VMVN.i32 5239 NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 5240 false, VMVNModImm); 5241 if (NewVal != SDValue()) { 5242 SDLoc DL(Op); 5243 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 5244 5245 if (IsDouble) 5246 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 5247 5248 // It's a float: cast and extract a vector element. 
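    // As in the VMOV.i32 case above, lane 0 of a v2f32 value lives in the low
    // S sub-register of the containing D register, so this extract is expected
    // to be folded away by register allocation rather than emit a real move.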
5249 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 5250 VecConstant); 5251 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 5252 DAG.getConstant(0, DL, MVT::i32)); 5253 } 5254 5255 return SDValue(); 5256 } 5257 5258 // check if an VEXT instruction can handle the shuffle mask when the 5259 // vector sources of the shuffle are the same. 5260 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 5261 unsigned NumElts = VT.getVectorNumElements(); 5262 5263 // Assume that the first shuffle index is not UNDEF. Fail if it is. 5264 if (M[0] < 0) 5265 return false; 5266 5267 Imm = M[0]; 5268 5269 // If this is a VEXT shuffle, the immediate value is the index of the first 5270 // element. The other shuffle indices must be the successive elements after 5271 // the first one. 5272 unsigned ExpectedElt = Imm; 5273 for (unsigned i = 1; i < NumElts; ++i) { 5274 // Increment the expected index. If it wraps around, just follow it 5275 // back to index zero and keep going. 5276 ++ExpectedElt; 5277 if (ExpectedElt == NumElts) 5278 ExpectedElt = 0; 5279 5280 if (M[i] < 0) continue; // ignore UNDEF indices 5281 if (ExpectedElt != static_cast<unsigned>(M[i])) 5282 return false; 5283 } 5284 5285 return true; 5286 } 5287 5288 5289 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 5290 bool &ReverseVEXT, unsigned &Imm) { 5291 unsigned NumElts = VT.getVectorNumElements(); 5292 ReverseVEXT = false; 5293 5294 // Assume that the first shuffle index is not UNDEF. Fail if it is. 5295 if (M[0] < 0) 5296 return false; 5297 5298 Imm = M[0]; 5299 5300 // If this is a VEXT shuffle, the immediate value is the index of the first 5301 // element. The other shuffle indices must be the successive elements after 5302 // the first one. 5303 unsigned ExpectedElt = Imm; 5304 for (unsigned i = 1; i < NumElts; ++i) { 5305 // Increment the expected index. If it wraps around, it may still be 5306 // a VEXT but the source vectors must be swapped. 5307 ExpectedElt += 1; 5308 if (ExpectedElt == NumElts * 2) { 5309 ExpectedElt = 0; 5310 ReverseVEXT = true; 5311 } 5312 5313 if (M[i] < 0) continue; // ignore UNDEF indices 5314 if (ExpectedElt != static_cast<unsigned>(M[i])) 5315 return false; 5316 } 5317 5318 // Adjust the index value if the source operands will be swapped. 5319 if (ReverseVEXT) 5320 Imm -= NumElts; 5321 5322 return true; 5323 } 5324 5325 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 5326 /// instruction with the specified blocksize. (The order of the elements 5327 /// within each block of the vector is reversed.) 5328 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 5329 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 5330 "Only possible block sizes for VREV are: 16, 32, 64"); 5331 5332 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5333 if (EltSz == 64) 5334 return false; 5335 5336 unsigned NumElts = VT.getVectorNumElements(); 5337 unsigned BlockElts = M[0] + 1; 5338 // If the first shuffle index is UNDEF, be optimistic. 5339 if (M[0] < 0) 5340 BlockElts = BlockSize / EltSz; 5341 5342 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 5343 return false; 5344 5345 for (unsigned i = 0; i < NumElts; ++i) { 5346 if (M[i] < 0) continue; // ignore UNDEF indices 5347 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 5348 return false; 5349 } 5350 5351 return true; 5352 } 5353 5354 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 5355 // We can handle <8 x i8> vector shuffles. 
If the index in the mask is out of 5356 // range, then 0 is placed into the resulting vector. So pretty much any mask 5357 // of 8 elements can work here. 5358 return VT == MVT::v8i8 && M.size() == 8; 5359 } 5360 5361 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 5362 // checking that pairs of elements in the shuffle mask represent the same index 5363 // in each vector, incrementing the expected index by 2 at each step. 5364 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 5365 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 5366 // v2={e,f,g,h} 5367 // WhichResult gives the offset for each element in the mask based on which 5368 // of the two results it belongs to. 5369 // 5370 // The transpose can be represented either as: 5371 // result1 = shufflevector v1, v2, result1_shuffle_mask 5372 // result2 = shufflevector v1, v2, result2_shuffle_mask 5373 // where v1/v2 and the shuffle masks have the same number of elements 5374 // (here WhichResult (see below) indicates which result is being checked) 5375 // 5376 // or as: 5377 // results = shufflevector v1, v2, shuffle_mask 5378 // where both results are returned in one vector and the shuffle mask has twice 5379 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 5380 // want to check the low half and high half of the shuffle mask as if it were 5381 // the other case 5382 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5383 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5384 if (EltSz == 64) 5385 return false; 5386 5387 unsigned NumElts = VT.getVectorNumElements(); 5388 if (M.size() != NumElts && M.size() != NumElts*2) 5389 return false; 5390 5391 // If the mask is twice as long as the input vector then we need to check the 5392 // upper and lower parts of the mask with a matching value for WhichResult 5393 // FIXME: A mask with only even values will be rejected in case the first 5394 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only 5395 // M[0] is used to determine WhichResult 5396 for (unsigned i = 0; i < M.size(); i += NumElts) { 5397 if (M.size() == NumElts * 2) 5398 WhichResult = i / NumElts; 5399 else 5400 WhichResult = M[i] == 0 ? 0 : 1; 5401 for (unsigned j = 0; j < NumElts; j += 2) { 5402 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 5403 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 5404 return false; 5405 } 5406 } 5407 5408 if (M.size() == NumElts*2) 5409 WhichResult = 0; 5410 5411 return true; 5412 } 5413 5414 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 5415 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5416 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 5417 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5418 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5419 if (EltSz == 64) 5420 return false; 5421 5422 unsigned NumElts = VT.getVectorNumElements(); 5423 if (M.size() != NumElts && M.size() != NumElts*2) 5424 return false; 5425 5426 for (unsigned i = 0; i < M.size(); i += NumElts) { 5427 if (M.size() == NumElts * 2) 5428 WhichResult = i / NumElts; 5429 else 5430 WhichResult = M[i] == 0 ? 
0 : 1; 5431 for (unsigned j = 0; j < NumElts; j += 2) { 5432 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 5433 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 5434 return false; 5435 } 5436 } 5437 5438 if (M.size() == NumElts*2) 5439 WhichResult = 0; 5440 5441 return true; 5442 } 5443 5444 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 5445 // that the mask elements are either all even and in steps of size 2 or all odd 5446 // and in steps of size 2. 5447 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 5448 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 5449 // v2={e,f,g,h} 5450 // Requires similar checks to that of isVTRNMask with 5451 // respect the how results are returned. 5452 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5453 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5454 if (EltSz == 64) 5455 return false; 5456 5457 unsigned NumElts = VT.getVectorNumElements(); 5458 if (M.size() != NumElts && M.size() != NumElts*2) 5459 return false; 5460 5461 for (unsigned i = 0; i < M.size(); i += NumElts) { 5462 WhichResult = M[i] == 0 ? 0 : 1; 5463 for (unsigned j = 0; j < NumElts; ++j) { 5464 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 5465 return false; 5466 } 5467 } 5468 5469 if (M.size() == NumElts*2) 5470 WhichResult = 0; 5471 5472 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5473 if (VT.is64BitVector() && EltSz == 32) 5474 return false; 5475 5476 return true; 5477 } 5478 5479 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 5480 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5481 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 5482 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5483 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5484 if (EltSz == 64) 5485 return false; 5486 5487 unsigned NumElts = VT.getVectorNumElements(); 5488 if (M.size() != NumElts && M.size() != NumElts*2) 5489 return false; 5490 5491 unsigned Half = NumElts / 2; 5492 for (unsigned i = 0; i < M.size(); i += NumElts) { 5493 WhichResult = M[i] == 0 ? 0 : 1; 5494 for (unsigned j = 0; j < NumElts; j += Half) { 5495 unsigned Idx = WhichResult; 5496 for (unsigned k = 0; k < Half; ++k) { 5497 int MIdx = M[i + j + k]; 5498 if (MIdx >= 0 && (unsigned) MIdx != Idx) 5499 return false; 5500 Idx += 2; 5501 } 5502 } 5503 } 5504 5505 if (M.size() == NumElts*2) 5506 WhichResult = 0; 5507 5508 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5509 if (VT.is64BitVector() && EltSz == 32) 5510 return false; 5511 5512 return true; 5513 } 5514 5515 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 5516 // that pairs of elements of the shufflemask represent the same index in each 5517 // vector incrementing sequentially through the vectors. 5518 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 5519 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 5520 // v2={e,f,g,h} 5521 // Requires similar checks to that of isVTRNMask with respect the how results 5522 // are returned. 
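// For the v4i32 example above the two results and their masks would be:
//   result0 = {a,e,b,f}   mask [0, 4, 1, 5]
//   result1 = {c,g,d,h}   mask [2, 6, 3, 7]
// and the combined two-result form uses the concatenated mask
//   [0, 4, 1, 5, 2, 6, 3, 7], with WhichResult forced back to 0.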
5523 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5524 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5525 if (EltSz == 64) 5526 return false; 5527 5528 unsigned NumElts = VT.getVectorNumElements(); 5529 if (M.size() != NumElts && M.size() != NumElts*2) 5530 return false; 5531 5532 for (unsigned i = 0; i < M.size(); i += NumElts) { 5533 WhichResult = M[i] == 0 ? 0 : 1; 5534 unsigned Idx = WhichResult * NumElts / 2; 5535 for (unsigned j = 0; j < NumElts; j += 2) { 5536 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 5537 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 5538 return false; 5539 Idx += 1; 5540 } 5541 } 5542 5543 if (M.size() == NumElts*2) 5544 WhichResult = 0; 5545 5546 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5547 if (VT.is64BitVector() && EltSz == 32) 5548 return false; 5549 5550 return true; 5551 } 5552 5553 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 5554 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5555 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 5556 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5557 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5558 if (EltSz == 64) 5559 return false; 5560 5561 unsigned NumElts = VT.getVectorNumElements(); 5562 if (M.size() != NumElts && M.size() != NumElts*2) 5563 return false; 5564 5565 for (unsigned i = 0; i < M.size(); i += NumElts) { 5566 WhichResult = M[i] == 0 ? 0 : 1; 5567 unsigned Idx = WhichResult * NumElts / 2; 5568 for (unsigned j = 0; j < NumElts; j += 2) { 5569 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 5570 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 5571 return false; 5572 Idx += 1; 5573 } 5574 } 5575 5576 if (M.size() == NumElts*2) 5577 WhichResult = 0; 5578 5579 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5580 if (VT.is64BitVector() && EltSz == 32) 5581 return false; 5582 5583 return true; 5584 } 5585 5586 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 5587 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 5588 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 5589 unsigned &WhichResult, 5590 bool &isV_UNDEF) { 5591 isV_UNDEF = false; 5592 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 5593 return ARMISD::VTRN; 5594 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 5595 return ARMISD::VUZP; 5596 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 5597 return ARMISD::VZIP; 5598 5599 isV_UNDEF = true; 5600 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5601 return ARMISD::VTRN; 5602 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5603 return ARMISD::VUZP; 5604 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5605 return ARMISD::VZIP; 5606 5607 return 0; 5608 } 5609 5610 /// \return true if this is a reverse operation on an vector. 5611 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 5612 unsigned NumElts = VT.getVectorNumElements(); 5613 // Make sure the mask has the right size. 5614 if (NumElts != M.size()) 5615 return false; 5616 5617 // Look for <15, ..., 3, -1, 1, 0>. 5618 for (unsigned i = 0; i != NumElts; ++i) 5619 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 5620 return false; 5621 5622 return true; 5623 } 5624 5625 // If N is an integer constant that can be moved into a register in one 5626 // instruction, return an SDValue of such a constant (will become a MOV 5627 // instruction). 
Otherwise return null. 5628 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 5629 const ARMSubtarget *ST, const SDLoc &dl) { 5630 uint64_t Val; 5631 if (!isa<ConstantSDNode>(N)) 5632 return SDValue(); 5633 Val = cast<ConstantSDNode>(N)->getZExtValue(); 5634 5635 if (ST->isThumb1Only()) { 5636 if (Val <= 255 || ~Val <= 255) 5637 return DAG.getConstant(Val, dl, MVT::i32); 5638 } else { 5639 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 5640 return DAG.getConstant(Val, dl, MVT::i32); 5641 } 5642 return SDValue(); 5643 } 5644 5645 // If this is a case we can't handle, return null and let the default 5646 // expansion code take care of it. 5647 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 5648 const ARMSubtarget *ST) const { 5649 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 5650 SDLoc dl(Op); 5651 EVT VT = Op.getValueType(); 5652 5653 APInt SplatBits, SplatUndef; 5654 unsigned SplatBitSize; 5655 bool HasAnyUndefs; 5656 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 5657 if (SplatBitSize <= 64) { 5658 // Check if an immediate VMOV works. 5659 EVT VmovVT; 5660 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 5661 SplatUndef.getZExtValue(), SplatBitSize, 5662 DAG, dl, VmovVT, VT.is128BitVector(), 5663 VMOVModImm); 5664 if (Val.getNode()) { 5665 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 5666 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5667 } 5668 5669 // Try an immediate VMVN. 5670 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 5671 Val = isNEONModifiedImm(NegatedImm, 5672 SplatUndef.getZExtValue(), SplatBitSize, 5673 DAG, dl, VmovVT, VT.is128BitVector(), 5674 VMVNModImm); 5675 if (Val.getNode()) { 5676 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 5677 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5678 } 5679 5680 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 5681 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 5682 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 5683 if (ImmVal != -1) { 5684 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 5685 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 5686 } 5687 } 5688 } 5689 } 5690 5691 // Scan through the operands to see if only one value is used. 5692 // 5693 // As an optimisation, even if more than one value is used it may be more 5694 // profitable to splat with one value then change some lanes. 5695 // 5696 // Heuristically we decide to do this if the vector has a "dominant" value, 5697 // defined as splatted to more than half of the lanes. 5698 unsigned NumElts = VT.getVectorNumElements(); 5699 bool isOnlyLowElement = true; 5700 bool usesOnlyOneValue = true; 5701 bool hasDominantValue = false; 5702 bool isConstant = true; 5703 5704 // Map of the number of times a particular SDValue appears in the 5705 // element list. 5706 DenseMap<SDValue, unsigned> ValueCounts; 5707 SDValue Value; 5708 for (unsigned i = 0; i < NumElts; ++i) { 5709 SDValue V = Op.getOperand(i); 5710 if (V.isUndef()) 5711 continue; 5712 if (i > 0) 5713 isOnlyLowElement = false; 5714 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 5715 isConstant = false; 5716 5717 ValueCounts.insert(std::make_pair(V, 0)); 5718 unsigned &Count = ValueCounts[V]; 5719 5720 // Is this value dominant? 
(takes up more than half of the lanes) 5721 if (++Count > (NumElts / 2)) { 5722 hasDominantValue = true; 5723 Value = V; 5724 } 5725 } 5726 if (ValueCounts.size() != 1) 5727 usesOnlyOneValue = false; 5728 if (!Value.getNode() && ValueCounts.size() > 0) 5729 Value = ValueCounts.begin()->first; 5730 5731 if (ValueCounts.size() == 0) 5732 return DAG.getUNDEF(VT); 5733 5734 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 5735 // Keep going if we are hitting this case. 5736 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 5737 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 5738 5739 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 5740 5741 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 5742 // i32 and try again. 5743 if (hasDominantValue && EltSize <= 32) { 5744 if (!isConstant) { 5745 SDValue N; 5746 5747 // If we are VDUPing a value that comes directly from a vector, that will 5748 // cause an unnecessary move to and from a GPR, where instead we could 5749 // just use VDUPLANE. We can only do this if the lane being extracted 5750 // is at a constant index, as the VDUP from lane instructions only have 5751 // constant-index forms. 5752 ConstantSDNode *constIndex; 5753 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 5754 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 5755 // We need to create a new undef vector to use for the VDUPLANE if the 5756 // size of the vector from which we get the value is different than the 5757 // size of the vector that we need to create. We will insert the element 5758 // such that the register coalescer will remove unnecessary copies. 5759 if (VT != Value->getOperand(0).getValueType()) { 5760 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 5761 VT.getVectorNumElements(); 5762 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 5763 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 5764 Value, DAG.getConstant(index, dl, MVT::i32)), 5765 DAG.getConstant(index, dl, MVT::i32)); 5766 } else 5767 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 5768 Value->getOperand(0), Value->getOperand(1)); 5769 } else 5770 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 5771 5772 if (!usesOnlyOneValue) { 5773 // The dominant value was splatted as 'N', but we now have to insert 5774 // all differing elements. 
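// For example, for (build_vector x, x, y, x) this emits (VDUP x) followed by
// a single INSERT_VECTOR_ELT placing y into lane 2.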
5775 for (unsigned I = 0; I < NumElts; ++I) { 5776 if (Op.getOperand(I) == Value) 5777 continue; 5778 SmallVector<SDValue, 3> Ops; 5779 Ops.push_back(N); 5780 Ops.push_back(Op.getOperand(I)); 5781 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 5782 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 5783 } 5784 } 5785 return N; 5786 } 5787 if (VT.getVectorElementType().isFloatingPoint()) { 5788 SmallVector<SDValue, 8> Ops; 5789 for (unsigned i = 0; i < NumElts; ++i) 5790 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, 5791 Op.getOperand(i))); 5792 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 5793 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 5794 Val = LowerBUILD_VECTOR(Val, DAG, ST); 5795 if (Val.getNode()) 5796 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 5797 } 5798 if (usesOnlyOneValue) { 5799 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 5800 if (isConstant && Val.getNode()) 5801 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 5802 } 5803 } 5804 5805 // If all elements are constants and the case above didn't get hit, fall back 5806 // to the default expansion, which will generate a load from the constant 5807 // pool. 5808 if (isConstant) 5809 return SDValue(); 5810 5811 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 5812 if (NumElts >= 4) { 5813 SDValue shuffle = ReconstructShuffle(Op, DAG); 5814 if (shuffle != SDValue()) 5815 return shuffle; 5816 } 5817 5818 // Vectors with 32- or 64-bit elements can be built by directly assigning 5819 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 5820 // will be legalized. 5821 if (EltSize >= 32) { 5822 // Do the expansion with floating-point types, since that is what the VFP 5823 // registers are defined to use, and since i64 is not legal. 5824 EVT EltVT = EVT::getFloatingPointVT(EltSize); 5825 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 5826 SmallVector<SDValue, 8> Ops; 5827 for (unsigned i = 0; i < NumElts; ++i) 5828 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 5829 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 5830 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 5831 } 5832 5833 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 5834 // know the default expansion would otherwise fall back on something even 5835 // worse. For a vector with one or two non-undef values, that's 5836 // scalar_to_vector for the elements followed by a shuffle (provided the 5837 // shuffle is valid for the target) and materialization element by element 5838 // on the stack followed by a load for everything else. 5839 if (!isConstant && !usesOnlyOneValue) { 5840 SDValue Vec = DAG.getUNDEF(VT); 5841 for (unsigned i = 0 ; i < NumElts; ++i) { 5842 SDValue V = Op.getOperand(i); 5843 if (V.isUndef()) 5844 continue; 5845 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 5846 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 5847 } 5848 return Vec; 5849 } 5850 5851 return SDValue(); 5852 } 5853 5854 // Gather data to see if the operation can be modelled as a 5855 // shuffle in combination with VEXTs. 
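// For example, (build_vector (extract_elt A, 1), (extract_elt A, 2),
// (extract_elt A, 3), (extract_elt B, 0)) can be rebuilt as a shuffle of A and
// B with mask <1, 2, 3, 4>, i.e. a single VEXT of the two sources.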
5856 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 5857 SelectionDAG &DAG) const { 5858 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 5859 SDLoc dl(Op); 5860 EVT VT = Op.getValueType(); 5861 unsigned NumElts = VT.getVectorNumElements(); 5862 5863 struct ShuffleSourceInfo { 5864 SDValue Vec; 5865 unsigned MinElt; 5866 unsigned MaxElt; 5867 5868 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 5869 // be compatible with the shuffle we intend to construct. As a result 5870 // ShuffleVec will be some sliding window into the original Vec. 5871 SDValue ShuffleVec; 5872 5873 // Code should guarantee that element i in Vec starts at element "WindowBase 5874 // + i * WindowScale in ShuffleVec". 5875 int WindowBase; 5876 int WindowScale; 5877 5878 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 5879 ShuffleSourceInfo(SDValue Vec) 5880 : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), 5881 WindowScale(1) {} 5882 }; 5883 5884 // First gather all vectors used as an immediate source for this BUILD_VECTOR 5885 // node. 5886 SmallVector<ShuffleSourceInfo, 2> Sources; 5887 for (unsigned i = 0; i < NumElts; ++i) { 5888 SDValue V = Op.getOperand(i); 5889 if (V.isUndef()) 5890 continue; 5891 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 5892 // A shuffle can only come from building a vector from various 5893 // elements of other vectors. 5894 return SDValue(); 5895 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 5896 // Furthermore, shuffles require a constant mask, whereas extractelts 5897 // accept variable indices. 5898 return SDValue(); 5899 } 5900 5901 // Add this element source to the list if it's not already there. 5902 SDValue SourceVec = V.getOperand(0); 5903 auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); 5904 if (Source == Sources.end()) 5905 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 5906 5907 // Update the minimum and maximum lane number seen. 5908 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 5909 Source->MinElt = std::min(Source->MinElt, EltNo); 5910 Source->MaxElt = std::max(Source->MaxElt, EltNo); 5911 } 5912 5913 // Currently only do something sane when at most two source vectors 5914 // are involved. 5915 if (Sources.size() > 2) 5916 return SDValue(); 5917 5918 // Find out the smallest element size among result and two sources, and use 5919 // it as element size to build the shuffle_vector. 5920 EVT SmallestEltTy = VT.getVectorElementType(); 5921 for (auto &Source : Sources) { 5922 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 5923 if (SrcEltTy.bitsLT(SmallestEltTy)) 5924 SmallestEltTy = SrcEltTy; 5925 } 5926 unsigned ResMultiplier = 5927 VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); 5928 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 5929 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 5930 5931 // If the source vector is too wide or too narrow, we may nevertheless be able 5932 // to construct a compatible shuffle either by concatenating it with UNDEF or 5933 // extracting a suitable range of elements. 5934 for (auto &Src : Sources) { 5935 EVT SrcVT = Src.ShuffleVec.getValueType(); 5936 5937 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 5938 continue; 5939 5940 // This stage of the search produces a source with the same element type as 5941 // the original, but with a total width matching the BUILD_VECTOR output. 
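// For example, for a v4i16 BUILD_VECTOR, a v2i16 source is padded out to
// v4i16 with UNDEF, while a v8i16 source has a suitable 64-bit window
// extracted from it (either half directly, or a VEXT spanning both halves).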
5942 EVT EltVT = SrcVT.getVectorElementType(); 5943 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 5944 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 5945 5946 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 5947 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 5948 return SDValue(); 5949 // We can pad out the smaller vector for free, so if it's part of a 5950 // shuffle... 5951 Src.ShuffleVec = 5952 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 5953 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 5954 continue; 5955 } 5956 5957 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 5958 return SDValue(); 5959 5960 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 5961 // Span too large for a VEXT to cope 5962 return SDValue(); 5963 } 5964 5965 if (Src.MinElt >= NumSrcElts) { 5966 // The extraction can just take the second half 5967 Src.ShuffleVec = 5968 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5969 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 5970 Src.WindowBase = -NumSrcElts; 5971 } else if (Src.MaxElt < NumSrcElts) { 5972 // The extraction can just take the first half 5973 Src.ShuffleVec = 5974 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5975 DAG.getConstant(0, dl, MVT::i32)); 5976 } else { 5977 // An actual VEXT is needed 5978 SDValue VEXTSrc1 = 5979 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5980 DAG.getConstant(0, dl, MVT::i32)); 5981 SDValue VEXTSrc2 = 5982 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5983 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 5984 5985 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 5986 VEXTSrc2, 5987 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 5988 Src.WindowBase = -Src.MinElt; 5989 } 5990 } 5991 5992 // Another possible incompatibility occurs from the vector element types. We 5993 // can fix this by bitcasting the source vectors to the same type we intend 5994 // for the shuffle. 5995 for (auto &Src : Sources) { 5996 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 5997 if (SrcEltTy == SmallestEltTy) 5998 continue; 5999 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 6000 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 6001 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6002 Src.WindowBase *= Src.WindowScale; 6003 } 6004 6005 // Final sanity check before we try to actually produce a shuffle. 6006 DEBUG( 6007 for (auto Src : Sources) 6008 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 6009 ); 6010 6011 // The stars all align, our next step is to produce the mask for the shuffle. 6012 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 6013 int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); 6014 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 6015 SDValue Entry = Op.getOperand(i); 6016 if (Entry.isUndef()) 6017 continue; 6018 6019 auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); 6020 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 6021 6022 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 6023 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 6024 // segment. 
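// For example, extracting an i32 element into an i16 BUILD_VECTOR lane only
// defines 16 bits' worth of shuffle lanes for this segment.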
6025 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 6026 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 6027 VT.getVectorElementType().getSizeInBits()); 6028 int LanesDefined = BitsDefined / BitsPerShuffleLane; 6029 6030 // This source is expected to fill ResMultiplier lanes of the final shuffle, 6031 // starting at the appropriate offset. 6032 int *LaneMask = &Mask[i * ResMultiplier]; 6033 6034 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 6035 ExtractBase += NumElts * (Src - Sources.begin()); 6036 for (int j = 0; j < LanesDefined; ++j) 6037 LaneMask[j] = ExtractBase + j; 6038 } 6039 6040 // Final check before we try to produce nonsense... 6041 if (!isShuffleMaskLegal(Mask, ShuffleVT)) 6042 return SDValue(); 6043 6044 // We can't handle more than two sources. This should have already 6045 // been checked before this point. 6046 assert(Sources.size() <= 2 && "Too many sources!"); 6047 6048 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 6049 for (unsigned i = 0; i < Sources.size(); ++i) 6050 ShuffleOps[i] = Sources[i].ShuffleVec; 6051 6052 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 6053 ShuffleOps[1], Mask); 6054 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 6055 } 6056 6057 /// isShuffleMaskLegal - Targets can use this to indicate that they only 6058 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 6059 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 6060 /// are assumed to be legal. 6061 bool 6062 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 6063 EVT VT) const { 6064 if (VT.getVectorNumElements() == 4 && 6065 (VT.is128BitVector() || VT.is64BitVector())) { 6066 unsigned PFIndexes[4]; 6067 for (unsigned i = 0; i != 4; ++i) { 6068 if (M[i] < 0) 6069 PFIndexes[i] = 8; 6070 else 6071 PFIndexes[i] = M[i]; 6072 } 6073 6074 // Compute the index in the perfect shuffle table. 6075 unsigned PFTableIndex = 6076 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 6077 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6078 unsigned Cost = (PFEntry >> 30); 6079 6080 if (Cost <= 4) 6081 return true; 6082 } 6083 6084 bool ReverseVEXT, isV_UNDEF; 6085 unsigned Imm, WhichResult; 6086 6087 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 6088 return (EltSize >= 32 || 6089 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 6090 isVREVMask(M, VT, 64) || 6091 isVREVMask(M, VT, 32) || 6092 isVREVMask(M, VT, 16) || 6093 isVEXTMask(M, VT, ReverseVEXT, Imm) || 6094 isVTBLMask(M, VT) || 6095 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || 6096 ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); 6097 } 6098 6099 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 6100 /// the specified operations to build the shuffle. 
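/// As decoded below, each entry packs a cost into bits [31:30], the operation
/// into bits [29:26], and two 13-bit operand IDs into bits [25:13] and [12:0];
/// the operand IDs index back into the table for recursively built inputs.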
6101 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 6102 SDValue RHS, SelectionDAG &DAG, 6103 const SDLoc &dl) { 6104 unsigned OpNum = (PFEntry >> 26) & 0x0F; 6105 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 6106 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 6107 6108 enum { 6109 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 6110 OP_VREV, 6111 OP_VDUP0, 6112 OP_VDUP1, 6113 OP_VDUP2, 6114 OP_VDUP3, 6115 OP_VEXT1, 6116 OP_VEXT2, 6117 OP_VEXT3, 6118 OP_VUZPL, // VUZP, left result 6119 OP_VUZPR, // VUZP, right result 6120 OP_VZIPL, // VZIP, left result 6121 OP_VZIPR, // VZIP, right result 6122 OP_VTRNL, // VTRN, left result 6123 OP_VTRNR // VTRN, right result 6124 }; 6125 6126 if (OpNum == OP_COPY) { 6127 if (LHSID == (1*9+2)*9+3) return LHS; 6128 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 6129 return RHS; 6130 } 6131 6132 SDValue OpLHS, OpRHS; 6133 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 6134 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 6135 EVT VT = OpLHS.getValueType(); 6136 6137 switch (OpNum) { 6138 default: llvm_unreachable("Unknown shuffle opcode!"); 6139 case OP_VREV: 6140 // VREV divides the vector in half and swaps within the half. 6141 if (VT.getVectorElementType() == MVT::i32 || 6142 VT.getVectorElementType() == MVT::f32) 6143 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 6144 // vrev <4 x i16> -> VREV32 6145 if (VT.getVectorElementType() == MVT::i16) 6146 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 6147 // vrev <4 x i8> -> VREV16 6148 assert(VT.getVectorElementType() == MVT::i8); 6149 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 6150 case OP_VDUP0: 6151 case OP_VDUP1: 6152 case OP_VDUP2: 6153 case OP_VDUP3: 6154 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6155 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 6156 case OP_VEXT1: 6157 case OP_VEXT2: 6158 case OP_VEXT3: 6159 return DAG.getNode(ARMISD::VEXT, dl, VT, 6160 OpLHS, OpRHS, 6161 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 6162 case OP_VUZPL: 6163 case OP_VUZPR: 6164 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 6165 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 6166 case OP_VZIPL: 6167 case OP_VZIPR: 6168 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 6169 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 6170 case OP_VTRNL: 6171 case OP_VTRNR: 6172 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 6173 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 6174 } 6175 } 6176 6177 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 6178 ArrayRef<int> ShuffleMask, 6179 SelectionDAG &DAG) { 6180 // Check to see if we can use the VTBL instruction. 
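// VTBL looks up each result byte by index in the concatenation of its table
// registers, so the shuffle mask values can be used directly: indices 0-7
// select bytes from V1, and 8-15 select from V2 when VTBL2 is needed.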
6181 SDValue V1 = Op.getOperand(0); 6182 SDValue V2 = Op.getOperand(1); 6183 SDLoc DL(Op); 6184 6185 SmallVector<SDValue, 8> VTBLMask; 6186 for (ArrayRef<int>::iterator 6187 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 6188 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 6189 6190 if (V2.getNode()->isUndef()) 6191 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 6192 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 6193 6194 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 6195 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 6196 }
6197 6198 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 6199 SelectionDAG &DAG) { 6200 SDLoc DL(Op); 6201 SDValue OpLHS = Op.getOperand(0); 6202 EVT VT = OpLHS.getValueType(); 6203 6204 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 6205 "Expect a v8i16/v16i8 type"); 6206 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 6207 // For a v16i8 type: After the VREV, we have <7, ..., 0, 15, ..., 8>. Now, 6208 // extract the first 8 bytes into the top double word and the last 8 bytes 6209 // into the bottom double word. The v8i16 case is similar. 6210 unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; 6211 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 6212 DAG.getConstant(ExtractNum, DL, MVT::i32)); 6213 }
6214 6215 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 6216 SDValue V1 = Op.getOperand(0); 6217 SDValue V2 = Op.getOperand(1); 6218 SDLoc dl(Op); 6219 EVT VT = Op.getValueType(); 6220 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 6221 6222 // Convert shuffles that are directly supported on NEON to target-specific 6223 // DAG nodes, instead of keeping them as shuffles and matching them again 6224 // during code selection. This is more efficient and avoids the possibility 6225 // of inconsistencies between legalization and selection. 6226 // FIXME: floating-point vectors should be canonicalized to integer vectors 6227 // of the same type so that they get CSEd properly. 6228 ArrayRef<int> ShuffleMask = SVN->getMask(); 6229 6230 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 6231 if (EltSize <= 32) { 6232 if (SVN->isSplat()) { 6233 int Lane = SVN->getSplatIndex(); 6234 // If this is an undef splat, generate it via "just" vdup, if possible. 6235 if (Lane == -1) Lane = 0; 6236 6237 // Test if V1 is a SCALAR_TO_VECTOR. 6238 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 6239 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 6240 } 6241 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 6242 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 6243 // reaches it).
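// That is, (build_vector x, undef, undef, undef) splatted from lane 0 can
// also be lowered directly to (VDUP x).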
6244 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 6245 !isa<ConstantSDNode>(V1.getOperand(0))) { 6246 bool IsScalarToVector = true; 6247 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 6248 if (!V1.getOperand(i).isUndef()) { 6249 IsScalarToVector = false; 6250 break; 6251 } 6252 if (IsScalarToVector) 6253 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 6254 } 6255 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 6256 DAG.getConstant(Lane, dl, MVT::i32)); 6257 } 6258 6259 bool ReverseVEXT; 6260 unsigned Imm; 6261 if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 6262 if (ReverseVEXT) 6263 std::swap(V1, V2); 6264 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 6265 DAG.getConstant(Imm, dl, MVT::i32)); 6266 } 6267 6268 if (isVREVMask(ShuffleMask, VT, 64)) 6269 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 6270 if (isVREVMask(ShuffleMask, VT, 32)) 6271 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 6272 if (isVREVMask(ShuffleMask, VT, 16)) 6273 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 6274 6275 if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 6276 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 6277 DAG.getConstant(Imm, dl, MVT::i32)); 6278 } 6279 6280 // Check for Neon shuffles that modify both input vectors in place. 6281 // If both results are used, i.e., if there are two shuffles with the same 6282 // source operands and with masks corresponding to both results of one of 6283 // these operations, DAG memoization will ensure that a single node is 6284 // used for both shuffles. 6285 unsigned WhichResult; 6286 bool isV_UNDEF; 6287 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 6288 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 6289 if (isV_UNDEF) 6290 V2 = V1; 6291 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 6292 .getValue(WhichResult); 6293 } 6294 6295 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 6296 // shuffles that produce a result larger than their operands with: 6297 // shuffle(concat(v1, undef), concat(v2, undef)) 6298 // -> 6299 // shuffle(concat(v1, v2), undef) 6300 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 6301 // 6302 // This is useful in the general case, but there are special cases where 6303 // native shuffles produce larger results: the two-result ops. 6304 // 6305 // Look through the concat when lowering them: 6306 // shuffle(concat(v1, v2), undef) 6307 // -> 6308 // concat(VZIP(v1, v2):0, :1) 6309 // 6310 if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 6311 SDValue SubV1 = V1->getOperand(0); 6312 SDValue SubV2 = V1->getOperand(1); 6313 EVT SubVT = SubV1.getValueType(); 6314 6315 // We expect these to have been canonicalized to -1. 
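// For example, with v4i16 halves v1 and v2, a v8i16 shuffle of
// concat(v1, v2) with mask <0, 4, 1, 5, 2, 6, 3, 7> is recognized as a VZIP
// of the halves and emitted as concat(VZIP(v1, v2):0, VZIP(v1, v2):1).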
6316 assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) { 6317 return i < (int)VT.getVectorNumElements(); 6318 }) && "Unexpected shuffle index into UNDEF operand!"); 6319 6320 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 6321 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 6322 if (isV_UNDEF) 6323 SubV2 = SubV1; 6324 assert((WhichResult == 0) && 6325 "In-place shuffle of concat can only have one result!"); 6326 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 6327 SubV1, SubV2); 6328 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 6329 Res.getValue(1)); 6330 } 6331 } 6332 } 6333 6334 // If the shuffle is not directly supported and it has 4 elements, use 6335 // the PerfectShuffle-generated table to synthesize it from other shuffles. 6336 unsigned NumElts = VT.getVectorNumElements(); 6337 if (NumElts == 4) { 6338 unsigned PFIndexes[4]; 6339 for (unsigned i = 0; i != 4; ++i) { 6340 if (ShuffleMask[i] < 0) 6341 PFIndexes[i] = 8; 6342 else 6343 PFIndexes[i] = ShuffleMask[i]; 6344 } 6345 6346 // Compute the index in the perfect shuffle table. 6347 unsigned PFTableIndex = 6348 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 6349 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6350 unsigned Cost = (PFEntry >> 30); 6351 6352 if (Cost <= 4) 6353 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 6354 } 6355 6356 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 6357 if (EltSize >= 32) { 6358 // Do the expansion with floating-point types, since that is what the VFP 6359 // registers are defined to use, and since i64 is not legal. 6360 EVT EltVT = EVT::getFloatingPointVT(EltSize); 6361 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 6362 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 6363 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 6364 SmallVector<SDValue, 8> Ops; 6365 for (unsigned i = 0; i < NumElts; ++i) { 6366 if (ShuffleMask[i] < 0) 6367 Ops.push_back(DAG.getUNDEF(EltVT)); 6368 else 6369 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 6370 ShuffleMask[i] < (int)NumElts ? V1 : V2, 6371 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 6372 dl, MVT::i32))); 6373 } 6374 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 6375 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6376 } 6377 6378 if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 6379 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 6380 6381 if (VT == MVT::v8i8) 6382 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 6383 return NewOp; 6384 6385 return SDValue(); 6386 } 6387 6388 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 6389 // INSERT_VECTOR_ELT is legal only for immediate indexes. 6390 SDValue Lane = Op.getOperand(2); 6391 if (!isa<ConstantSDNode>(Lane)) 6392 return SDValue(); 6393 6394 return Op; 6395 } 6396 6397 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 6398 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
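// Sub-word extracts are additionally widened below: an i8/i16 lane extracted
// into an i32 result is rewritten as VGETLANEu, which zero-extends the lane.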
6399 SDValue Lane = Op.getOperand(1); 6400 if (!isa<ConstantSDNode>(Lane)) 6401 return SDValue(); 6402 6403 SDValue Vec = Op.getOperand(0); 6404 if (Op.getValueType() == MVT::i32 && 6405 Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { 6406 SDLoc dl(Op); 6407 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 6408 } 6409 6410 return Op; 6411 } 6412 6413 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 6414 // The only time a CONCAT_VECTORS operation can have legal types is when 6415 // two 64-bit vectors are concatenated to a 128-bit vector. 6416 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 6417 "unexpected CONCAT_VECTORS"); 6418 SDLoc dl(Op); 6419 SDValue Val = DAG.getUNDEF(MVT::v2f64); 6420 SDValue Op0 = Op.getOperand(0); 6421 SDValue Op1 = Op.getOperand(1); 6422 if (!Op0.isUndef()) 6423 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 6424 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 6425 DAG.getIntPtrConstant(0, dl)); 6426 if (!Op1.isUndef()) 6427 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 6428 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 6429 DAG.getIntPtrConstant(1, dl)); 6430 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 6431 } 6432 6433 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 6434 /// element has been zero/sign-extended, depending on the isSigned parameter, 6435 /// from an integer type half its size. 6436 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 6437 bool isSigned) { 6438 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 6439 EVT VT = N->getValueType(0); 6440 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 6441 SDNode *BVN = N->getOperand(0).getNode(); 6442 if (BVN->getValueType(0) != MVT::v4i32 || 6443 BVN->getOpcode() != ISD::BUILD_VECTOR) 6444 return false; 6445 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 6446 unsigned HiElt = 1 - LoElt; 6447 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 6448 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 6449 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 6450 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 6451 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 6452 return false; 6453 if (isSigned) { 6454 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 6455 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 6456 return true; 6457 } else { 6458 if (Hi0->isNullValue() && Hi1->isNullValue()) 6459 return true; 6460 } 6461 return false; 6462 } 6463 6464 if (N->getOpcode() != ISD::BUILD_VECTOR) 6465 return false; 6466 6467 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 6468 SDNode *Elt = N->getOperand(i).getNode(); 6469 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 6470 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 6471 unsigned HalfSize = EltSize / 2; 6472 if (isSigned) { 6473 if (!isIntN(HalfSize, C->getSExtValue())) 6474 return false; 6475 } else { 6476 if (!isUIntN(HalfSize, C->getZExtValue())) 6477 return false; 6478 } 6479 continue; 6480 } 6481 return false; 6482 } 6483 6484 return true; 6485 } 6486 6487 /// isSignExtended - Check if a node is a vector value that is sign-extended 6488 /// or a constant BUILD_VECTOR with sign-extended elements. 
6489 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 6490 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 6491 return true; 6492 if (isExtendedBUILD_VECTOR(N, DAG, true)) 6493 return true; 6494 return false; 6495 } 6496 6497 /// isZeroExtended - Check if a node is a vector value that is zero-extended 6498 /// or a constant BUILD_VECTOR with zero-extended elements. 6499 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 6500 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 6501 return true; 6502 if (isExtendedBUILD_VECTOR(N, DAG, false)) 6503 return true; 6504 return false; 6505 } 6506 6507 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 6508 if (OrigVT.getSizeInBits() >= 64) 6509 return OrigVT; 6510 6511 assert(OrigVT.isSimple() && "Expecting a simple value type"); 6512 6513 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 6514 switch (OrigSimpleTy) { 6515 default: llvm_unreachable("Unexpected Vector Type"); 6516 case MVT::v2i8: 6517 case MVT::v2i16: 6518 return MVT::v2i32; 6519 case MVT::v4i8: 6520 return MVT::v4i16; 6521 } 6522 } 6523 6524 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 6525 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 6526 /// We insert the required extension here to get the vector to fill a D register. 6527 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 6528 const EVT &OrigTy, 6529 const EVT &ExtTy, 6530 unsigned ExtOpcode) { 6531 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 6532 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 6533 // 64-bits we need to insert a new extension so that it will be 64-bits. 6534 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 6535 if (OrigTy.getSizeInBits() >= 64) 6536 return N; 6537 6538 // Must extend size to at least 64 bits to be used as an operand for VMULL. 6539 EVT NewVT = getExtensionTo64Bits(OrigTy); 6540 6541 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 6542 } 6543 6544 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 6545 /// does not do any sign/zero extension. If the original vector is less 6546 /// than 64 bits, an appropriate extension will be added after the load to 6547 /// reach a total size of 64 bits. We have to add the extension separately 6548 /// because ARM does not have a sign/zero extending load for vectors. 6549 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 6550 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 6551 6552 // The load already has the right type. 6553 if (ExtendedTy == LD->getMemoryVT()) 6554 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 6555 LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), 6556 LD->isNonTemporal(), LD->isInvariant(), 6557 LD->getAlignment()); 6558 6559 // We need to create a zextload/sextload. We cannot just create a load 6560 // followed by a zext/zext node because LowerMUL is also run during normal 6561 // operation legalization where we can't create illegal types. 
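// For example, an extending load from v4i8 is re-created here as an
// extending load to v4i16, which is the 64-bit operand shape VMULL expects.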
6562 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 6563 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 6564 LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(), 6565 LD->isNonTemporal(), LD->getAlignment()); 6566 } 6567 6568 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 6569 /// extending load, or BUILD_VECTOR with extended elements, return the 6570 /// unextended value. The unextended vector should be 64 bits so that it can 6571 /// be used as an operand to a VMULL instruction. If the original vector size 6572 /// before extension is less than 64 bits we add a an extension to resize 6573 /// the vector to 64 bits. 6574 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { 6575 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 6576 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, 6577 N->getOperand(0)->getValueType(0), 6578 N->getValueType(0), 6579 N->getOpcode()); 6580 6581 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) 6582 return SkipLoadExtensionForVMULL(LD, DAG); 6583 6584 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will 6585 // have been legalized as a BITCAST from v4i32. 6586 if (N->getOpcode() == ISD::BITCAST) { 6587 SDNode *BVN = N->getOperand(0).getNode(); 6588 assert(BVN->getOpcode() == ISD::BUILD_VECTOR && 6589 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); 6590 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 6591 return DAG.getBuildVector( 6592 MVT::v2i32, SDLoc(N), 6593 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); 6594 } 6595 // Construct a new BUILD_VECTOR with elements truncated to half the size. 6596 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 6597 EVT VT = N->getValueType(0); 6598 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 6599 unsigned NumElts = VT.getVectorNumElements(); 6600 MVT TruncVT = MVT::getIntegerVT(EltSize); 6601 SmallVector<SDValue, 8> Ops; 6602 SDLoc dl(N); 6603 for (unsigned i = 0; i != NumElts; ++i) { 6604 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 6605 const APInt &CInt = C->getAPIntValue(); 6606 // Element types smaller than 32 bits are not legal, so use i32 elements. 6607 // The values are implicitly truncated so sext vs. zext doesn't matter. 6608 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 6609 } 6610 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 6611 } 6612 6613 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 6614 unsigned Opcode = N->getOpcode(); 6615 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 6616 SDNode *N0 = N->getOperand(0).getNode(); 6617 SDNode *N1 = N->getOperand(1).getNode(); 6618 return N0->hasOneUse() && N1->hasOneUse() && 6619 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 6620 } 6621 return false; 6622 } 6623 6624 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 6625 unsigned Opcode = N->getOpcode(); 6626 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 6627 SDNode *N0 = N->getOperand(0).getNode(); 6628 SDNode *N1 = N->getOperand(1).getNode(); 6629 return N0->hasOneUse() && N1->hasOneUse() && 6630 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 6631 } 6632 return false; 6633 } 6634 6635 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 6636 // Multiplications are only custom-lowered for 128-bit vectors so that 6637 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
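// For example, (mul (sext v4i16 A), (sext v4i16 B)) : v4i32 is matched below
// and emitted as (VMULLs A, B) instead of a widened v4i32 multiply.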
6638 EVT VT = Op.getValueType(); 6639 assert(VT.is128BitVector() && VT.isInteger() && 6640 "unexpected type for custom-lowering ISD::MUL"); 6641 SDNode *N0 = Op.getOperand(0).getNode(); 6642 SDNode *N1 = Op.getOperand(1).getNode(); 6643 unsigned NewOpc = 0; 6644 bool isMLA = false; 6645 bool isN0SExt = isSignExtended(N0, DAG); 6646 bool isN1SExt = isSignExtended(N1, DAG); 6647 if (isN0SExt && isN1SExt) 6648 NewOpc = ARMISD::VMULLs; 6649 else { 6650 bool isN0ZExt = isZeroExtended(N0, DAG); 6651 bool isN1ZExt = isZeroExtended(N1, DAG); 6652 if (isN0ZExt && isN1ZExt) 6653 NewOpc = ARMISD::VMULLu; 6654 else if (isN1SExt || isN1ZExt) { 6655 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 6656 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 6657 if (isN1SExt && isAddSubSExt(N0, DAG)) { 6658 NewOpc = ARMISD::VMULLs; 6659 isMLA = true; 6660 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 6661 NewOpc = ARMISD::VMULLu; 6662 isMLA = true; 6663 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 6664 std::swap(N0, N1); 6665 NewOpc = ARMISD::VMULLu; 6666 isMLA = true; 6667 } 6668 } 6669 6670 if (!NewOpc) { 6671 if (VT == MVT::v2i64) 6672 // Fall through to expand this. It is not legal. 6673 return SDValue(); 6674 else 6675 // Other vector multiplications are legal. 6676 return Op; 6677 } 6678 } 6679 6680 // Legalize to a VMULL instruction. 6681 SDLoc DL(Op); 6682 SDValue Op0; 6683 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 6684 if (!isMLA) { 6685 Op0 = SkipExtensionForVMULL(N0, DAG); 6686 assert(Op0.getValueType().is64BitVector() && 6687 Op1.getValueType().is64BitVector() && 6688 "unexpected types for extended operands to VMULL"); 6689 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 6690 } 6691 6692 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 6693 // isel lowering to take advantage of no-stall back to back vmul + vmla. 6694 // vmull q0, d4, d6 6695 // vmlal q0, d5, d6 6696 // is faster than 6697 // vaddl q0, d4, d5 6698 // vmovl q1, d6 6699 // vmul q0, q0, q1 6700 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 6701 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 6702 EVT Op1VT = Op1.getValueType(); 6703 return DAG.getNode(N0->getOpcode(), DL, VT, 6704 DAG.getNode(NewOpc, DL, VT, 6705 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 6706 DAG.getNode(NewOpc, DL, VT, 6707 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 6708 } 6709 6710 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 6711 SelectionDAG &DAG) { 6712 // TODO: Should this propagate fast-math-flags? 6713 6714 // Convert to float 6715 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 6716 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 6717 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 6718 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 6719 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 6720 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 6721 // Get reciprocal estimate. 6722 // float4 recip = vrecpeq_f32(yf); 6723 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6724 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 6725 Y); 6726 // Because char has a smaller range than uchar, we can actually get away 6727 // without any newton steps. This requires that we use a weird bias 6728 // of 0xb000, however (again, this has been exhaustively tested). 
6729 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 6730 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 6731 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 6732 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 6733 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 6734 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 6735 // Convert back to short. 6736 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 6737 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 6738 return X; 6739 }
6740 6741 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 6742 SelectionDAG &DAG) { 6743 // TODO: Should this propagate fast-math-flags? 6744 6745 SDValue N2; 6746 // Convert to float. 6747 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 6748 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 6749 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 6750 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 6751 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 6752 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 6753 6754 // Use reciprocal estimate and one refinement step. 6755 // float4 recip = vrecpeq_f32(yf); 6756 // recip *= vrecpsq_f32(yf, recip); 6757 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6758 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 6759 N1); 6760 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6761 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 6762 N1, N2); 6763 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 6764 // Because short has a smaller range than ushort, we can actually get away 6765 // with only a single Newton step. This requires that we use a weird bias 6766 // of 0x89, however (again, this has been exhaustively tested). 6767 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 6768 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 6769 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 6770 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 6771 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 6772 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 6773 // Convert back to integer and return.
6774 // return vmovn_s32(vcvt_s32_f32(result)); 6775 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 6776 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 6777 return N0; 6778 } 6779 6780 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 6781 EVT VT = Op.getValueType(); 6782 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 6783 "unexpected type for custom-lowering ISD::SDIV"); 6784 6785 SDLoc dl(Op); 6786 SDValue N0 = Op.getOperand(0); 6787 SDValue N1 = Op.getOperand(1); 6788 SDValue N2, N3; 6789 6790 if (VT == MVT::v8i8) { 6791 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 6792 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 6793 6794 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 6795 DAG.getIntPtrConstant(4, dl)); 6796 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 6797 DAG.getIntPtrConstant(4, dl)); 6798 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 6799 DAG.getIntPtrConstant(0, dl)); 6800 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 6801 DAG.getIntPtrConstant(0, dl)); 6802 6803 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 6804 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 6805 6806 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 6807 N0 = LowerCONCAT_VECTORS(N0, DAG); 6808 6809 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 6810 return N0; 6811 } 6812 return LowerSDIV_v4i16(N0, N1, dl, DAG); 6813 } 6814 6815 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { 6816 // TODO: Should this propagate fast-math-flags? 6817 EVT VT = Op.getValueType(); 6818 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 6819 "unexpected type for custom-lowering ISD::UDIV"); 6820 6821 SDLoc dl(Op); 6822 SDValue N0 = Op.getOperand(0); 6823 SDValue N1 = Op.getOperand(1); 6824 SDValue N2, N3; 6825 6826 if (VT == MVT::v8i8) { 6827 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 6828 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 6829 6830 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 6831 DAG.getIntPtrConstant(4, dl)); 6832 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 6833 DAG.getIntPtrConstant(4, dl)); 6834 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 6835 DAG.getIntPtrConstant(0, dl)); 6836 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 6837 DAG.getIntPtrConstant(0, dl)); 6838 6839 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 6840 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 6841 6842 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 6843 N0 = LowerCONCAT_VECTORS(N0, DAG); 6844 6845 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 6846 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 6847 MVT::i32), 6848 N0); 6849 return N0; 6850 } 6851 6852 // v4i16 sdiv ... Convert to float. 6853 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 6854 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 6855 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 6856 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 6857 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 6858 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 6859 6860 // Use reciprocal estimate and two refinement steps. 
6861 // float4 recip = vrecpeq_f32(yf); 6862 // recip *= vrecpsq_f32(yf, recip); 6863 // recip *= vrecpsq_f32(yf, recip); 6864 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6865 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 6866 BN1); 6867 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6868 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 6869 BN1, N2); 6870 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 6871 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6872 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 6873 BN1, N2); 6874 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 6875 // Simply multiplying by the reciprocal estimate can leave us a few ulps 6876 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 6877 // and that it will never cause us to return an answer too large). 6878 // float4 result = as_float4(as_int4(xf*recip) + 2); 6879 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 6880 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 6881 N1 = DAG.getConstant(2, dl, MVT::v4i32); 6882 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 6883 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 6884 // Convert back to integer and return. 6885 // return vmovn_u32(vcvt_s32_f32(result)); 6886 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 6887 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 6888 return N0; 6889 } 6890 6891 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 6892 EVT VT = Op.getNode()->getValueType(0); 6893 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 6894 6895 unsigned Opc; 6896 bool ExtraOp = false; 6897 switch (Op.getOpcode()) { 6898 default: llvm_unreachable("Invalid code"); 6899 case ISD::ADDC: Opc = ARMISD::ADDC; break; 6900 case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break; 6901 case ISD::SUBC: Opc = ARMISD::SUBC; break; 6902 case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break; 6903 } 6904 6905 if (!ExtraOp) 6906 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 6907 Op.getOperand(1)); 6908 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 6909 Op.getOperand(1), Op.getOperand(2)); 6910 } 6911 6912 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 6913 assert(Subtarget->isTargetDarwin()); 6914 6915 // For iOS, we want to call an alternative entry point: __sincos_stret, 6916 // return values are passed via sret. 6917 SDLoc dl(Op); 6918 SDValue Arg = Op.getOperand(0); 6919 EVT ArgVT = Arg.getValueType(); 6920 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 6921 auto PtrVT = getPointerTy(DAG.getDataLayout()); 6922 6923 MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); 6924 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6925 6926 // Pair of floats / doubles used to pass the result. 6927 Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 6928 auto &DL = DAG.getDataLayout(); 6929 6930 ArgListTy Args; 6931 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 6932 SDValue SRet; 6933 if (ShouldUseSRet) { 6934 // Create stack object for sret. 
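// The callee writes { sin, cos } into this block; the code below loads sin
// back from offset 0 and cos from one element further on.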
6935 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 6936 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 6937 int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); 6938 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 6939 6940 ArgListEntry Entry; 6941 Entry.Node = SRet; 6942 Entry.Ty = RetTy->getPointerTo(); 6943 Entry.isSExt = false; 6944 Entry.isZExt = false; 6945 Entry.isSRet = true; 6946 Args.push_back(Entry); 6947 RetTy = Type::getVoidTy(*DAG.getContext()); 6948 } 6949 6950 ArgListEntry Entry; 6951 Entry.Node = Arg; 6952 Entry.Ty = ArgTy; 6953 Entry.isSExt = false; 6954 Entry.isZExt = false; 6955 Args.push_back(Entry); 6956 6957 const char *LibcallName = 6958 (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; 6959 RTLIB::Libcall LC = 6960 (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; 6961 CallingConv::ID CC = getLibcallCallingConv(LC); 6962 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 6963 6964 TargetLowering::CallLoweringInfo CLI(DAG); 6965 CLI.setDebugLoc(dl) 6966 .setChain(DAG.getEntryNode()) 6967 .setCallee(CC, RetTy, Callee, std::move(Args)) 6968 .setDiscardResult(ShouldUseSRet); 6969 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 6970 6971 if (!ShouldUseSRet) 6972 return CallResult.first; 6973 6974 SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, 6975 MachinePointerInfo(), false, false, false, 0); 6976 6977 // Address of cos field. 6978 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 6979 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 6980 SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, 6981 MachinePointerInfo(), false, false, false, 0); 6982 6983 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 6984 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 6985 LoadSin.getValue(0), LoadCos.getValue(0)); 6986 } 6987 6988 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 6989 bool Signed, 6990 SDValue &Chain) const { 6991 EVT VT = Op.getValueType(); 6992 assert((VT == MVT::i32 || VT == MVT::i64) && 6993 "unexpected type for custom lowering DIV"); 6994 SDLoc dl(Op); 6995 6996 const auto &DL = DAG.getDataLayout(); 6997 const auto &TLI = DAG.getTargetLoweringInfo(); 6998 6999 const char *Name = nullptr; 7000 if (Signed) 7001 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 7002 else 7003 Name = (VT == MVT::i32) ? 
"__rt_udiv" : "__rt_udiv64"; 7004 7005 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 7006 7007 ARMTargetLowering::ArgListTy Args; 7008 7009 for (auto AI : {1, 0}) { 7010 ArgListEntry Arg; 7011 Arg.Node = Op.getOperand(AI); 7012 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 7013 Args.push_back(Arg); 7014 } 7015 7016 CallLoweringInfo CLI(DAG); 7017 CLI.setDebugLoc(dl) 7018 .setChain(Chain) 7019 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 7020 ES, std::move(Args)); 7021 7022 return LowerCallTo(CLI).first; 7023 } 7024 7025 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 7026 bool Signed) const { 7027 assert(Op.getValueType() == MVT::i32 && 7028 "unexpected type for custom lowering DIV"); 7029 SDLoc dl(Op); 7030 7031 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 7032 DAG.getEntryNode(), Op.getOperand(1)); 7033 7034 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 7035 } 7036 7037 void ARMTargetLowering::ExpandDIV_Windows( 7038 SDValue Op, SelectionDAG &DAG, bool Signed, 7039 SmallVectorImpl<SDValue> &Results) const { 7040 const auto &DL = DAG.getDataLayout(); 7041 const auto &TLI = DAG.getTargetLoweringInfo(); 7042 7043 assert(Op.getValueType() == MVT::i64 && 7044 "unexpected type for custom lowering DIV"); 7045 SDLoc dl(Op); 7046 7047 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), 7048 DAG.getConstant(0, dl, MVT::i32)); 7049 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), 7050 DAG.getConstant(1, dl, MVT::i32)); 7051 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi); 7052 7053 SDValue DBZCHK = 7054 DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or); 7055 7056 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 7057 7058 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 7059 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 7060 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 7061 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 7062 7063 Results.push_back(Lower); 7064 Results.push_back(Upper); 7065 } 7066 7067 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 7068 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) 7069 // Acquire/Release load/store is not legal for targets without a dmb or 7070 // equivalent available. 7071 return SDValue(); 7072 7073 // Monotonic load/store is legal for all targets. 
7074 return Op; 7075 } 7076 7077 static void ReplaceREADCYCLECOUNTER(SDNode *N, 7078 SmallVectorImpl<SDValue> &Results, 7079 SelectionDAG &DAG, 7080 const ARMSubtarget *Subtarget) { 7081 SDLoc DL(N); 7082 // Under Power Management extensions, the cycle-count is: 7083 // mrc p15, #0, <Rt>, c9, c13, #0 7084 SDValue Ops[] = { N->getOperand(0), // Chain 7085 DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), 7086 DAG.getConstant(15, DL, MVT::i32), 7087 DAG.getConstant(0, DL, MVT::i32), 7088 DAG.getConstant(9, DL, MVT::i32), 7089 DAG.getConstant(13, DL, MVT::i32), 7090 DAG.getConstant(0, DL, MVT::i32) 7091 }; 7092 7093 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 7094 DAG.getVTList(MVT::i32, MVT::Other), Ops); 7095 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 7096 DAG.getConstant(0, DL, MVT::i32))); 7097 Results.push_back(Cycles32.getValue(1)); 7098 } 7099 7100 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 7101 SDLoc dl(V.getNode()); 7102 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 7103 SDValue VHi = DAG.getAnyExtOrTrunc( 7104 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 7105 dl, MVT::i32); 7106 SDValue RegClass = 7107 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 7108 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 7109 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 7110 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 7111 return SDValue( 7112 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 7113 } 7114 7115 static void ReplaceCMP_SWAP_64Results(SDNode *N, 7116 SmallVectorImpl<SDValue> & Results, 7117 SelectionDAG &DAG) { 7118 assert(N->getValueType(0) == MVT::i64 && 7119 "AtomicCmpSwap on types less than 64 should be legal"); 7120 SDValue Ops[] = {N->getOperand(1), 7121 createGPRPairNode(DAG, N->getOperand(2)), 7122 createGPRPairNode(DAG, N->getOperand(3)), 7123 N->getOperand(0)}; 7124 SDNode *CmpSwap = DAG.getMachineNode( 7125 ARM::CMP_SWAP_64, SDLoc(N), 7126 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 7127 7128 MachineFunction &MF = DAG.getMachineFunction(); 7129 MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); 7130 MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); 7131 cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); 7132 7133 Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32, 7134 SDValue(CmpSwap, 0))); 7135 Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32, 7136 SDValue(CmpSwap, 0))); 7137 Results.push_back(SDValue(CmpSwap, 2)); 7138 } 7139 7140 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7141 switch (Op.getOpcode()) { 7142 default: llvm_unreachable("Don't know how to custom lower this!"); 7143 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 7144 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7145 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7146 case ISD::GlobalAddress: 7147 switch (Subtarget->getTargetTriple().getObjectFormat()) { 7148 default: llvm_unreachable("unknown object format"); 7149 case Triple::COFF: 7150 return LowerGlobalAddressWindows(Op, DAG); 7151 case Triple::ELF: 7152 return LowerGlobalAddressELF(Op, DAG); 7153 case Triple::MachO: 7154 return LowerGlobalAddressDarwin(Op, DAG); 7155 } 7156 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7157 case ISD::SELECT: return LowerSELECT(Op, DAG); 7158 
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 7159 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 7160 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 7161 case ISD::VASTART: return LowerVASTART(Op, DAG); 7162 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 7163 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 7164 case ISD::SINT_TO_FP: 7165 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 7166 case ISD::FP_TO_SINT: 7167 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 7168 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7169 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7170 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7171 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 7172 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 7173 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 7174 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 7175 Subtarget); 7176 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); 7177 case ISD::SHL: 7178 case ISD::SRL: 7179 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 7180 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 7181 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 7182 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 7183 case ISD::SRL_PARTS: 7184 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 7185 case ISD::CTTZ: 7186 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 7187 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 7188 case ISD::SETCC: return LowerVSETCC(Op, DAG); 7189 case ISD::SETCCE: return LowerSETCCE(Op, DAG); 7190 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 7191 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 7192 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7193 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7194 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7195 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7196 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7197 case ISD::MUL: return LowerMUL(Op, DAG); 7198 case ISD::SDIV: 7199 if (Subtarget->isTargetWindows()) 7200 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 7201 return LowerSDIV(Op, DAG); 7202 case ISD::UDIV: 7203 if (Subtarget->isTargetWindows()) 7204 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 7205 return LowerUDIV(Op, DAG); 7206 case ISD::ADDC: 7207 case ISD::ADDE: 7208 case ISD::SUBC: 7209 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 7210 case ISD::SADDO: 7211 case ISD::UADDO: 7212 case ISD::SSUBO: 7213 case ISD::USUBO: 7214 return LowerXALUO(Op, DAG); 7215 case ISD::ATOMIC_LOAD: 7216 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 7217 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 7218 case ISD::SDIVREM: 7219 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 7220 case ISD::DYNAMIC_STACKALLOC: 7221 if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) 7222 return LowerDYNAMIC_STACKALLOC(Op, DAG); 7223 llvm_unreachable("Don't know how to custom lower this!"); 7224 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 7225 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 7226 case ARMISD::WIN__DBZCHK: return SDValue(); 7227 } 7228 } 7229 7230 /// ReplaceNodeResults - Replace the results of node with an illegal result 7231 /// type with new values built 
out of custom code. 7232 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 7233 SmallVectorImpl<SDValue> &Results, 7234 SelectionDAG &DAG) const { 7235 SDValue Res; 7236 switch (N->getOpcode()) { 7237 default: 7238 llvm_unreachable("Don't know how to custom expand this!"); 7239 case ISD::READ_REGISTER: 7240 ExpandREAD_REGISTER(N, Results, DAG); 7241 break; 7242 case ISD::BITCAST: 7243 Res = ExpandBITCAST(N, DAG); 7244 break; 7245 case ISD::SRL: 7246 case ISD::SRA: 7247 Res = Expand64BitShift(N, DAG, Subtarget); 7248 break; 7249 case ISD::SREM: 7250 case ISD::UREM: 7251 Res = LowerREM(N, DAG); 7252 break; 7253 case ISD::SDIVREM: 7254 case ISD::UDIVREM: 7255 Res = LowerDivRem(SDValue(N, 0), DAG); 7256 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 7257 Results.push_back(Res.getValue(0)); 7258 Results.push_back(Res.getValue(1)); 7259 return; 7260 case ISD::READCYCLECOUNTER: 7261 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 7262 return; 7263 case ISD::UDIV: 7264 case ISD::SDIV: 7265 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 7266 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 7267 Results); 7268 case ISD::ATOMIC_CMP_SWAP: 7269 ReplaceCMP_SWAP_64Results(N, Results, DAG); 7270 return; 7271 } 7272 if (Res.getNode()) 7273 Results.push_back(Res); 7274 } 7275 7276 //===----------------------------------------------------------------------===// 7277 // ARM Scheduler Hooks 7278 //===----------------------------------------------------------------------===// 7279 7280 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 7281 /// registers the function context. 7282 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 7283 MachineBasicBlock *MBB, 7284 MachineBasicBlock *DispatchBB, 7285 int FI) const { 7286 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7287 DebugLoc dl = MI.getDebugLoc(); 7288 MachineFunction *MF = MBB->getParent(); 7289 MachineRegisterInfo *MRI = &MF->getRegInfo(); 7290 MachineConstantPool *MCP = MF->getConstantPool(); 7291 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 7292 const Function *F = MF->getFunction(); 7293 7294 bool isThumb = Subtarget->isThumb(); 7295 bool isThumb2 = Subtarget->isThumb2(); 7296 7297 unsigned PCLabelId = AFI->createPICLabelUId(); 7298 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 7299 ARMConstantPoolValue *CPV = 7300 ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); 7301 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 7302 7303 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 7304 : &ARM::GPRRegClass; 7305 7306 // Grab constant pool and fixed stack memory operands. 7307 MachineMemOperand *CPMMO = 7308 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 7309 MachineMemOperand::MOLoad, 4, 4); 7310 7311 MachineMemOperand *FIMMOSt = 7312 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 7313 MachineMemOperand::MOStore, 4, 4); 7314 7315 // Load the address of the dispatch MBB into the jump buffer. 7316 if (isThumb2) { 7317 // Incoming value: jbuf 7318 // ldr.n r5, LCPI1_1 7319 // orr r5, r5, #1 7320 // add r5, pc 7321 // str r5, [$jbuf, #+4] ; &jbuf[1] 7322 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7323 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 7324 .addConstantPoolIndex(CPI) 7325 .addMemOperand(CPMMO)); 7326 // Set the low bit because of thumb mode. 
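  // (Bit 0 of a code address selects the instruction set on an interworking
  // branch, so the dispatch address stored into the jump buffer must have the
  // low bit set to stay in Thumb state.)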
7327 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7328 AddDefaultCC( 7329 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 7330 .addReg(NewVReg1, RegState::Kill) 7331 .addImm(0x01))); 7332 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7333 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 7334 .addReg(NewVReg2, RegState::Kill) 7335 .addImm(PCLabelId); 7336 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 7337 .addReg(NewVReg3, RegState::Kill) 7338 .addFrameIndex(FI) 7339 .addImm(36) // &jbuf[1] :: pc 7340 .addMemOperand(FIMMOSt)); 7341 } else if (isThumb) { 7342 // Incoming value: jbuf 7343 // ldr.n r1, LCPI1_4 7344 // add r1, pc 7345 // mov r2, #1 7346 // orrs r1, r2 7347 // add r2, $jbuf, #+4 ; &jbuf[1] 7348 // str r1, [r2] 7349 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7350 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 7351 .addConstantPoolIndex(CPI) 7352 .addMemOperand(CPMMO)); 7353 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7354 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 7355 .addReg(NewVReg1, RegState::Kill) 7356 .addImm(PCLabelId); 7357 // Set the low bit because of thumb mode. 7358 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7359 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 7360 .addReg(ARM::CPSR, RegState::Define) 7361 .addImm(1)); 7362 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7363 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 7364 .addReg(ARM::CPSR, RegState::Define) 7365 .addReg(NewVReg2, RegState::Kill) 7366 .addReg(NewVReg3, RegState::Kill)); 7367 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 7368 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 7369 .addFrameIndex(FI) 7370 .addImm(36); // &jbuf[1] :: pc 7371 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 7372 .addReg(NewVReg4, RegState::Kill) 7373 .addReg(NewVReg5, RegState::Kill) 7374 .addImm(0) 7375 .addMemOperand(FIMMOSt)); 7376 } else { 7377 // Incoming value: jbuf 7378 // ldr r1, LCPI1_1 7379 // add r1, pc, r1 7380 // str r1, [$jbuf, #+4] ; &jbuf[1] 7381 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7382 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 7383 .addConstantPoolIndex(CPI) 7384 .addImm(0) 7385 .addMemOperand(CPMMO)); 7386 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7387 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 7388 .addReg(NewVReg1, RegState::Kill) 7389 .addImm(PCLabelId)); 7390 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 7391 .addReg(NewVReg2, RegState::Kill) 7392 .addFrameIndex(FI) 7393 .addImm(36) // &jbuf[1] :: pc 7394 .addMemOperand(FIMMOSt)); 7395 } 7396 } 7397 7398 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 7399 MachineBasicBlock *MBB) const { 7400 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7401 DebugLoc dl = MI.getDebugLoc(); 7402 MachineFunction *MF = MBB->getParent(); 7403 MachineRegisterInfo *MRI = &MF->getRegInfo(); 7404 MachineFrameInfo *MFI = MF->getFrameInfo(); 7405 int FI = MFI->getFunctionContextIndex(); 7406 7407 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 7408 : &ARM::GPRnopcRegClass; 7409 7410 // Get a mapping of the call site numbers to all of the landing pads they're 7411 // associated with. 
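  // (Illustrative shape of the mapping, with hypothetical block names:
  // call-site 1 -> {lpad.a}, call-site 2 -> {lpad.a, lpad.b}, ...; the
  // call-site numbers later index the dispatch jump table built below.)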
7412 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad; 7413 unsigned MaxCSNum = 0; 7414 MachineModuleInfo &MMI = MF->getMMI(); 7415 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 7416 ++BB) { 7417 if (!BB->isEHPad()) continue; 7418 7419 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 7420 // pad. 7421 for (MachineBasicBlock::iterator 7422 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 7423 if (!II->isEHLabel()) continue; 7424 7425 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 7426 if (!MMI.hasCallSiteLandingPad(Sym)) continue; 7427 7428 SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym); 7429 for (SmallVectorImpl<unsigned>::iterator 7430 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 7431 CSI != CSE; ++CSI) { 7432 CallSiteNumToLPad[*CSI].push_back(&*BB); 7433 MaxCSNum = std::max(MaxCSNum, *CSI); 7434 } 7435 break; 7436 } 7437 } 7438 7439 // Get an ordered list of the machine basic blocks for the jump table. 7440 std::vector<MachineBasicBlock*> LPadList; 7441 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 7442 LPadList.reserve(CallSiteNumToLPad.size()); 7443 for (unsigned I = 1; I <= MaxCSNum; ++I) { 7444 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 7445 for (SmallVectorImpl<MachineBasicBlock*>::iterator 7446 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 7447 LPadList.push_back(*II); 7448 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 7449 } 7450 } 7451 7452 assert(!LPadList.empty() && 7453 "No landing pad destinations for the dispatch jump table!"); 7454 7455 // Create the jump table and associated information. 7456 MachineJumpTableInfo *JTI = 7457 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 7458 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 7459 7460 // Create the MBBs for the dispatch code. 7461 7462 // Shove the dispatch's address into the return slot in the function context. 7463 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 7464 DispatchBB->setIsEHPad(); 7465 7466 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 7467 unsigned trap_opcode; 7468 if (Subtarget->isThumb()) 7469 trap_opcode = ARM::tTRAP; 7470 else 7471 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 7472 7473 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 7474 DispatchBB->addSuccessor(TrapBB); 7475 7476 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 7477 DispatchBB->addSuccessor(DispContBB); 7478 7479 // Insert and MBBs. 7480 MF->insert(MF->end(), DispatchBB); 7481 MF->insert(MF->end(), DispContBB); 7482 MF->insert(MF->end(), TrapBB); 7483 7484 // Insert code into the entry block that creates and registers the function 7485 // context. 7486 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 7487 7488 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 7489 MachinePointerInfo::getFixedStack(*MF, FI), 7490 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); 7491 7492 MachineInstrBuilder MIB; 7493 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 7494 7495 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 7496 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 7497 7498 // Add a register mask with no preserved registers. This results in all 7499 // registers being marked as clobbered. 
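  // In effect the dispatch setup behaves like a call that may clobber every
  // register, so nothing is assumed to be live in a register across it; values
  // that must survive a longjmp-style re-entry into this function live in the
  // function context on the stack instead.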
7500 MIB.addRegMask(RI.getNoPreservedMask()); 7501 7502 bool IsPositionIndependent = isPositionIndependent(); 7503 unsigned NumLPads = LPadList.size(); 7504 if (Subtarget->isThumb2()) { 7505 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7506 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 7507 .addFrameIndex(FI) 7508 .addImm(4) 7509 .addMemOperand(FIMMOLd)); 7510 7511 if (NumLPads < 256) { 7512 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 7513 .addReg(NewVReg1) 7514 .addImm(LPadList.size())); 7515 } else { 7516 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7517 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 7518 .addImm(NumLPads & 0xFFFF)); 7519 7520 unsigned VReg2 = VReg1; 7521 if ((NumLPads & 0xFFFF0000) != 0) { 7522 VReg2 = MRI->createVirtualRegister(TRC); 7523 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 7524 .addReg(VReg1) 7525 .addImm(NumLPads >> 16)); 7526 } 7527 7528 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 7529 .addReg(NewVReg1) 7530 .addReg(VReg2)); 7531 } 7532 7533 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 7534 .addMBB(TrapBB) 7535 .addImm(ARMCC::HI) 7536 .addReg(ARM::CPSR); 7537 7538 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7539 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3) 7540 .addJumpTableIndex(MJTI)); 7541 7542 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7543 AddDefaultCC( 7544 AddDefaultPred( 7545 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 7546 .addReg(NewVReg3, RegState::Kill) 7547 .addReg(NewVReg1) 7548 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); 7549 7550 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 7551 .addReg(NewVReg4, RegState::Kill) 7552 .addReg(NewVReg1) 7553 .addJumpTableIndex(MJTI); 7554 } else if (Subtarget->isThumb()) { 7555 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7556 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 7557 .addFrameIndex(FI) 7558 .addImm(1) 7559 .addMemOperand(FIMMOLd)); 7560 7561 if (NumLPads < 256) { 7562 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 7563 .addReg(NewVReg1) 7564 .addImm(NumLPads)); 7565 } else { 7566 MachineConstantPool *ConstantPool = MF->getConstantPool(); 7567 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 7568 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 7569 7570 // MachineConstantPool wants an explicit alignment. 
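  // If the DataLayout reports no preferred alignment for i32, the constant's
  // alloc size (4 bytes here) is used as the alignment instead.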
7571 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 7572 if (Align == 0) 7573 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 7574 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 7575 7576 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7577 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 7578 .addReg(VReg1, RegState::Define) 7579 .addConstantPoolIndex(Idx)); 7580 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 7581 .addReg(NewVReg1) 7582 .addReg(VReg1)); 7583 } 7584 7585 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 7586 .addMBB(TrapBB) 7587 .addImm(ARMCC::HI) 7588 .addReg(ARM::CPSR); 7589 7590 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7591 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 7592 .addReg(ARM::CPSR, RegState::Define) 7593 .addReg(NewVReg1) 7594 .addImm(2)); 7595 7596 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7597 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 7598 .addJumpTableIndex(MJTI)); 7599 7600 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7601 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 7602 .addReg(ARM::CPSR, RegState::Define) 7603 .addReg(NewVReg2, RegState::Kill) 7604 .addReg(NewVReg3)); 7605 7606 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 7607 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 7608 7609 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 7610 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 7611 .addReg(NewVReg4, RegState::Kill) 7612 .addImm(0) 7613 .addMemOperand(JTMMOLd)); 7614 7615 unsigned NewVReg6 = NewVReg5; 7616 if (IsPositionIndependent) { 7617 NewVReg6 = MRI->createVirtualRegister(TRC); 7618 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 7619 .addReg(ARM::CPSR, RegState::Define) 7620 .addReg(NewVReg5, RegState::Kill) 7621 .addReg(NewVReg3)); 7622 } 7623 7624 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 7625 .addReg(NewVReg6, RegState::Kill) 7626 .addJumpTableIndex(MJTI); 7627 } else { 7628 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7629 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 7630 .addFrameIndex(FI) 7631 .addImm(4) 7632 .addMemOperand(FIMMOLd)); 7633 7634 if (NumLPads < 256) { 7635 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 7636 .addReg(NewVReg1) 7637 .addImm(NumLPads)); 7638 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 7639 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7640 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 7641 .addImm(NumLPads & 0xFFFF)); 7642 7643 unsigned VReg2 = VReg1; 7644 if ((NumLPads & 0xFFFF0000) != 0) { 7645 VReg2 = MRI->createVirtualRegister(TRC); 7646 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 7647 .addReg(VReg1) 7648 .addImm(NumLPads >> 16)); 7649 } 7650 7651 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 7652 .addReg(NewVReg1) 7653 .addReg(VReg2)); 7654 } else { 7655 MachineConstantPool *ConstantPool = MF->getConstantPool(); 7656 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 7657 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 7658 7659 // MachineConstantPool wants an explicit alignment. 
7660 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 7661 if (Align == 0) 7662 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 7663 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 7664 7665 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7666 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 7667 .addReg(VReg1, RegState::Define) 7668 .addConstantPoolIndex(Idx) 7669 .addImm(0)); 7670 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 7671 .addReg(NewVReg1) 7672 .addReg(VReg1, RegState::Kill)); 7673 } 7674 7675 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 7676 .addMBB(TrapBB) 7677 .addImm(ARMCC::HI) 7678 .addReg(ARM::CPSR); 7679 7680 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7681 AddDefaultCC( 7682 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 7683 .addReg(NewVReg1) 7684 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); 7685 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7686 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 7687 .addJumpTableIndex(MJTI)); 7688 7689 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 7690 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 7691 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 7692 AddDefaultPred( 7693 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 7694 .addReg(NewVReg3, RegState::Kill) 7695 .addReg(NewVReg4) 7696 .addImm(0) 7697 .addMemOperand(JTMMOLd)); 7698 7699 if (IsPositionIndependent) { 7700 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 7701 .addReg(NewVReg5, RegState::Kill) 7702 .addReg(NewVReg4) 7703 .addJumpTableIndex(MJTI); 7704 } else { 7705 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 7706 .addReg(NewVReg5, RegState::Kill) 7707 .addJumpTableIndex(MJTI); 7708 } 7709 } 7710 7711 // Add the jump table entries as successors to the MBB. 7712 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 7713 for (std::vector<MachineBasicBlock*>::iterator 7714 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 7715 MachineBasicBlock *CurMBB = *I; 7716 if (SeenMBBs.insert(CurMBB).second) 7717 DispContBB->addSuccessor(CurMBB); 7718 } 7719 7720 // N.B. the order the invoke BBs are processed in doesn't matter here. 7721 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 7722 SmallVector<MachineBasicBlock*, 64> MBBLPads; 7723 for (MachineBasicBlock *BB : InvokeBBs) { 7724 7725 // Remove the landing pad successor from the invoke block and replace it 7726 // with the new dispatch block. 7727 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 7728 BB->succ_end()); 7729 while (!Successors.empty()) { 7730 MachineBasicBlock *SMBB = Successors.pop_back_val(); 7731 if (SMBB->isEHPad()) { 7732 BB->removeSuccessor(SMBB); 7733 MBBLPads.push_back(SMBB); 7734 } 7735 } 7736 7737 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 7738 BB->normalizeSuccProbs(); 7739 7740 // Find the invoke call and mark all of the callee-saved registers as 7741 // 'implicit defined' so that they're spilled. This prevents code from 7742 // moving instructions to before the EH block, where they will never be 7743 // executed. 
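  // Sketch of the effect on the invoke's call instruction (register names are
  // illustrative only): BL @callee, ..., implicit-def dead %r4,
  // implicit-def dead %r11, ... - the dead implicit defs force the
  // callee-saved registers to be spilled without introducing new live values.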
7744 for (MachineBasicBlock::reverse_iterator 7745 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 7746 if (!II->isCall()) continue; 7747 7748 DenseMap<unsigned, bool> DefRegs; 7749 for (MachineInstr::mop_iterator 7750 OI = II->operands_begin(), OE = II->operands_end(); 7751 OI != OE; ++OI) { 7752 if (!OI->isReg()) continue; 7753 DefRegs[OI->getReg()] = true; 7754 } 7755 7756 MachineInstrBuilder MIB(*MF, &*II); 7757 7758 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 7759 unsigned Reg = SavedRegs[i]; 7760 if (Subtarget->isThumb2() && 7761 !ARM::tGPRRegClass.contains(Reg) && 7762 !ARM::hGPRRegClass.contains(Reg)) 7763 continue; 7764 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 7765 continue; 7766 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 7767 continue; 7768 if (!DefRegs[Reg]) 7769 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 7770 } 7771 7772 break; 7773 } 7774 } 7775 7776 // Mark all former landing pads as non-landing pads. The dispatch is the only 7777 // landing pad now. 7778 for (SmallVectorImpl<MachineBasicBlock*>::iterator 7779 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 7780 (*I)->setIsEHPad(false); 7781 7782 // The instruction is gone now. 7783 MI.eraseFromParent(); 7784 } 7785 7786 static 7787 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 7788 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 7789 E = MBB->succ_end(); I != E; ++I) 7790 if (*I != Succ) 7791 return *I; 7792 llvm_unreachable("Expecting a BB with two successors!"); 7793 } 7794 7795 /// Return the load opcode for a given load size. If load size >= 8, 7796 /// neon opcode will be returned. 7797 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 7798 if (LdSize >= 8) 7799 return LdSize == 16 ? ARM::VLD1q32wb_fixed 7800 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 7801 if (IsThumb1) 7802 return LdSize == 4 ? ARM::tLDRi 7803 : LdSize == 2 ? ARM::tLDRHi 7804 : LdSize == 1 ? ARM::tLDRBi : 0; 7805 if (IsThumb2) 7806 return LdSize == 4 ? ARM::t2LDR_POST 7807 : LdSize == 2 ? ARM::t2LDRH_POST 7808 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 7809 return LdSize == 4 ? ARM::LDR_POST_IMM 7810 : LdSize == 2 ? ARM::LDRH_POST 7811 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 7812 } 7813 7814 /// Return the store opcode for a given store size. If store size >= 8, 7815 /// neon opcode will be returned. 7816 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 7817 if (StSize >= 8) 7818 return StSize == 16 ? ARM::VST1q32wb_fixed 7819 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 7820 if (IsThumb1) 7821 return StSize == 4 ? ARM::tSTRi 7822 : StSize == 2 ? ARM::tSTRHi 7823 : StSize == 1 ? ARM::tSTRBi : 0; 7824 if (IsThumb2) 7825 return StSize == 4 ? ARM::t2STR_POST 7826 : StSize == 2 ? ARM::t2STRH_POST 7827 : StSize == 1 ? ARM::t2STRB_POST : 0; 7828 return StSize == 4 ? ARM::STR_POST_IMM 7829 : StSize == 2 ? ARM::STRH_POST 7830 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 7831 } 7832 7833 /// Emit a post-increment load operation with given size. The instructions 7834 /// will be added to BB at Pos. 
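/// For example, LdSize == 4 is expected to select tLDRi plus a tADDi8 address
/// update in Thumb1, t2LDR_POST in Thumb2, and LDR_POST_IMM in ARM mode, with
/// the incremented address left in AddrOut.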
7835 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 7836 const TargetInstrInfo *TII, const DebugLoc &dl, 7837 unsigned LdSize, unsigned Data, unsigned AddrIn, 7838 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 7839 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 7840 assert(LdOpc != 0 && "Should have a load opcode"); 7841 if (LdSize >= 8) { 7842 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 7843 .addReg(AddrOut, RegState::Define).addReg(AddrIn) 7844 .addImm(0)); 7845 } else if (IsThumb1) { 7846 // load + update AddrIn 7847 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 7848 .addReg(AddrIn).addImm(0)); 7849 MachineInstrBuilder MIB = 7850 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); 7851 MIB = AddDefaultT1CC(MIB); 7852 MIB.addReg(AddrIn).addImm(LdSize); 7853 AddDefaultPred(MIB); 7854 } else if (IsThumb2) { 7855 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 7856 .addReg(AddrOut, RegState::Define).addReg(AddrIn) 7857 .addImm(LdSize)); 7858 } else { // arm 7859 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 7860 .addReg(AddrOut, RegState::Define).addReg(AddrIn) 7861 .addReg(0).addImm(LdSize)); 7862 } 7863 } 7864 7865 /// Emit a post-increment store operation with given size. The instructions 7866 /// will be added to BB at Pos. 7867 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 7868 const TargetInstrInfo *TII, const DebugLoc &dl, 7869 unsigned StSize, unsigned Data, unsigned AddrIn, 7870 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 7871 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 7872 assert(StOpc != 0 && "Should have a store opcode"); 7873 if (StSize >= 8) { 7874 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 7875 .addReg(AddrIn).addImm(0).addReg(Data)); 7876 } else if (IsThumb1) { 7877 // store + update AddrIn 7878 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data) 7879 .addReg(AddrIn).addImm(0)); 7880 MachineInstrBuilder MIB = 7881 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); 7882 MIB = AddDefaultT1CC(MIB); 7883 MIB.addReg(AddrIn).addImm(StSize); 7884 AddDefaultPred(MIB); 7885 } else if (IsThumb2) { 7886 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 7887 .addReg(Data).addReg(AddrIn).addImm(StSize)); 7888 } else { // arm 7889 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 7890 .addReg(Data).addReg(AddrIn).addReg(0) 7891 .addImm(StSize)); 7892 } 7893 } 7894 7895 MachineBasicBlock * 7896 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 7897 MachineBasicBlock *BB) const { 7898 // This pseudo instruction has 3 operands: dst, src, size 7899 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 7900 // Otherwise, we will generate unrolled scalar copies. 
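  // (The MI also carries an alignment immediate as operand 3; together with
  // NEON availability it selects the copy unit size of 1, 2, 4, 8, or 16
  // bytes below.)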
7901 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7902 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7903 MachineFunction::iterator It = ++BB->getIterator(); 7904 7905 unsigned dest = MI.getOperand(0).getReg(); 7906 unsigned src = MI.getOperand(1).getReg(); 7907 unsigned SizeVal = MI.getOperand(2).getImm(); 7908 unsigned Align = MI.getOperand(3).getImm(); 7909 DebugLoc dl = MI.getDebugLoc(); 7910 7911 MachineFunction *MF = BB->getParent(); 7912 MachineRegisterInfo &MRI = MF->getRegInfo(); 7913 unsigned UnitSize = 0; 7914 const TargetRegisterClass *TRC = nullptr; 7915 const TargetRegisterClass *VecTRC = nullptr; 7916 7917 bool IsThumb1 = Subtarget->isThumb1Only(); 7918 bool IsThumb2 = Subtarget->isThumb2(); 7919 7920 if (Align & 1) { 7921 UnitSize = 1; 7922 } else if (Align & 2) { 7923 UnitSize = 2; 7924 } else { 7925 // Check whether we can use NEON instructions. 7926 if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && 7927 Subtarget->hasNEON()) { 7928 if ((Align % 16 == 0) && SizeVal >= 16) 7929 UnitSize = 16; 7930 else if ((Align % 8 == 0) && SizeVal >= 8) 7931 UnitSize = 8; 7932 } 7933 // Can't use NEON instructions. 7934 if (UnitSize == 0) 7935 UnitSize = 4; 7936 } 7937 7938 // Select the correct opcode and register class for unit size load/store 7939 bool IsNeon = UnitSize >= 8; 7940 TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 7941 if (IsNeon) 7942 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 7943 : UnitSize == 8 ? &ARM::DPRRegClass 7944 : nullptr; 7945 7946 unsigned BytesLeft = SizeVal % UnitSize; 7947 unsigned LoopSize = SizeVal - BytesLeft; 7948 7949 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 7950 // Use LDR and STR to copy. 7951 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 7952 // [destOut] = STR_POST(scratch, destIn, UnitSize) 7953 unsigned srcIn = src; 7954 unsigned destIn = dest; 7955 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 7956 unsigned srcOut = MRI.createVirtualRegister(TRC); 7957 unsigned destOut = MRI.createVirtualRegister(TRC); 7958 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 7959 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 7960 IsThumb1, IsThumb2); 7961 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 7962 IsThumb1, IsThumb2); 7963 srcIn = srcOut; 7964 destIn = destOut; 7965 } 7966 7967 // Handle the leftover bytes with LDRB and STRB. 7968 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 7969 // [destOut] = STRB_POST(scratch, destIn, 1) 7970 for (unsigned i = 0; i < BytesLeft; i++) { 7971 unsigned srcOut = MRI.createVirtualRegister(TRC); 7972 unsigned destOut = MRI.createVirtualRegister(TRC); 7973 unsigned scratch = MRI.createVirtualRegister(TRC); 7974 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 7975 IsThumb1, IsThumb2); 7976 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 7977 IsThumb1, IsThumb2); 7978 srcIn = srcOut; 7979 destIn = destOut; 7980 } 7981 MI.eraseFromParent(); // The instruction is gone now. 7982 return BB; 7983 } 7984 7985 // Expand the pseudo op to a loop. 7986 // thisMBB: 7987 // ... 
7988 // movw varEnd, # --> with thumb2 7989 // movt varEnd, # 7990 // ldrcp varEnd, idx --> without thumb2 7991 // fallthrough --> loopMBB 7992 // loopMBB: 7993 // PHI varPhi, varEnd, varLoop 7994 // PHI srcPhi, src, srcLoop 7995 // PHI destPhi, dst, destLoop 7996 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 7997 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 7998 // subs varLoop, varPhi, #UnitSize 7999 // bne loopMBB 8000 // fallthrough --> exitMBB 8001 // exitMBB: 8002 // epilogue to handle left-over bytes 8003 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 8004 // [destOut] = STRB_POST(scratch, destLoop, 1) 8005 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 8006 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 8007 MF->insert(It, loopMBB); 8008 MF->insert(It, exitMBB); 8009 8010 // Transfer the remainder of BB and its successor edges to exitMBB. 8011 exitMBB->splice(exitMBB->begin(), BB, 8012 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8013 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 8014 8015 // Load an immediate to varEnd. 8016 unsigned varEnd = MRI.createVirtualRegister(TRC); 8017 if (Subtarget->useMovt(*MF)) { 8018 unsigned Vtmp = varEnd; 8019 if ((LoopSize & 0xFFFF0000) != 0) 8020 Vtmp = MRI.createVirtualRegister(TRC); 8021 AddDefaultPred(BuildMI(BB, dl, 8022 TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16), 8023 Vtmp).addImm(LoopSize & 0xFFFF)); 8024 8025 if ((LoopSize & 0xFFFF0000) != 0) 8026 AddDefaultPred(BuildMI(BB, dl, 8027 TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16), 8028 varEnd) 8029 .addReg(Vtmp) 8030 .addImm(LoopSize >> 16)); 8031 } else { 8032 MachineConstantPool *ConstantPool = MF->getConstantPool(); 8033 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 8034 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 8035 8036 // MachineConstantPool wants an explicit alignment. 
8037 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 8038 if (Align == 0) 8039 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 8040 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 8041 8042 if (IsThumb1) 8043 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg( 8044 varEnd, RegState::Define).addConstantPoolIndex(Idx)); 8045 else 8046 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg( 8047 varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0)); 8048 } 8049 BB->addSuccessor(loopMBB); 8050 8051 // Generate the loop body: 8052 // varPhi = PHI(varLoop, varEnd) 8053 // srcPhi = PHI(srcLoop, src) 8054 // destPhi = PHI(destLoop, dst) 8055 MachineBasicBlock *entryBB = BB; 8056 BB = loopMBB; 8057 unsigned varLoop = MRI.createVirtualRegister(TRC); 8058 unsigned varPhi = MRI.createVirtualRegister(TRC); 8059 unsigned srcLoop = MRI.createVirtualRegister(TRC); 8060 unsigned srcPhi = MRI.createVirtualRegister(TRC); 8061 unsigned destLoop = MRI.createVirtualRegister(TRC); 8062 unsigned destPhi = MRI.createVirtualRegister(TRC); 8063 8064 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 8065 .addReg(varLoop).addMBB(loopMBB) 8066 .addReg(varEnd).addMBB(entryBB); 8067 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 8068 .addReg(srcLoop).addMBB(loopMBB) 8069 .addReg(src).addMBB(entryBB); 8070 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 8071 .addReg(destLoop).addMBB(loopMBB) 8072 .addReg(dest).addMBB(entryBB); 8073 8074 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 8075 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 8076 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 8077 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 8078 IsThumb1, IsThumb2); 8079 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 8080 IsThumb1, IsThumb2); 8081 8082 // Decrement loop variable by UnitSize. 8083 if (IsThumb1) { 8084 MachineInstrBuilder MIB = 8085 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop); 8086 MIB = AddDefaultT1CC(MIB); 8087 MIB.addReg(varPhi).addImm(UnitSize); 8088 AddDefaultPred(MIB); 8089 } else { 8090 MachineInstrBuilder MIB = 8091 BuildMI(*BB, BB->end(), dl, 8092 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 8093 AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); 8094 MIB->getOperand(5).setReg(ARM::CPSR); 8095 MIB->getOperand(5).setIsDef(true); 8096 } 8097 BuildMI(*BB, BB->end(), dl, 8098 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 8099 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 8100 8101 // loopMBB can loop back to loopMBB or fall through to exitMBB. 8102 BB->addSuccessor(loopMBB); 8103 BB->addSuccessor(exitMBB); 8104 8105 // Add epilogue to handle BytesLeft. 
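  // For example, a 19-byte copy with UnitSize == 4 runs the loop above for 16
  // bytes and leaves BytesLeft == 3, so the epilogue below emits three
  // byte-sized LDRB_POST/STRB_POST pairs.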
8106   BB = exitMBB;
8107   auto StartOfExit = exitMBB->begin();
8108 
8109   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
8110   //   [destOut] = STRB_POST(scratch, destLoop, 1)
8111   unsigned srcIn = srcLoop;
8112   unsigned destIn = destLoop;
8113   for (unsigned i = 0; i < BytesLeft; i++) {
8114     unsigned srcOut = MRI.createVirtualRegister(TRC);
8115     unsigned destOut = MRI.createVirtualRegister(TRC);
8116     unsigned scratch = MRI.createVirtualRegister(TRC);
8117     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
8118                IsThumb1, IsThumb2);
8119     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
8120                IsThumb1, IsThumb2);
8121     srcIn = srcOut;
8122     destIn = destOut;
8123   }
8124 
8125   MI.eraseFromParent(); // The instruction is gone now.
8126   return BB;
8127 }
8128 
8129 MachineBasicBlock *
8130 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
8131                                        MachineBasicBlock *MBB) const {
8132   const TargetMachine &TM = getTargetMachine();
8133   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
8134   DebugLoc DL = MI.getDebugLoc();
8135 
8136   assert(Subtarget->isTargetWindows() &&
8137          "__chkstk is only supported on Windows");
8138   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
8139 
8140   // __chkstk takes the number of words to allocate on the stack in R4, and
8141   // returns the stack adjustment in number of bytes in R4. This will not
8142   // clobber any other registers (other than the obvious lr).
8143   //
8144   // Although, technically, IP should be considered a register which may be
8145   // clobbered, the call itself will not touch it. Windows on ARM is a pure
8146   // Thumb-2 environment, so there is no interworking required. As a result, we
8147   // do not expect a veneer to be emitted by the linker, clobbering IP.
8148   //
8149   // Each module receives its own copy of __chkstk, so no import thunk is
8150   // required, again, ensuring that IP is not clobbered.
8151   //
8152   // Finally, although some linkers may theoretically provide a trampoline for
8153   // out-of-range calls (which is quite common due to the 32M range limitation
8154   // of Thumb branches), we can generate the long-call version via
8155   // -mcmodel=large, alleviating the need for the trampoline which may clobber
8156   // IP.
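  //
  // Illustrative use of the R4 convention (the values are an example only):
  // to extend the stack by 4096 bytes, the lowering is assumed to have placed
  // 4096 / 4 == 1024 in R4; __chkstk then returns 4096 in R4, and the t2SUBrr
  // emitted after the switch below performs SP = SP - R4.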
8157 8158 switch (TM.getCodeModel()) { 8159 case CodeModel::Small: 8160 case CodeModel::Medium: 8161 case CodeModel::Default: 8162 case CodeModel::Kernel: 8163 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 8164 .addImm((unsigned)ARMCC::AL).addReg(0) 8165 .addExternalSymbol("__chkstk") 8166 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 8167 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 8168 .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); 8169 break; 8170 case CodeModel::Large: 8171 case CodeModel::JITDefault: { 8172 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 8173 unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 8174 8175 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 8176 .addExternalSymbol("__chkstk"); 8177 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 8178 .addImm((unsigned)ARMCC::AL).addReg(0) 8179 .addReg(Reg, RegState::Kill) 8180 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 8181 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 8182 .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); 8183 break; 8184 } 8185 } 8186 8187 AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), 8188 ARM::SP) 8189 .addReg(ARM::SP, RegState::Kill) 8190 .addReg(ARM::R4, RegState::Kill) 8191 .setMIFlags(MachineInstr::FrameSetup))); 8192 8193 MI.eraseFromParent(); 8194 return MBB; 8195 } 8196 8197 MachineBasicBlock * 8198 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 8199 MachineBasicBlock *MBB) const { 8200 DebugLoc DL = MI.getDebugLoc(); 8201 MachineFunction *MF = MBB->getParent(); 8202 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8203 8204 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 8205 MF->insert(++MBB->getIterator(), ContBB); 8206 ContBB->splice(ContBB->begin(), MBB, 8207 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 8208 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 8209 8210 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 8211 MF->push_back(TrapBB); 8212 BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249); 8213 MBB->addSuccessor(TrapBB); 8214 8215 BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) 8216 .addReg(MI.getOperand(0).getReg()) 8217 .addMBB(TrapBB); 8218 AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::t2B)).addMBB(ContBB)); 8219 MBB->addSuccessor(ContBB); 8220 8221 MI.eraseFromParent(); 8222 return ContBB; 8223 } 8224 8225 MachineBasicBlock * 8226 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 8227 MachineBasicBlock *BB) const { 8228 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8229 DebugLoc dl = MI.getDebugLoc(); 8230 bool isThumb2 = Subtarget->isThumb2(); 8231 switch (MI.getOpcode()) { 8232 default: { 8233 MI.dump(); 8234 llvm_unreachable("Unexpected instr type to insert"); 8235 } 8236 // The Thumb2 pre-indexed stores have the same MI operands, they just 8237 // define them differently in the .td files from the isel patterns, so 8238 // they need pseudos. 8239 case ARM::t2STR_preidx: 8240 MI.setDesc(TII->get(ARM::t2STR_PRE)); 8241 return BB; 8242 case ARM::t2STRB_preidx: 8243 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 8244 return BB; 8245 case ARM::t2STRH_preidx: 8246 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 8247 return BB; 8248 8249 case ARM::STRi_preidx: 8250 case ARM::STRBi_preidx: { 8251 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 8252 : ARM::STRB_PRE_IMM; 8253 // Decode the offset. 
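    // The addrmode2 immediate packs the add/sub flag together with the offset;
    // e.g. (illustrative) an encoded "sub, #8" decodes to Offset == 8 with
    // isSub == true, which is then negated for the *_PRE_IMM form.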
8254 unsigned Offset = MI.getOperand(4).getImm(); 8255 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 8256 Offset = ARM_AM::getAM2Offset(Offset); 8257 if (isSub) 8258 Offset = -Offset; 8259 8260 MachineMemOperand *MMO = *MI.memoperands_begin(); 8261 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 8262 .addOperand(MI.getOperand(0)) // Rn_wb 8263 .addOperand(MI.getOperand(1)) // Rt 8264 .addOperand(MI.getOperand(2)) // Rn 8265 .addImm(Offset) // offset (skip GPR==zero_reg) 8266 .addOperand(MI.getOperand(5)) // pred 8267 .addOperand(MI.getOperand(6)) 8268 .addMemOperand(MMO); 8269 MI.eraseFromParent(); 8270 return BB; 8271 } 8272 case ARM::STRr_preidx: 8273 case ARM::STRBr_preidx: 8274 case ARM::STRH_preidx: { 8275 unsigned NewOpc; 8276 switch (MI.getOpcode()) { 8277 default: llvm_unreachable("unexpected opcode!"); 8278 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 8279 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 8280 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 8281 } 8282 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 8283 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 8284 MIB.addOperand(MI.getOperand(i)); 8285 MI.eraseFromParent(); 8286 return BB; 8287 } 8288 8289 case ARM::tMOVCCr_pseudo: { 8290 // To "insert" a SELECT_CC instruction, we actually have to insert the 8291 // diamond control-flow pattern. The incoming instruction knows the 8292 // destination vreg to set, the condition code register to branch on, the 8293 // true/false values to select between, and a branch opcode to use. 8294 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8295 MachineFunction::iterator It = ++BB->getIterator(); 8296 8297 // thisMBB: 8298 // ... 8299 // TrueVal = ... 8300 // cmpTY ccX, r1, r2 8301 // bCC copy1MBB 8302 // fallthrough --> copy0MBB 8303 MachineBasicBlock *thisMBB = BB; 8304 MachineFunction *F = BB->getParent(); 8305 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8306 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8307 F->insert(It, copy0MBB); 8308 F->insert(It, sinkMBB); 8309 8310 // Transfer the remainder of BB and its successor edges to sinkMBB. 8311 sinkMBB->splice(sinkMBB->begin(), BB, 8312 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8313 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 8314 8315 BB->addSuccessor(copy0MBB); 8316 BB->addSuccessor(sinkMBB); 8317 8318 BuildMI(BB, dl, TII->get(ARM::tBcc)) 8319 .addMBB(sinkMBB) 8320 .addImm(MI.getOperand(3).getImm()) 8321 .addReg(MI.getOperand(4).getReg()); 8322 8323 // copy0MBB: 8324 // %FalseValue = ... 8325 // # fallthrough to sinkMBB 8326 BB = copy0MBB; 8327 8328 // Update machine-CFG edges 8329 BB->addSuccessor(sinkMBB); 8330 8331 // sinkMBB: 8332 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8333 // ... 8334 BB = sinkMBB; 8335 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 8336 .addReg(MI.getOperand(1).getReg()) 8337 .addMBB(copy0MBB) 8338 .addReg(MI.getOperand(2).getReg()) 8339 .addMBB(thisMBB); 8340 8341 MI.eraseFromParent(); // The pseudo instruction is gone now. 8342 return BB; 8343 } 8344 8345 case ARM::BCCi64: 8346 case ARM::BCCZi64: { 8347 // If there is an unconditional branch to the other successor, remove it. 8348 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8349 8350 // Compare both parts that make up the double comparison separately for 8351 // equality. 
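    // For a 64-bit equality such as (a == b), this emits CMP a_lo, b_lo
    // followed by a CMP a_hi, b_hi (against zero for BCCZi64) that is
    // predicated on EQ, so CPSR ends up EQ only when both halves compare
    // equal.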
8352 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 8353 8354 unsigned LHS1 = MI.getOperand(1).getReg(); 8355 unsigned LHS2 = MI.getOperand(2).getReg(); 8356 if (RHSisZero) { 8357 AddDefaultPred(BuildMI(BB, dl, 8358 TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 8359 .addReg(LHS1).addImm(0)); 8360 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 8361 .addReg(LHS2).addImm(0) 8362 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 8363 } else { 8364 unsigned RHS1 = MI.getOperand(3).getReg(); 8365 unsigned RHS2 = MI.getOperand(4).getReg(); 8366 AddDefaultPred(BuildMI(BB, dl, 8367 TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 8368 .addReg(LHS1).addReg(RHS1)); 8369 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 8370 .addReg(LHS2).addReg(RHS2) 8371 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 8372 } 8373 8374 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 8375 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 8376 if (MI.getOperand(0).getImm() == ARMCC::NE) 8377 std::swap(destMBB, exitMBB); 8378 8379 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 8380 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 8381 if (isThumb2) 8382 AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB)); 8383 else 8384 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 8385 8386 MI.eraseFromParent(); // The pseudo instruction is gone now. 8387 return BB; 8388 } 8389 8390 case ARM::Int_eh_sjlj_setjmp: 8391 case ARM::Int_eh_sjlj_setjmp_nofp: 8392 case ARM::tInt_eh_sjlj_setjmp: 8393 case ARM::t2Int_eh_sjlj_setjmp: 8394 case ARM::t2Int_eh_sjlj_setjmp_nofp: 8395 return BB; 8396 8397 case ARM::Int_eh_sjlj_setup_dispatch: 8398 EmitSjLjDispatchBlock(MI, BB); 8399 return BB; 8400 8401 case ARM::ABS: 8402 case ARM::t2ABS: { 8403 // To insert an ABS instruction, we have to insert the 8404 // diamond control-flow pattern. The incoming instruction knows the 8405 // source vreg to test against 0, the destination vreg to set, 8406 // the condition code register to branch on, the 8407 // true/false values to select between, and a branch opcode to use. 8408 // It transforms 8409 // V1 = ABS V0 8410 // into 8411 // V2 = MOVS V0 8412 // BCC (branch to SinkBB if V0 >= 0) 8413 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 8414 // SinkBB: V1 = PHI(V2, V3) 8415 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8416 MachineFunction::iterator BBI = ++BB->getIterator(); 8417 MachineFunction *Fn = BB->getParent(); 8418 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 8419 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 8420 Fn->insert(BBI, RSBBB); 8421 Fn->insert(BBI, SinkBB); 8422 8423 unsigned int ABSSrcReg = MI.getOperand(1).getReg(); 8424 unsigned int ABSDstReg = MI.getOperand(0).getReg(); 8425 bool ABSSrcKIll = MI.getOperand(1).isKill(); 8426 bool isThumb2 = Subtarget->isThumb2(); 8427 MachineRegisterInfo &MRI = Fn->getRegInfo(); 8428 // In Thumb mode S must not be specified if source register is the SP or 8429 // PC and if destination register is the SP, so restrict register class 8430 unsigned NewRsbDstReg = 8431 MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 8432 8433 // Transfer the remainder of BB and its successor edges to sinkMBB. 
8434 SinkBB->splice(SinkBB->begin(), BB, 8435 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8436 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 8437 8438 BB->addSuccessor(RSBBB); 8439 BB->addSuccessor(SinkBB); 8440 8441 // fall through to SinkMBB 8442 RSBBB->addSuccessor(SinkBB); 8443 8444 // insert a cmp at the end of BB 8445 AddDefaultPred(BuildMI(BB, dl, 8446 TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 8447 .addReg(ABSSrcReg).addImm(0)); 8448 8449 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 8450 BuildMI(BB, dl, 8451 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 8452 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 8453 8454 // insert rsbri in RSBBB 8455 // Note: BCC and rsbri will be converted into predicated rsbmi 8456 // by if-conversion pass 8457 BuildMI(*RSBBB, RSBBB->begin(), dl, 8458 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 8459 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 8460 .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); 8461 8462 // insert PHI in SinkBB, 8463 // reuse ABSDstReg to not change uses of ABS instruction 8464 BuildMI(*SinkBB, SinkBB->begin(), dl, 8465 TII->get(ARM::PHI), ABSDstReg) 8466 .addReg(NewRsbDstReg).addMBB(RSBBB) 8467 .addReg(ABSSrcReg).addMBB(BB); 8468 8469 // remove ABS instruction 8470 MI.eraseFromParent(); 8471 8472 // return last added BB 8473 return SinkBB; 8474 } 8475 case ARM::COPY_STRUCT_BYVAL_I32: 8476 ++NumLoopByVals; 8477 return EmitStructByval(MI, BB); 8478 case ARM::WIN__CHKSTK: 8479 return EmitLowered__chkstk(MI, BB); 8480 case ARM::WIN__DBZCHK: 8481 return EmitLowered__dbzchk(MI, BB); 8482 } 8483 } 8484 8485 /// \brief Attaches vregs to MEMCPY that it will use as scratch registers 8486 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 8487 /// instead of as a custom inserter because we need the use list from the SDNode. 8488 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 8489 MachineInstr &MI, const SDNode *Node) { 8490 bool isThumb1 = Subtarget->isThumb1Only(); 8491 8492 DebugLoc DL = MI.getDebugLoc(); 8493 MachineFunction *MF = MI.getParent()->getParent(); 8494 MachineRegisterInfo &MRI = MF->getRegInfo(); 8495 MachineInstrBuilder MIB(*MF, MI); 8496 8497 // If the new dst/src is unused mark it as dead. 8498 if (!Node->hasAnyUseOfValue(0)) { 8499 MI.getOperand(0).setIsDead(true); 8500 } 8501 if (!Node->hasAnyUseOfValue(1)) { 8502 MI.getOperand(1).setIsDead(true); 8503 } 8504 8505 // The MEMCPY both defines and kills the scratch registers. 8506 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 8507 unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 8508 : &ARM::GPRRegClass); 8509 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 8510 } 8511 } 8512 8513 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 8514 SDNode *Node) const { 8515 if (MI.getOpcode() == ARM::MEMCPY) { 8516 attachMEMCPYScratchRegs(Subtarget, MI, Node); 8517 return; 8518 } 8519 8520 const MCInstrDesc *MCID = &MI.getDesc(); 8521 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 8522 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 8523 // operand is still set to noreg. If needed, set the optional operand's 8524 // register to CPSR, and remove the redundant implicit def. 8525 // 8526 // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>). 8527 8528 // Rename pseudo opcodes. 
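  // For instance, a flag-setting pseudo such as ARM::ADDSri is assumed to be
  // renamed to ARM::ADDri here; the cc_out operand appended below is what
  // later carries the CPSR def.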
8529 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 8530 if (NewOpc) { 8531 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 8532 MCID = &TII->get(NewOpc); 8533 8534 assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 && 8535 "converted opcode should be the same except for cc_out"); 8536 8537 MI.setDesc(*MCID); 8538 8539 // Add the optional cc_out operand 8540 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 8541 } 8542 unsigned ccOutIdx = MCID->getNumOperands() - 1; 8543 8544 // Any ARM instruction that sets the 's' bit should specify an optional 8545 // "cc_out" operand in the last operand position. 8546 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 8547 assert(!NewOpc && "Optional cc_out operand required"); 8548 return; 8549 } 8550 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 8551 // since we already have an optional CPSR def. 8552 bool definesCPSR = false; 8553 bool deadCPSR = false; 8554 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 8555 ++i) { 8556 const MachineOperand &MO = MI.getOperand(i); 8557 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 8558 definesCPSR = true; 8559 if (MO.isDead()) 8560 deadCPSR = true; 8561 MI.RemoveOperand(i); 8562 break; 8563 } 8564 } 8565 if (!definesCPSR) { 8566 assert(!NewOpc && "Optional cc_out operand required"); 8567 return; 8568 } 8569 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 8570 if (deadCPSR) { 8571 assert(!MI.getOperand(ccOutIdx).getReg() && 8572 "expect uninitialized optional cc_out operand"); 8573 return; 8574 } 8575 8576 // If this instruction was defined with an optional CPSR def and its dag node 8577 // had a live implicit CPSR def, then activate the optional CPSR def. 8578 MachineOperand &MO = MI.getOperand(ccOutIdx); 8579 MO.setReg(ARM::CPSR); 8580 MO.setIsDef(true); 8581 } 8582 8583 //===----------------------------------------------------------------------===// 8584 // ARM Optimization Hooks 8585 //===----------------------------------------------------------------------===// 8586 8587 // Helper function that checks if N is a null or all ones constant. 8588 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 8589 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 8590 } 8591 8592 // Return true if N is conditionally 0 or all ones. 8593 // Detects these expressions where cc is an i1 value: 8594 // 8595 // (select cc 0, y) [AllOnes=0] 8596 // (select cc y, 0) [AllOnes=0] 8597 // (zext cc) [AllOnes=0] 8598 // (sext cc) [AllOnes=0/1] 8599 // (select cc -1, y) [AllOnes=1] 8600 // (select cc y, -1) [AllOnes=1] 8601 // 8602 // Invert is set when N is the null/all ones constant when CC is false. 8603 // OtherOp is set to the alternative value of N. 8604 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 8605 SDValue &CC, bool &Invert, 8606 SDValue &OtherOp, 8607 SelectionDAG &DAG) { 8608 switch (N->getOpcode()) { 8609 default: return false; 8610 case ISD::SELECT: { 8611 CC = N->getOperand(0); 8612 SDValue N1 = N->getOperand(1); 8613 SDValue N2 = N->getOperand(2); 8614 if (isZeroOrAllOnes(N1, AllOnes)) { 8615 Invert = false; 8616 OtherOp = N2; 8617 return true; 8618 } 8619 if (isZeroOrAllOnes(N2, AllOnes)) { 8620 Invert = true; 8621 OtherOp = N1; 8622 return true; 8623 } 8624 return false; 8625 } 8626 case ISD::ZERO_EXTEND: 8627 // (zext cc) can never be the all ones value. 8628 if (AllOnes) 8629 return false; 8630 // Fall through. 
8631 case ISD::SIGN_EXTEND: { 8632 SDLoc dl(N); 8633 EVT VT = N->getValueType(0); 8634 CC = N->getOperand(0); 8635 if (CC.getValueType() != MVT::i1) 8636 return false; 8637 Invert = !AllOnes; 8638 if (AllOnes) 8639 // When looking for an AllOnes constant, N is an sext, and the 'other' 8640 // value is 0. 8641 OtherOp = DAG.getConstant(0, dl, VT); 8642 else if (N->getOpcode() == ISD::ZERO_EXTEND) 8643 // When looking for a 0 constant, N can be zext or sext. 8644 OtherOp = DAG.getConstant(1, dl, VT); 8645 else 8646 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 8647 VT); 8648 return true; 8649 } 8650 } 8651 } 8652 8653 // Combine a constant select operand into its use: 8654 // 8655 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 8656 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 8657 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 8658 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 8659 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 8660 // 8661 // The transform is rejected if the select doesn't have a constant operand that 8662 // is null, or all ones when AllOnes is set. 8663 // 8664 // Also recognize sext/zext from i1: 8665 // 8666 // (add (zext cc), x) -> (select cc (add x, 1), x) 8667 // (add (sext cc), x) -> (select cc (add x, -1), x) 8668 // 8669 // These transformations eventually create predicated instructions. 8670 // 8671 // @param N The node to transform. 8672 // @param Slct The N operand that is a select. 8673 // @param OtherOp The other N operand (x above). 8674 // @param DCI Context. 8675 // @param AllOnes Require the select constant to be all ones instead of null. 8676 // @returns The new node, or SDValue() on failure. 8677 static 8678 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 8679 TargetLowering::DAGCombinerInfo &DCI, 8680 bool AllOnes = false) { 8681 SelectionDAG &DAG = DCI.DAG; 8682 EVT VT = N->getValueType(0); 8683 SDValue NonConstantVal; 8684 SDValue CCOp; 8685 bool SwapSelectOps; 8686 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 8687 NonConstantVal, DAG)) 8688 return SDValue(); 8689 8690 // Slct is now know to be the desired identity constant when CC is true. 8691 SDValue TrueVal = OtherOp; 8692 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 8693 OtherOp, NonConstantVal); 8694 // Unless SwapSelectOps says CC should be false. 8695 if (SwapSelectOps) 8696 std::swap(TrueVal, FalseVal); 8697 8698 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 8699 CCOp, TrueVal, FalseVal); 8700 } 8701 8702 // Attempt combineSelectAndUse on each operand of a commutative operator N. 8703 static 8704 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 8705 TargetLowering::DAGCombinerInfo &DCI) { 8706 SDValue N0 = N->getOperand(0); 8707 SDValue N1 = N->getOperand(1); 8708 if (N0.getNode()->hasOneUse()) 8709 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 8710 return Result; 8711 if (N1.getNode()->hasOneUse()) 8712 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 8713 return Result; 8714 return SDValue(); 8715 } 8716 8717 // AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction 8718 // (only after legalization). 8719 static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, 8720 TargetLowering::DAGCombinerInfo &DCI, 8721 const ARMSubtarget *Subtarget) { 8722 8723 // Only perform optimization if after legalize, and if NEON is available. 
We
8724 // also expect both operands to be BUILD_VECTORs.
8725 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
8726 || N0.getOpcode() != ISD::BUILD_VECTOR
8727 || N1.getOpcode() != ISD::BUILD_VECTOR)
8728 return SDValue();
8729
8730 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
8731 EVT VT = N->getValueType(0);
8732 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
8733 return SDValue();
8734
8735 // Check that the vector operands are of the right form.
8736 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
8737 // operands, where N is the size of the formed vector.
8738 // Each EXTRACT_VECTOR should have the same input vector and an odd or even
8739 // index such that we have a pair-wise add pattern.
8740
8741 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
8742 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8743 return SDValue();
8744 SDValue Vec = N0->getOperand(0)->getOperand(0);
8745 SDNode *V = Vec.getNode();
8746 unsigned nextIndex = 0;
8747
8748 // For each of the operands to the ADD that are BUILD_VECTORs,
8749 // check that each of their operands is an EXTRACT_VECTOR with
8750 // the same vector and the appropriate index.
8751 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
8752 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
8753 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
8754
8755 SDValue ExtVec0 = N0->getOperand(i);
8756 SDValue ExtVec1 = N1->getOperand(i);
8757
8758 // First operand is the vector; verify it's the same.
8759 if (V != ExtVec0->getOperand(0).getNode() ||
8760 V != ExtVec1->getOperand(0).getNode())
8761 return SDValue();
8762
8763 // Second is the constant; verify it's correct.
8764 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
8765 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
8766
8767 // For the constants, we want to see all the even or all the odd indices.
8768 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
8769 || C1->getZExtValue() != nextIndex+1)
8770 return SDValue();
8771
8772 // Increment index.
8773 nextIndex+=2;
8774 } else
8775 return SDValue();
8776 }
8777
8778 // Create VPADDL node.
8779 SelectionDAG &DAG = DCI.DAG;
8780 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8781
8782 SDLoc dl(N);
8783
8784 // Build operand list.
8785 SmallVector<SDValue, 8> Ops;
8786 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
8787 TLI.getPointerTy(DAG.getDataLayout())));
8788
8789 // Input is the vector.
8790 Ops.push_back(Vec);
8791
8792 // Get widened type and narrowed type.
8793 MVT widenType;
8794 unsigned numElem = VT.getVectorNumElements();
8795
8796 EVT inputLaneType = Vec.getValueType().getVectorElementType();
8797 switch (inputLaneType.getSimpleVT().SimpleTy) {
8798 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
8799 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
8800 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
8801 default:
8802 llvm_unreachable("Invalid vector element type for padd optimization.");
8803 }
8804
8805 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
8806 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ?
ISD::ANY_EXTEND : ISD::TRUNCATE; 8807 return DAG.getNode(ExtOp, dl, VT, tmp); 8808 } 8809 8810 static SDValue findMUL_LOHI(SDValue V) { 8811 if (V->getOpcode() == ISD::UMUL_LOHI || 8812 V->getOpcode() == ISD::SMUL_LOHI) 8813 return V; 8814 return SDValue(); 8815 } 8816 8817 static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, 8818 TargetLowering::DAGCombinerInfo &DCI, 8819 const ARMSubtarget *Subtarget) { 8820 8821 // Look for multiply add opportunities. 8822 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 8823 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 8824 // a glue link from the first add to the second add. 8825 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 8826 // a S/UMLAL instruction. 8827 // UMUL_LOHI 8828 // / :lo \ :hi 8829 // / \ [no multiline comment] 8830 // loAdd -> ADDE | 8831 // \ :glue / 8832 // \ / 8833 // ADDC <- hiAdd 8834 // 8835 assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC"); 8836 SDValue AddcOp0 = AddcNode->getOperand(0); 8837 SDValue AddcOp1 = AddcNode->getOperand(1); 8838 8839 // Check if the two operands are from the same mul_lohi node. 8840 if (AddcOp0.getNode() == AddcOp1.getNode()) 8841 return SDValue(); 8842 8843 assert(AddcNode->getNumValues() == 2 && 8844 AddcNode->getValueType(0) == MVT::i32 && 8845 "Expect ADDC with two result values. First: i32"); 8846 8847 // Check that we have a glued ADDC node. 8848 if (AddcNode->getValueType(1) != MVT::Glue) 8849 return SDValue(); 8850 8851 // Check that the ADDC adds the low result of the S/UMUL_LOHI. 8852 if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && 8853 AddcOp0->getOpcode() != ISD::SMUL_LOHI && 8854 AddcOp1->getOpcode() != ISD::UMUL_LOHI && 8855 AddcOp1->getOpcode() != ISD::SMUL_LOHI) 8856 return SDValue(); 8857 8858 // Look for the glued ADDE. 8859 SDNode* AddeNode = AddcNode->getGluedUser(); 8860 if (!AddeNode) 8861 return SDValue(); 8862 8863 // Make sure it is really an ADDE. 8864 if (AddeNode->getOpcode() != ISD::ADDE) 8865 return SDValue(); 8866 8867 assert(AddeNode->getNumOperands() == 3 && 8868 AddeNode->getOperand(2).getValueType() == MVT::Glue && 8869 "ADDE node has the wrong inputs"); 8870 8871 // Check for the triangle shape. 8872 SDValue AddeOp0 = AddeNode->getOperand(0); 8873 SDValue AddeOp1 = AddeNode->getOperand(1); 8874 8875 // Make sure that the ADDE operands are not coming from the same node. 8876 if (AddeOp0.getNode() == AddeOp1.getNode()) 8877 return SDValue(); 8878 8879 // Find the MUL_LOHI node walking up ADDE's operands. 8880 bool IsLeftOperandMUL = false; 8881 SDValue MULOp = findMUL_LOHI(AddeOp0); 8882 if (MULOp == SDValue()) 8883 MULOp = findMUL_LOHI(AddeOp1); 8884 else 8885 IsLeftOperandMUL = true; 8886 if (MULOp == SDValue()) 8887 return SDValue(); 8888 8889 // Figure out the right opcode. 8890 unsigned Opc = MULOp->getOpcode(); 8891 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 8892 8893 // Figure out the high and low input values to the MLAL node. 8894 SDValue* HiAdd = nullptr; 8895 SDValue* LoMul = nullptr; 8896 SDValue* LowAdd = nullptr; 8897 8898 // Ensure that ADDE is from high result of ISD::SMUL_LOHI. 8899 if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1))) 8900 return SDValue(); 8901 8902 if (IsLeftOperandMUL) 8903 HiAdd = &AddeOp1; 8904 else 8905 HiAdd = &AddeOp0; 8906 8907 8908 // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node 8909 // whose low result is fed to the ADDC we are checking. 
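// As an illustrative sketch of the shape being matched, with
// (Lo, Hi) = umul_lohi(a, b) (the smul_lohi/SMLAL case is analogous):
//   (addc Lo, x)  glued to  (adde Hi, y)
// is rewritten below into (UMLAL a, b, x, y), which produces the same
// lo/hi pair for the 64-bit value a*b + (y:x).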
8910
8911 if (AddcOp0 == MULOp.getValue(0)) {
8912 LoMul = &AddcOp0;
8913 LowAdd = &AddcOp1;
8914 }
8915 if (AddcOp1 == MULOp.getValue(0)) {
8916 LoMul = &AddcOp1;
8917 LowAdd = &AddcOp0;
8918 }
8919
8920 if (!LoMul)
8921 return SDValue();
8922
8923 // Create the merged node.
8924 SelectionDAG &DAG = DCI.DAG;
8925
8926 // Build operand list.
8927 SmallVector<SDValue, 8> Ops;
8928 Ops.push_back(LoMul->getOperand(0));
8929 Ops.push_back(LoMul->getOperand(1));
8930 Ops.push_back(*LowAdd);
8931 Ops.push_back(*HiAdd);
8932
8933 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode),
8934 DAG.getVTList(MVT::i32, MVT::i32), Ops);
8935
8936 // Replace the ADD nodes' uses with the MLAL node's values.
8937 SDValue HiMLALResult(MLALNode.getNode(), 1);
8938 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
8939
8940 SDValue LoMLALResult(MLALNode.getNode(), 0);
8941 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
8942
8943 // Return original node to notify the driver to stop replacing.
8944 SDValue resNode(AddcNode, 0);
8945 return resNode;
8946 }
8947
8948 static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
8949 TargetLowering::DAGCombinerInfo &DCI,
8950 const ARMSubtarget *Subtarget) {
8951 // UMAAL is similar to UMLAL except that it adds two unsigned values.
8952 // While trying to combine for the other MLAL nodes, first search for the
8953 // chance to use UMAAL. Check if AddcNode uses another ADDC node that can
8954 // first be combined into a UMLAL. The other pattern, where AddcNode is itself
8955 // combined into a UMLAL and another ADDC is then added to it, is handled in ISelDAGToDAG.
8956
8957 if (!Subtarget->hasV6Ops())
8958 return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
8959
8960 SDNode *PrevAddc = nullptr;
8961 if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
8962 PrevAddc = AddcNode->getOperand(0).getNode();
8963 else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
8964 PrevAddc = AddcNode->getOperand(1).getNode();
8965
8966 // If there are no ADDC chains, just fall back to searching for any MLAL.
8967 if (PrevAddc == nullptr)
8968 return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
8969
8970 // Try to convert the addc operand to an MLAL and if that fails try to
8971 // combine AddcNode.
8972 SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
8973 if (MLAL != SDValue(PrevAddc, 0))
8974 return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
8975
8976 // Find the converted UMAAL or quit if it doesn't exist.
8977 SDNode *UmlalNode = nullptr;
8978 SDValue AddHi;
8979 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
8980 UmlalNode = AddcNode->getOperand(0).getNode();
8981 AddHi = AddcNode->getOperand(1);
8982 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
8983 UmlalNode = AddcNode->getOperand(1).getNode();
8984 AddHi = AddcNode->getOperand(0);
8985 } else {
8986 return SDValue();
8987 }
8988
8989 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
8990 // the ADDC as well as Zero.
8991 auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
8992
8993 if (!Zero || Zero->getZExtValue() != 0)
8994 return SDValue();
8995
8996 // Check that we have a glued ADDC node.
8997 if (AddcNode->getValueType(1) != MVT::Glue)
8998 return SDValue();
8999
9000 // Look for the glued ADDE.
9001 SDNode* AddeNode = AddcNode->getGluedUser(); 9002 if (!AddeNode) 9003 return SDValue(); 9004 9005 if ((AddeNode->getOperand(0).getNode() == Zero && 9006 AddeNode->getOperand(1).getNode() == UmlalNode) || 9007 (AddeNode->getOperand(0).getNode() == UmlalNode && 9008 AddeNode->getOperand(1).getNode() == Zero)) { 9009 9010 SelectionDAG &DAG = DCI.DAG; 9011 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 9012 UmlalNode->getOperand(2), AddHi }; 9013 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 9014 DAG.getVTList(MVT::i32, MVT::i32), Ops); 9015 9016 // Replace the ADDs' nodes uses by the UMAAL node's values. 9017 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 9018 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 9019 9020 // Return original node to notify the driver to stop replacing. 9021 return SDValue(AddcNode, 0); 9022 } 9023 return SDValue(); 9024 } 9025 9026 /// PerformADDCCombine - Target-specific dag combine transform from 9027 /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or 9028 /// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 9029 static SDValue PerformADDCCombine(SDNode *N, 9030 TargetLowering::DAGCombinerInfo &DCI, 9031 const ARMSubtarget *Subtarget) { 9032 9033 if (Subtarget->isThumb1Only()) return SDValue(); 9034 9035 // Only perform the checks after legalize when the pattern is available. 9036 if (DCI.isBeforeLegalize()) return SDValue(); 9037 9038 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 9039 } 9040 9041 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 9042 /// operands N0 and N1. This is a helper for PerformADDCombine that is 9043 /// called with the default operands, and if that fails, with commuted 9044 /// operands. 9045 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 9046 TargetLowering::DAGCombinerInfo &DCI, 9047 const ARMSubtarget *Subtarget){ 9048 9049 // Attempt to create vpaddl for this add. 9050 if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget)) 9051 return Result; 9052 9053 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 9054 if (N0.getNode()->hasOneUse()) 9055 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 9056 return Result; 9057 return SDValue(); 9058 } 9059 9060 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 9061 /// 9062 static SDValue PerformADDCombine(SDNode *N, 9063 TargetLowering::DAGCombinerInfo &DCI, 9064 const ARMSubtarget *Subtarget) { 9065 SDValue N0 = N->getOperand(0); 9066 SDValue N1 = N->getOperand(1); 9067 9068 // First try with the default operand order. 9069 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 9070 return Result; 9071 9072 // If that didn't work, try again with the operands commuted. 9073 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 9074 } 9075 9076 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 
9077 /// 9078 static SDValue PerformSUBCombine(SDNode *N, 9079 TargetLowering::DAGCombinerInfo &DCI) { 9080 SDValue N0 = N->getOperand(0); 9081 SDValue N1 = N->getOperand(1); 9082 9083 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 9084 if (N1.getNode()->hasOneUse()) 9085 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 9086 return Result; 9087 9088 return SDValue(); 9089 } 9090 9091 /// PerformVMULCombine 9092 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 9093 /// special multiplier accumulator forwarding. 9094 /// vmul d3, d0, d2 9095 /// vmla d3, d1, d2 9096 /// is faster than 9097 /// vadd d3, d0, d1 9098 /// vmul d3, d3, d2 9099 // However, for (A + B) * (A + B), 9100 // vadd d2, d0, d1 9101 // vmul d3, d0, d2 9102 // vmla d3, d1, d2 9103 // is slower than 9104 // vadd d2, d0, d1 9105 // vmul d3, d2, d2 9106 static SDValue PerformVMULCombine(SDNode *N, 9107 TargetLowering::DAGCombinerInfo &DCI, 9108 const ARMSubtarget *Subtarget) { 9109 if (!Subtarget->hasVMLxForwarding()) 9110 return SDValue(); 9111 9112 SelectionDAG &DAG = DCI.DAG; 9113 SDValue N0 = N->getOperand(0); 9114 SDValue N1 = N->getOperand(1); 9115 unsigned Opcode = N0.getOpcode(); 9116 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 9117 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 9118 Opcode = N1.getOpcode(); 9119 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 9120 Opcode != ISD::FADD && Opcode != ISD::FSUB) 9121 return SDValue(); 9122 std::swap(N0, N1); 9123 } 9124 9125 if (N0 == N1) 9126 return SDValue(); 9127 9128 EVT VT = N->getValueType(0); 9129 SDLoc DL(N); 9130 SDValue N00 = N0->getOperand(0); 9131 SDValue N01 = N0->getOperand(1); 9132 return DAG.getNode(Opcode, DL, VT, 9133 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 9134 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 9135 } 9136 9137 static SDValue PerformMULCombine(SDNode *N, 9138 TargetLowering::DAGCombinerInfo &DCI, 9139 const ARMSubtarget *Subtarget) { 9140 SelectionDAG &DAG = DCI.DAG; 9141 9142 if (Subtarget->isThumb1Only()) 9143 return SDValue(); 9144 9145 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9146 return SDValue(); 9147 9148 EVT VT = N->getValueType(0); 9149 if (VT.is64BitVector() || VT.is128BitVector()) 9150 return PerformVMULCombine(N, DCI, Subtarget); 9151 if (VT != MVT::i32) 9152 return SDValue(); 9153 9154 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9155 if (!C) 9156 return SDValue(); 9157 9158 int64_t MulAmt = C->getSExtValue(); 9159 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 9160 9161 ShiftAmt = ShiftAmt & (32 - 1); 9162 SDValue V = N->getOperand(0); 9163 SDLoc DL(N); 9164 9165 SDValue Res; 9166 MulAmt >>= ShiftAmt; 9167 9168 if (MulAmt >= 0) { 9169 if (isPowerOf2_32(MulAmt - 1)) { 9170 // (mul x, 2^N + 1) => (add (shl x, N), x) 9171 Res = DAG.getNode(ISD::ADD, DL, VT, 9172 V, 9173 DAG.getNode(ISD::SHL, DL, VT, 9174 V, 9175 DAG.getConstant(Log2_32(MulAmt - 1), DL, 9176 MVT::i32))); 9177 } else if (isPowerOf2_32(MulAmt + 1)) { 9178 // (mul x, 2^N - 1) => (sub (shl x, N), x) 9179 Res = DAG.getNode(ISD::SUB, DL, VT, 9180 DAG.getNode(ISD::SHL, DL, VT, 9181 V, 9182 DAG.getConstant(Log2_32(MulAmt + 1), DL, 9183 MVT::i32)), 9184 V); 9185 } else 9186 return SDValue(); 9187 } else { 9188 uint64_t MulAmtAbs = -MulAmt; 9189 if (isPowerOf2_32(MulAmtAbs + 1)) { 9190 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 9191 Res = DAG.getNode(ISD::SUB, DL, VT, 9192 V, 9193 DAG.getNode(ISD::SHL, DL, VT, 9194 V, 9195 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 9196 
MVT::i32))); 9197 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 9198 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 9199 Res = DAG.getNode(ISD::ADD, DL, VT, 9200 V, 9201 DAG.getNode(ISD::SHL, DL, VT, 9202 V, 9203 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 9204 MVT::i32))); 9205 Res = DAG.getNode(ISD::SUB, DL, VT, 9206 DAG.getConstant(0, DL, MVT::i32), Res); 9207 9208 } else 9209 return SDValue(); 9210 } 9211 9212 if (ShiftAmt != 0) 9213 Res = DAG.getNode(ISD::SHL, DL, VT, 9214 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 9215 9216 // Do not add new nodes to DAG combiner worklist. 9217 DCI.CombineTo(N, Res, false); 9218 return SDValue(); 9219 } 9220 9221 static SDValue PerformANDCombine(SDNode *N, 9222 TargetLowering::DAGCombinerInfo &DCI, 9223 const ARMSubtarget *Subtarget) { 9224 9225 // Attempt to use immediate-form VBIC 9226 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 9227 SDLoc dl(N); 9228 EVT VT = N->getValueType(0); 9229 SelectionDAG &DAG = DCI.DAG; 9230 9231 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9232 return SDValue(); 9233 9234 APInt SplatBits, SplatUndef; 9235 unsigned SplatBitSize; 9236 bool HasAnyUndefs; 9237 if (BVN && 9238 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 9239 if (SplatBitSize <= 64) { 9240 EVT VbicVT; 9241 SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), 9242 SplatUndef.getZExtValue(), SplatBitSize, 9243 DAG, dl, VbicVT, VT.is128BitVector(), 9244 OtherModImm); 9245 if (Val.getNode()) { 9246 SDValue Input = 9247 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 9248 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 9249 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 9250 } 9251 } 9252 } 9253 9254 if (!Subtarget->isThumb1Only()) { 9255 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 9256 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 9257 return Result; 9258 } 9259 9260 return SDValue(); 9261 } 9262 9263 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 9264 static SDValue PerformORCombine(SDNode *N, 9265 TargetLowering::DAGCombinerInfo &DCI, 9266 const ARMSubtarget *Subtarget) { 9267 // Attempt to use immediate-form VORR 9268 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 9269 SDLoc dl(N); 9270 EVT VT = N->getValueType(0); 9271 SelectionDAG &DAG = DCI.DAG; 9272 9273 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9274 return SDValue(); 9275 9276 APInt SplatBits, SplatUndef; 9277 unsigned SplatBitSize; 9278 bool HasAnyUndefs; 9279 if (BVN && Subtarget->hasNEON() && 9280 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 9281 if (SplatBitSize <= 64) { 9282 EVT VorrVT; 9283 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 9284 SplatUndef.getZExtValue(), SplatBitSize, 9285 DAG, dl, VorrVT, VT.is128BitVector(), 9286 OtherModImm); 9287 if (Val.getNode()) { 9288 SDValue Input = 9289 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 9290 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 9291 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 9292 } 9293 } 9294 } 9295 9296 if (!Subtarget->isThumb1Only()) { 9297 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 9298 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 9299 return Result; 9300 } 9301 9302 // The code below optimizes (or (and X, Y), Z). 9303 // The AND operand needs to have a single user to make these optimizations 9304 // profitable. 
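// As one illustrative case of the BFI rewrite handled further down,
//   (or (and A, 0xffff00ff), 0x00000a00)
// becomes (ARMbfi A, 0x0a, 0xffff00ff), i.e. the constant 0x0a is inserted
// into bits [15:8] of A.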
9305 SDValue N0 = N->getOperand(0); 9306 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 9307 return SDValue(); 9308 SDValue N1 = N->getOperand(1); 9309 9310 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 9311 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 9312 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 9313 APInt SplatUndef; 9314 unsigned SplatBitSize; 9315 bool HasAnyUndefs; 9316 9317 APInt SplatBits0, SplatBits1; 9318 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 9319 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 9320 // Ensure that the second operand of both ands are constants 9321 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 9322 HasAnyUndefs) && !HasAnyUndefs) { 9323 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 9324 HasAnyUndefs) && !HasAnyUndefs) { 9325 // Ensure that the bit width of the constants are the same and that 9326 // the splat arguments are logical inverses as per the pattern we 9327 // are trying to simplify. 9328 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 9329 SplatBits0 == ~SplatBits1) { 9330 // Canonicalize the vector type to make instruction selection 9331 // simpler. 9332 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 9333 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, 9334 N0->getOperand(1), 9335 N0->getOperand(0), 9336 N1->getOperand(0)); 9337 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 9338 } 9339 } 9340 } 9341 } 9342 9343 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 9344 // reasonable. 9345 9346 // BFI is only available on V6T2+ 9347 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) 9348 return SDValue(); 9349 9350 SDLoc DL(N); 9351 // 1) or (and A, mask), val => ARMbfi A, val, mask 9352 // iff (val & mask) == val 9353 // 9354 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 9355 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) 9356 // && mask == ~mask2 9357 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) 9358 // && ~mask == mask2 9359 // (i.e., copy a bitfield value into another bitfield of the same width) 9360 9361 if (VT != MVT::i32) 9362 return SDValue(); 9363 9364 SDValue N00 = N0.getOperand(0); 9365 9366 // The value and the mask need to be constants so we can verify this is 9367 // actually a bitfield set. If the mask is 0xffff, we can do better 9368 // via a movt instruction, so don't use BFI in that case. 9369 SDValue MaskOp = N0.getOperand(1); 9370 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); 9371 if (!MaskC) 9372 return SDValue(); 9373 unsigned Mask = MaskC->getZExtValue(); 9374 if (Mask == 0xffff) 9375 return SDValue(); 9376 SDValue Res; 9377 // Case (1): or (and A, mask), val => ARMbfi A, val, mask 9378 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 9379 if (N1C) { 9380 unsigned Val = N1C->getZExtValue(); 9381 if ((Val & ~Mask) != Val) 9382 return SDValue(); 9383 9384 if (ARM::isBitFieldInvertedMask(Mask)) { 9385 Val >>= countTrailingZeros(~Mask); 9386 9387 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, 9388 DAG.getConstant(Val, DL, MVT::i32), 9389 DAG.getConstant(Mask, DL, MVT::i32)); 9390 9391 // Do not add new nodes to DAG combiner worklist. 
9392 DCI.CombineTo(N, Res, false); 9393 return SDValue(); 9394 } 9395 } else if (N1.getOpcode() == ISD::AND) { 9396 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 9397 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 9398 if (!N11C) 9399 return SDValue(); 9400 unsigned Mask2 = N11C->getZExtValue(); 9401 9402 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern 9403 // as is to match. 9404 if (ARM::isBitFieldInvertedMask(Mask) && 9405 (Mask == ~Mask2)) { 9406 // The pack halfword instruction works better for masks that fit it, 9407 // so use that when it's available. 9408 if (Subtarget->hasT2ExtractPack() && 9409 (Mask == 0xffff || Mask == 0xffff0000)) 9410 return SDValue(); 9411 // 2a 9412 unsigned amt = countTrailingZeros(Mask2); 9413 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), 9414 DAG.getConstant(amt, DL, MVT::i32)); 9415 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, 9416 DAG.getConstant(Mask, DL, MVT::i32)); 9417 // Do not add new nodes to DAG combiner worklist. 9418 DCI.CombineTo(N, Res, false); 9419 return SDValue(); 9420 } else if (ARM::isBitFieldInvertedMask(~Mask) && 9421 (~Mask == Mask2)) { 9422 // The pack halfword instruction works better for masks that fit it, 9423 // so use that when it's available. 9424 if (Subtarget->hasT2ExtractPack() && 9425 (Mask2 == 0xffff || Mask2 == 0xffff0000)) 9426 return SDValue(); 9427 // 2b 9428 unsigned lsb = countTrailingZeros(Mask); 9429 Res = DAG.getNode(ISD::SRL, DL, VT, N00, 9430 DAG.getConstant(lsb, DL, MVT::i32)); 9431 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, 9432 DAG.getConstant(Mask2, DL, MVT::i32)); 9433 // Do not add new nodes to DAG combiner worklist. 9434 DCI.CombineTo(N, Res, false); 9435 return SDValue(); 9436 } 9437 } 9438 9439 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && 9440 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && 9441 ARM::isBitFieldInvertedMask(~Mask)) { 9442 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask 9443 // where lsb(mask) == #shamt and masked bits of B are known zero. 9444 SDValue ShAmt = N00.getOperand(1); 9445 unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 9446 unsigned LSB = countTrailingZeros(Mask); 9447 if (ShAmtC != LSB) 9448 return SDValue(); 9449 9450 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), 9451 DAG.getConstant(~Mask, DL, MVT::i32)); 9452 9453 // Do not add new nodes to DAG combiner worklist. 9454 DCI.CombineTo(N, Res, false); 9455 } 9456 9457 return SDValue(); 9458 } 9459 9460 static SDValue PerformXORCombine(SDNode *N, 9461 TargetLowering::DAGCombinerInfo &DCI, 9462 const ARMSubtarget *Subtarget) { 9463 EVT VT = N->getValueType(0); 9464 SelectionDAG &DAG = DCI.DAG; 9465 9466 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9467 return SDValue(); 9468 9469 if (!Subtarget->isThumb1Only()) { 9470 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 9471 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 9472 return Result; 9473 } 9474 9475 return SDValue(); 9476 } 9477 9478 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 9479 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 9480 // their position in "to" (Rd). 
9481 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 9482 assert(N->getOpcode() == ARMISD::BFI); 9483 9484 SDValue From = N->getOperand(1); 9485 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 9486 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 9487 9488 // If the Base came from a SHR #C, we can deduce that it is really testing bit 9489 // #C in the base of the SHR. 9490 if (From->getOpcode() == ISD::SRL && 9491 isa<ConstantSDNode>(From->getOperand(1))) { 9492 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 9493 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 9494 FromMask <<= Shift.getLimitedValue(31); 9495 From = From->getOperand(0); 9496 } 9497 9498 return From; 9499 } 9500 9501 // If A and B contain one contiguous set of bits, does A | B == A . B? 9502 // 9503 // Neither A nor B must be zero. 9504 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 9505 unsigned LastActiveBitInA = A.countTrailingZeros(); 9506 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 9507 return LastActiveBitInA - 1 == FirstActiveBitInB; 9508 } 9509 9510 static SDValue FindBFIToCombineWith(SDNode *N) { 9511 // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, 9512 // if one exists. 9513 APInt ToMask, FromMask; 9514 SDValue From = ParseBFI(N, ToMask, FromMask); 9515 SDValue To = N->getOperand(0); 9516 9517 // Now check for a compatible BFI to merge with. We can pass through BFIs that 9518 // aren't compatible, but not if they set the same bit in their destination as 9519 // we do (or that of any BFI we're going to combine with). 9520 SDValue V = To; 9521 APInt CombinedToMask = ToMask; 9522 while (V.getOpcode() == ARMISD::BFI) { 9523 APInt NewToMask, NewFromMask; 9524 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 9525 if (NewFrom != From) { 9526 // This BFI has a different base. Keep going. 9527 CombinedToMask |= NewToMask; 9528 V = V.getOperand(0); 9529 continue; 9530 } 9531 9532 // Do the written bits conflict with any we've seen so far? 9533 if ((NewToMask & CombinedToMask).getBoolValue()) 9534 // Conflicting bits - bail out because going further is unsafe. 9535 return SDValue(); 9536 9537 // Are the new bits contiguous when combined with the old bits? 9538 if (BitsProperlyConcatenate(ToMask, NewToMask) && 9539 BitsProperlyConcatenate(FromMask, NewFromMask)) 9540 return V; 9541 if (BitsProperlyConcatenate(NewToMask, ToMask) && 9542 BitsProperlyConcatenate(NewFromMask, FromMask)) 9543 return V; 9544 9545 // We've seen a write to some bits, so track it. 9546 CombinedToMask |= NewToMask; 9547 // Keep going... 9548 V = V.getOperand(0); 9549 } 9550 9551 return SDValue(); 9552 } 9553 9554 static SDValue PerformBFICombine(SDNode *N, 9555 TargetLowering::DAGCombinerInfo &DCI) { 9556 SDValue N1 = N->getOperand(1); 9557 if (N1.getOpcode() == ISD::AND) { 9558 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 9559 // the bits being cleared by the AND are not demanded by the BFI. 
9560 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 9561 if (!N11C) 9562 return SDValue(); 9563 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 9564 unsigned LSB = countTrailingZeros(~InvMask); 9565 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 9566 assert(Width < 9567 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 9568 "undefined behavior"); 9569 unsigned Mask = (1u << Width) - 1; 9570 unsigned Mask2 = N11C->getZExtValue(); 9571 if ((Mask & (~Mask2)) == 0) 9572 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 9573 N->getOperand(0), N1.getOperand(0), 9574 N->getOperand(2)); 9575 } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 9576 // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. 9577 // Keep track of any consecutive bits set that all come from the same base 9578 // value. We can combine these together into a single BFI. 9579 SDValue CombineBFI = FindBFIToCombineWith(N); 9580 if (CombineBFI == SDValue()) 9581 return SDValue(); 9582 9583 // We've found a BFI. 9584 APInt ToMask1, FromMask1; 9585 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 9586 9587 APInt ToMask2, FromMask2; 9588 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 9589 assert(From1 == From2); 9590 (void)From2; 9591 9592 // First, unlink CombineBFI. 9593 DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); 9594 // Then create a new BFI, combining the two together. 9595 APInt NewFromMask = FromMask1 | FromMask2; 9596 APInt NewToMask = ToMask1 | ToMask2; 9597 9598 EVT VT = N->getValueType(0); 9599 SDLoc dl(N); 9600 9601 if (NewFromMask[0] == 0) 9602 From1 = DCI.DAG.getNode( 9603 ISD::SRL, dl, VT, From1, 9604 DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 9605 return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, 9606 DCI.DAG.getConstant(~NewToMask, dl, VT)); 9607 } 9608 return SDValue(); 9609 } 9610 9611 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 9612 /// ARMISD::VMOVRRD. 9613 static SDValue PerformVMOVRRDCombine(SDNode *N, 9614 TargetLowering::DAGCombinerInfo &DCI, 9615 const ARMSubtarget *Subtarget) { 9616 // vmovrrd(vmovdrr x, y) -> x,y 9617 SDValue InDouble = N->getOperand(0); 9618 if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) 9619 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 9620 9621 // vmovrrd(load f64) -> (load i32), (load i32) 9622 SDNode *InNode = InDouble.getNode(); 9623 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 9624 InNode->getValueType(0) == MVT::f64 && 9625 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 9626 !cast<LoadSDNode>(InNode)->isVolatile()) { 9627 // TODO: Should this be done for non-FrameIndex operands? 
9628 LoadSDNode *LD = cast<LoadSDNode>(InNode); 9629 9630 SelectionDAG &DAG = DCI.DAG; 9631 SDLoc DL(LD); 9632 SDValue BasePtr = LD->getBasePtr(); 9633 SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, 9634 LD->getPointerInfo(), LD->isVolatile(), 9635 LD->isNonTemporal(), LD->isInvariant(), 9636 LD->getAlignment()); 9637 9638 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 9639 DAG.getConstant(4, DL, MVT::i32)); 9640 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, 9641 LD->getPointerInfo(), LD->isVolatile(), 9642 LD->isNonTemporal(), LD->isInvariant(), 9643 std::min(4U, LD->getAlignment() / 2)); 9644 9645 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 9646 if (DCI.DAG.getDataLayout().isBigEndian()) 9647 std::swap (NewLD1, NewLD2); 9648 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 9649 return Result; 9650 } 9651 9652 return SDValue(); 9653 } 9654 9655 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 9656 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 9657 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 9658 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 9659 SDValue Op0 = N->getOperand(0); 9660 SDValue Op1 = N->getOperand(1); 9661 if (Op0.getOpcode() == ISD::BITCAST) 9662 Op0 = Op0.getOperand(0); 9663 if (Op1.getOpcode() == ISD::BITCAST) 9664 Op1 = Op1.getOperand(0); 9665 if (Op0.getOpcode() == ARMISD::VMOVRRD && 9666 Op0.getNode() == Op1.getNode() && 9667 Op0.getResNo() == 0 && Op1.getResNo() == 1) 9668 return DAG.getNode(ISD::BITCAST, SDLoc(N), 9669 N->getValueType(0), Op0.getOperand(0)); 9670 return SDValue(); 9671 } 9672 9673 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 9674 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 9675 /// i64 vector to have f64 elements, since the value can then be loaded 9676 /// directly into a VFP register. 9677 static bool hasNormalLoadOperand(SDNode *N) { 9678 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 9679 for (unsigned i = 0; i < NumElts; ++i) { 9680 SDNode *Elt = N->getOperand(i).getNode(); 9681 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 9682 return true; 9683 } 9684 return false; 9685 } 9686 9687 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 9688 /// ISD::BUILD_VECTOR. 9689 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 9690 TargetLowering::DAGCombinerInfo &DCI, 9691 const ARMSubtarget *Subtarget) { 9692 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 9693 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 9694 // into a pair of GPRs, which is fine when the value is used as a scalar, 9695 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 9696 SelectionDAG &DAG = DCI.DAG; 9697 if (N->getNumOperands() == 2) 9698 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 9699 return RV; 9700 9701 // Load i64 elements as f64 values so that type legalization does not split 9702 // them up into i32 values. 
9703 EVT VT = N->getValueType(0); 9704 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 9705 return SDValue(); 9706 SDLoc dl(N); 9707 SmallVector<SDValue, 8> Ops; 9708 unsigned NumElts = VT.getVectorNumElements(); 9709 for (unsigned i = 0; i < NumElts; ++i) { 9710 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 9711 Ops.push_back(V); 9712 // Make the DAGCombiner fold the bitcast. 9713 DCI.AddToWorklist(V.getNode()); 9714 } 9715 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 9716 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 9717 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 9718 } 9719 9720 /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 9721 static SDValue 9722 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 9723 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 9724 // At that time, we may have inserted bitcasts from integer to float. 9725 // If these bitcasts have survived DAGCombine, change the lowering of this 9726 // BUILD_VECTOR in something more vector friendly, i.e., that does not 9727 // force to use floating point types. 9728 9729 // Make sure we can change the type of the vector. 9730 // This is possible iff: 9731 // 1. The vector is only used in a bitcast to a integer type. I.e., 9732 // 1.1. Vector is used only once. 9733 // 1.2. Use is a bit convert to an integer type. 9734 // 2. The size of its operands are 32-bits (64-bits are not legal). 9735 EVT VT = N->getValueType(0); 9736 EVT EltVT = VT.getVectorElementType(); 9737 9738 // Check 1.1. and 2. 9739 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 9740 return SDValue(); 9741 9742 // By construction, the input type must be float. 9743 assert(EltVT == MVT::f32 && "Unexpected type!"); 9744 9745 // Check 1.2. 9746 SDNode *Use = *N->use_begin(); 9747 if (Use->getOpcode() != ISD::BITCAST || 9748 Use->getValueType(0).isFloatingPoint()) 9749 return SDValue(); 9750 9751 // Check profitability. 9752 // Model is, if more than half of the relevant operands are bitcast from 9753 // i32, turn the build_vector into a sequence of insert_vector_elt. 9754 // Relevant operands are everything that is not statically 9755 // (i.e., at compile time) bitcasted. 9756 unsigned NumOfBitCastedElts = 0; 9757 unsigned NumElts = VT.getVectorNumElements(); 9758 unsigned NumOfRelevantElts = NumElts; 9759 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 9760 SDValue Elt = N->getOperand(Idx); 9761 if (Elt->getOpcode() == ISD::BITCAST) { 9762 // Assume only bit cast to i32 will go away. 9763 if (Elt->getOperand(0).getValueType() == MVT::i32) 9764 ++NumOfBitCastedElts; 9765 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 9766 // Constants are statically casted, thus do not count them as 9767 // relevant operands. 9768 --NumOfRelevantElts; 9769 } 9770 9771 // Check if more than half of the elements require a non-free bitcast. 9772 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 9773 return SDValue(); 9774 9775 SelectionDAG &DAG = DCI.DAG; 9776 // Create the new vector type. 9777 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 9778 // Check if the type is legal. 9779 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9780 if (!TLI.isTypeLegal(VecVT)) 9781 return SDValue(); 9782 9783 // Combine: 9784 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 9785 // => BITCAST INSERT_VECTOR_ELT 9786 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 9787 // (BITCAST EN), N. 
9788 SDValue Vec = DAG.getUNDEF(VecVT); 9789 SDLoc dl(N); 9790 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 9791 SDValue V = N->getOperand(Idx); 9792 if (V.isUndef()) 9793 continue; 9794 if (V.getOpcode() == ISD::BITCAST && 9795 V->getOperand(0).getValueType() == MVT::i32) 9796 // Fold obvious case. 9797 V = V.getOperand(0); 9798 else { 9799 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 9800 // Make the DAGCombiner fold the bitcasts. 9801 DCI.AddToWorklist(V.getNode()); 9802 } 9803 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 9804 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 9805 } 9806 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 9807 // Make the DAGCombiner fold the bitcasts. 9808 DCI.AddToWorklist(Vec.getNode()); 9809 return Vec; 9810 } 9811 9812 /// PerformInsertEltCombine - Target-specific dag combine xforms for 9813 /// ISD::INSERT_VECTOR_ELT. 9814 static SDValue PerformInsertEltCombine(SDNode *N, 9815 TargetLowering::DAGCombinerInfo &DCI) { 9816 // Bitcast an i64 load inserted into a vector to f64. 9817 // Otherwise, the i64 value will be legalized to a pair of i32 values. 9818 EVT VT = N->getValueType(0); 9819 SDNode *Elt = N->getOperand(1).getNode(); 9820 if (VT.getVectorElementType() != MVT::i64 || 9821 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 9822 return SDValue(); 9823 9824 SelectionDAG &DAG = DCI.DAG; 9825 SDLoc dl(N); 9826 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 9827 VT.getVectorNumElements()); 9828 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 9829 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 9830 // Make the DAGCombiner fold the bitcasts. 9831 DCI.AddToWorklist(Vec.getNode()); 9832 DCI.AddToWorklist(V.getNode()); 9833 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 9834 Vec, V, N->getOperand(2)); 9835 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 9836 } 9837 9838 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 9839 /// ISD::VECTOR_SHUFFLE. 9840 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 9841 // The LLVM shufflevector instruction does not require the shuffle mask 9842 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 9843 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 9844 // operands do not match the mask length, they are extended by concatenating 9845 // them with undef vectors. That is probably the right thing for other 9846 // targets, but for NEON it is better to concatenate two double-register 9847 // size vector operands into a single quad-register size vector. Do that 9848 // transformation here: 9849 // shuffle(concat(v1, undef), concat(v2, undef)) -> 9850 // shuffle(concat(v1, v2), undef) 9851 SDValue Op0 = N->getOperand(0); 9852 SDValue Op1 = N->getOperand(1); 9853 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 9854 Op1.getOpcode() != ISD::CONCAT_VECTORS || 9855 Op0.getNumOperands() != 2 || 9856 Op1.getNumOperands() != 2) 9857 return SDValue(); 9858 SDValue Concat0Op1 = Op0.getOperand(1); 9859 SDValue Concat1Op1 = Op1.getOperand(1); 9860 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 9861 return SDValue(); 9862 // Skip the transformation if any of the types are illegal. 
9863 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9864 EVT VT = N->getValueType(0); 9865 if (!TLI.isTypeLegal(VT) || 9866 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 9867 !TLI.isTypeLegal(Concat1Op1.getValueType())) 9868 return SDValue(); 9869 9870 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 9871 Op0.getOperand(0), Op1.getOperand(0)); 9872 // Translate the shuffle mask. 9873 SmallVector<int, 16> NewMask; 9874 unsigned NumElts = VT.getVectorNumElements(); 9875 unsigned HalfElts = NumElts/2; 9876 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 9877 for (unsigned n = 0; n < NumElts; ++n) { 9878 int MaskElt = SVN->getMaskElt(n); 9879 int NewElt = -1; 9880 if (MaskElt < (int)HalfElts) 9881 NewElt = MaskElt; 9882 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 9883 NewElt = HalfElts + MaskElt - NumElts; 9884 NewMask.push_back(NewElt); 9885 } 9886 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 9887 DAG.getUNDEF(VT), NewMask); 9888 } 9889 9890 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 9891 /// NEON load/store intrinsics, and generic vector load/stores, to merge 9892 /// base address updates. 9893 /// For generic load/stores, the memory type is assumed to be a vector. 9894 /// The caller is assumed to have checked legality. 9895 static SDValue CombineBaseUpdate(SDNode *N, 9896 TargetLowering::DAGCombinerInfo &DCI) { 9897 SelectionDAG &DAG = DCI.DAG; 9898 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 9899 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 9900 const bool isStore = N->getOpcode() == ISD::STORE; 9901 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 9902 SDValue Addr = N->getOperand(AddrOpIdx); 9903 MemSDNode *MemN = cast<MemSDNode>(N); 9904 SDLoc dl(N); 9905 9906 // Search for a use of the address operand that is an increment. 9907 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 9908 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 9909 SDNode *User = *UI; 9910 if (User->getOpcode() != ISD::ADD || 9911 UI.getUse().getResNo() != Addr.getResNo()) 9912 continue; 9913 9914 // Check that the add is independent of the load/store. Otherwise, folding 9915 // it would create a cycle. 9916 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 9917 continue; 9918 9919 // Find the new opcode for the updating load/store. 
9920 bool isLoadOp = true; 9921 bool isLaneOp = false; 9922 unsigned NewOpc = 0; 9923 unsigned NumVecs = 0; 9924 if (isIntrinsic) { 9925 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 9926 switch (IntNo) { 9927 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 9928 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 9929 NumVecs = 1; break; 9930 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 9931 NumVecs = 2; break; 9932 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 9933 NumVecs = 3; break; 9934 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 9935 NumVecs = 4; break; 9936 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 9937 NumVecs = 2; isLaneOp = true; break; 9938 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 9939 NumVecs = 3; isLaneOp = true; break; 9940 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 9941 NumVecs = 4; isLaneOp = true; break; 9942 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 9943 NumVecs = 1; isLoadOp = false; break; 9944 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 9945 NumVecs = 2; isLoadOp = false; break; 9946 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 9947 NumVecs = 3; isLoadOp = false; break; 9948 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 9949 NumVecs = 4; isLoadOp = false; break; 9950 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 9951 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 9952 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 9953 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 9954 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 9955 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 9956 } 9957 } else { 9958 isLaneOp = true; 9959 switch (N->getOpcode()) { 9960 default: llvm_unreachable("unexpected opcode for Neon base update"); 9961 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 9962 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 9963 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 9964 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 9965 NumVecs = 1; isLaneOp = false; break; 9966 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 9967 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 9968 } 9969 } 9970 9971 // Find the size of memory referenced by the load/store. 9972 EVT VecTy; 9973 if (isLoadOp) { 9974 VecTy = N->getValueType(0); 9975 } else if (isIntrinsic) { 9976 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 9977 } else { 9978 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 9979 VecTy = N->getOperand(1).getValueType(); 9980 } 9981 9982 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 9983 if (isLaneOp) 9984 NumBytes /= VecTy.getVectorNumElements(); 9985 9986 // If the increment is a constant, it must match the memory ref size. 9987 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 9988 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 9989 uint64_t IncVal = CInc->getZExtValue(); 9990 if (IncVal != NumBytes) 9991 continue; 9992 } else if (NumBytes >= 3 * 16) { 9993 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 9994 // separate instructions that make it harder to use a non-constant update. 9995 continue; 9996 } 9997 9998 // OK, we found an ADD we can fold into the base update. 
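// For example (sketch), a 128-bit (vld1 addr) whose address is also used by
// (add addr, #16) becomes a single VLD1_UPD producing both the loaded vector
// and the post-incremented address, and both original nodes are then
// rewritten to use its results.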
9999 // Now, create a _UPD node, taking care of not breaking alignment. 10000 10001 EVT AlignedVecTy = VecTy; 10002 unsigned Alignment = MemN->getAlignment(); 10003 10004 // If this is a less-than-standard-aligned load/store, change the type to 10005 // match the standard alignment. 10006 // The alignment is overlooked when selecting _UPD variants; and it's 10007 // easier to introduce bitcasts here than fix that. 10008 // There are 3 ways to get to this base-update combine: 10009 // - intrinsics: they are assumed to be properly aligned (to the standard 10010 // alignment of the memory type), so we don't need to do anything. 10011 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 10012 // intrinsics, so, likewise, there's nothing to do. 10013 // - generic load/store instructions: the alignment is specified as an 10014 // explicit operand, rather than implicitly as the standard alignment 10015 // of the memory type (like the intrisics). We need to change the 10016 // memory type to match the explicit alignment. That way, we don't 10017 // generate non-standard-aligned ARMISD::VLDx nodes. 10018 if (isa<LSBaseSDNode>(N)) { 10019 if (Alignment == 0) 10020 Alignment = 1; 10021 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 10022 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 10023 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 10024 assert(!isLaneOp && "Unexpected generic load/store lane."); 10025 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 10026 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 10027 } 10028 // Don't set an explicit alignment on regular load/stores that we want 10029 // to transform to VLD/VST 1_UPD nodes. 10030 // This matches the behavior of regular load/stores, which only get an 10031 // explicit alignment if the MMO alignment is larger than the standard 10032 // alignment of the memory type. 10033 // Intrinsics, however, always get an explicit alignment, set to the 10034 // alignment of the MMO. 10035 Alignment = 1; 10036 } 10037 10038 // Create the new updating load/store node. 10039 // First, create an SDVTList for the new updating node's results. 10040 EVT Tys[6]; 10041 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 10042 unsigned n; 10043 for (n = 0; n < NumResultVecs; ++n) 10044 Tys[n] = AlignedVecTy; 10045 Tys[n++] = MVT::i32; 10046 Tys[n] = MVT::Other; 10047 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 10048 10049 // Then, gather the new node's operands. 10050 SmallVector<SDValue, 8> Ops; 10051 Ops.push_back(N->getOperand(0)); // incoming chain 10052 Ops.push_back(N->getOperand(AddrOpIdx)); 10053 Ops.push_back(Inc); 10054 10055 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 10056 // Try to match the intrinsic's signature 10057 Ops.push_back(StN->getValue()); 10058 } else { 10059 // Loads (and of course intrinsics) match the intrinsics' signature, 10060 // so just add all but the alignment operand. 10061 for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) 10062 Ops.push_back(N->getOperand(i)); 10063 } 10064 10065 // For all node types, the alignment operand is always the last one. 10066 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 10067 10068 // If this is a non-standard-aligned STORE, the penultimate operand is the 10069 // stored value. Bitcast it to the aligned type. 
10070 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 10071 SDValue &StVal = Ops[Ops.size()-2]; 10072 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 10073 } 10074 10075 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, 10076 Ops, AlignedVecTy, 10077 MemN->getMemOperand()); 10078 10079 // Update the uses. 10080 SmallVector<SDValue, 5> NewResults; 10081 for (unsigned i = 0; i < NumResultVecs; ++i) 10082 NewResults.push_back(SDValue(UpdN.getNode(), i)); 10083 10084 // If this is an non-standard-aligned LOAD, the first result is the loaded 10085 // value. Bitcast it to the expected result type. 10086 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { 10087 SDValue &LdVal = NewResults[0]; 10088 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); 10089 } 10090 10091 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain 10092 DCI.CombineTo(N, NewResults); 10093 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 10094 10095 break; 10096 } 10097 return SDValue(); 10098 } 10099 10100 static SDValue PerformVLDCombine(SDNode *N, 10101 TargetLowering::DAGCombinerInfo &DCI) { 10102 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10103 return SDValue(); 10104 10105 return CombineBaseUpdate(N, DCI); 10106 } 10107 10108 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 10109 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 10110 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 10111 /// return true. 10112 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 10113 SelectionDAG &DAG = DCI.DAG; 10114 EVT VT = N->getValueType(0); 10115 // vldN-dup instructions only support 64-bit vectors for N > 1. 10116 if (!VT.is64BitVector()) 10117 return false; 10118 10119 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 10120 SDNode *VLD = N->getOperand(0).getNode(); 10121 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 10122 return false; 10123 unsigned NumVecs = 0; 10124 unsigned NewOpc = 0; 10125 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 10126 if (IntNo == Intrinsic::arm_neon_vld2lane) { 10127 NumVecs = 2; 10128 NewOpc = ARMISD::VLD2DUP; 10129 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 10130 NumVecs = 3; 10131 NewOpc = ARMISD::VLD3DUP; 10132 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 10133 NumVecs = 4; 10134 NewOpc = ARMISD::VLD4DUP; 10135 } else { 10136 return false; 10137 } 10138 10139 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 10140 // numbers match the load. 10141 unsigned VLDLaneNo = 10142 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 10143 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 10144 UI != UE; ++UI) { 10145 // Ignore uses of the chain result. 10146 if (UI.getUse().getResNo() == NumVecs) 10147 continue; 10148 SDNode *User = *UI; 10149 if (User->getOpcode() != ARMISD::VDUPLANE || 10150 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 10151 return false; 10152 } 10153 10154 // Create the vldN-dup node. 
10155 EVT Tys[5]; 10156 unsigned n; 10157 for (n = 0; n < NumVecs; ++n) 10158 Tys[n] = VT; 10159 Tys[n] = MVT::Other; 10160 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 10161 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 10162 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 10163 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 10164 Ops, VLDMemInt->getMemoryVT(), 10165 VLDMemInt->getMemOperand()); 10166 10167 // Update the uses. 10168 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 10169 UI != UE; ++UI) { 10170 unsigned ResNo = UI.getUse().getResNo(); 10171 // Ignore uses of the chain result. 10172 if (ResNo == NumVecs) 10173 continue; 10174 SDNode *User = *UI; 10175 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 10176 } 10177 10178 // Now the vldN-lane intrinsic is dead except for its chain result. 10179 // Update uses of the chain. 10180 std::vector<SDValue> VLDDupResults; 10181 for (unsigned n = 0; n < NumVecs; ++n) 10182 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 10183 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 10184 DCI.CombineTo(VLD, VLDDupResults); 10185 10186 return true; 10187 } 10188 10189 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 10190 /// ARMISD::VDUPLANE. 10191 static SDValue PerformVDUPLANECombine(SDNode *N, 10192 TargetLowering::DAGCombinerInfo &DCI) { 10193 SDValue Op = N->getOperand(0); 10194 10195 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 10196 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 10197 if (CombineVLDDUP(N, DCI)) 10198 return SDValue(N, 0); 10199 10200 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 10201 // redundant. Ignore bit_converts for now; element sizes are checked below. 10202 while (Op.getOpcode() == ISD::BITCAST) 10203 Op = Op.getOperand(0); 10204 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 10205 return SDValue(); 10206 10207 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 10208 unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits(); 10209 // The canonical VMOV for a zero vector uses a 32-bit element size. 10210 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10211 unsigned EltBits; 10212 if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) 10213 EltSize = 8; 10214 EVT VT = N->getValueType(0); 10215 if (EltSize > VT.getVectorElementType().getSizeInBits()) 10216 return SDValue(); 10217 10218 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 10219 } 10220 10221 static SDValue PerformLOADCombine(SDNode *N, 10222 TargetLowering::DAGCombinerInfo &DCI) { 10223 EVT VT = N->getValueType(0); 10224 10225 // If this is a legal vector load, try to combine it into a VLD1_UPD. 10226 if (ISD::isNormalLoad(N) && VT.isVector() && 10227 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10228 return CombineBaseUpdate(N, DCI); 10229 10230 return SDValue(); 10231 } 10232 10233 /// PerformSTORECombine - Target-specific dag combine xforms for 10234 /// ISD::STORE. 10235 static SDValue PerformSTORECombine(SDNode *N, 10236 TargetLowering::DAGCombinerInfo &DCI) { 10237 StoreSDNode *St = cast<StoreSDNode>(N); 10238 if (St->isVolatile()) 10239 return SDValue(); 10240 10241 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 10242 // pack all of the elements in one place. 
Next, store to memory in fewer 10243 // chunks. 10244 SDValue StVal = St->getValue(); 10245 EVT VT = StVal.getValueType(); 10246 if (St->isTruncatingStore() && VT.isVector()) { 10247 SelectionDAG &DAG = DCI.DAG; 10248 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10249 EVT StVT = St->getMemoryVT(); 10250 unsigned NumElems = VT.getVectorNumElements(); 10251 assert(StVT != VT && "Cannot truncate to the same type"); 10252 unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); 10253 unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); 10254 10255 // From, To sizes and ElemCount must be pow of two 10256 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); 10257 10258 // We are going to use the original vector elt for storing. 10259 // Accumulated smaller vector elements must be a multiple of the store size. 10260 if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); 10261 10262 unsigned SizeRatio = FromEltSz / ToEltSz; 10263 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 10264 10265 // Create a type on which we perform the shuffle. 10266 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 10267 NumElems*SizeRatio); 10268 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 10269 10270 SDLoc DL(St); 10271 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 10272 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 10273 for (unsigned i = 0; i < NumElems; ++i) 10274 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() 10275 ? (i + 1) * SizeRatio - 1 10276 : i * SizeRatio; 10277 10278 // Can't shuffle using an illegal type. 10279 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 10280 10281 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, 10282 DAG.getUNDEF(WideVec.getValueType()), 10283 ShuffleVec); 10284 // At this point all of the data is stored at the bottom of the 10285 // register. We now need to save it to mem. 10286 10287 // Find the largest store unit 10288 MVT StoreType = MVT::i8; 10289 for (MVT Tp : MVT::integer_valuetypes()) { 10290 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 10291 StoreType = Tp; 10292 } 10293 // Didn't find a legal store type. 10294 if (!TLI.isTypeLegal(StoreType)) 10295 return SDValue(); 10296 10297 // Bitcast the original vector into a vector of store-size units 10298 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 10299 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 10300 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 10301 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 10302 SmallVector<SDValue, 8> Chains; 10303 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 10304 TLI.getPointerTy(DAG.getDataLayout())); 10305 SDValue BasePtr = St->getBasePtr(); 10306 10307 // Perform one or more big stores into memory. 
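// Illustrative example (assuming, as on 32-bit ARM, that i32 is the widest
// legal scalar integer type): a truncating store of v4i32 to v4i16 is bitcast
// to v8i16, shuffled so the four narrowed halves land in lanes 0-3 on a
// little-endian target, and then emitted as E = 2 i32 stores instead of four
// i16 stores.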
10308 unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); 10309 for (unsigned I = 0; I < E; I++) { 10310 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 10311 StoreType, ShuffWide, 10312 DAG.getIntPtrConstant(I, DL)); 10313 SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, 10314 St->getPointerInfo(), St->isVolatile(), 10315 St->isNonTemporal(), St->getAlignment()); 10316 BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, 10317 Increment); 10318 Chains.push_back(Ch); 10319 } 10320 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 10321 } 10322 10323 if (!ISD::isNormalStore(St)) 10324 return SDValue(); 10325 10326 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 10327 // ARM stores of arguments in the same cache line. 10328 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 10329 StVal.getNode()->hasOneUse()) { 10330 SelectionDAG &DAG = DCI.DAG; 10331 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 10332 SDLoc DL(St); 10333 SDValue BasePtr = St->getBasePtr(); 10334 SDValue NewST1 = DAG.getStore(St->getChain(), DL, 10335 StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), 10336 BasePtr, St->getPointerInfo(), St->isVolatile(), 10337 St->isNonTemporal(), St->getAlignment()); 10338 10339 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 10340 DAG.getConstant(4, DL, MVT::i32)); 10341 return DAG.getStore(NewST1.getValue(0), DL, 10342 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 10343 OffsetPtr, St->getPointerInfo(), St->isVolatile(), 10344 St->isNonTemporal(), 10345 std::min(4U, St->getAlignment() / 2)); 10346 } 10347 10348 if (StVal.getValueType() == MVT::i64 && 10349 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 10350 10351 // Bitcast an i64 store extracted from a vector to f64. 10352 // Otherwise, the i64 value will be legalized to a pair of i32 values. 10353 SelectionDAG &DAG = DCI.DAG; 10354 SDLoc dl(StVal); 10355 SDValue IntVec = StVal.getOperand(0); 10356 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 10357 IntVec.getValueType().getVectorNumElements()); 10358 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 10359 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 10360 Vec, StVal.getOperand(1)); 10361 dl = SDLoc(N); 10362 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 10363 // Make the DAGCombiner fold the bitcasts. 10364 DCI.AddToWorklist(Vec.getNode()); 10365 DCI.AddToWorklist(ExtElt.getNode()); 10366 DCI.AddToWorklist(V.getNode()); 10367 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 10368 St->getPointerInfo(), St->isVolatile(), 10369 St->isNonTemporal(), St->getAlignment(), 10370 St->getAAInfo()); 10371 } 10372 10373 // If this is a legal vector store, try to combine it into a VST1_UPD. 10374 if (ISD::isNormalStore(N) && VT.isVector() && 10375 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10376 return CombineBaseUpdate(N, DCI); 10377 10378 return SDValue(); 10379 } 10380 10381 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 10382 /// can replace combinations of VMUL and VCVT (floating-point to integer) 10383 /// when the VMUL has a constant operand that is a power of 2. 
10384 ///
10385 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
10386 /// vmul.f32 d16, d17, d16
10387 /// vcvt.s32.f32 d16, d16
10388 /// becomes:
10389 /// vcvt.s32.f32 d16, d16, #3
10390 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
10391 const ARMSubtarget *Subtarget) {
10392 if (!Subtarget->hasNEON())
10393 return SDValue();
10394
10395 SDValue Op = N->getOperand(0);
10396 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
10397 Op.getOpcode() != ISD::FMUL)
10398 return SDValue();
10399
10400 SDValue ConstVec = Op->getOperand(1);
10401 if (!isa<BuildVectorSDNode>(ConstVec))
10402 return SDValue();
10403
10404 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
10405 uint32_t FloatBits = FloatTy.getSizeInBits();
10406 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
10407 uint32_t IntBits = IntTy.getSizeInBits();
10408 unsigned NumLanes = Op.getValueType().getVectorNumElements();
10409 if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
10410 // These instructions only exist converting from f32 to i32. We can handle
10411 // smaller integers by generating an extra truncate, but larger ones would
10412 // be lossy. We also can't handle more than 4 lanes, since these instructions
10413 // only support v2i32/v4i32 types.
10414 return SDValue();
10415 }
10416
10417 BitVector UndefElements;
10418 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
10419 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
10420 if (C == -1 || C == 0 || C > 32)
10421 return SDValue();
10422
10423 SDLoc dl(N);
10424 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
10425 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
10426 Intrinsic::arm_neon_vcvtfp2fxu;
10427 SDValue FixConv = DAG.getNode(
10428 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
10429 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
10430 DAG.getConstant(C, dl, MVT::i32));
10431
10432 if (IntBits < FloatBits)
10433 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
10434
10435 return FixConv;
10436 }
10437
10438 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
10439 /// can replace combinations of VCVT (integer to floating-point) and VDIV
10440 /// when the VDIV has a constant operand that is a power of 2.
10441 ///
10442 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
10443 /// vcvt.f32.s32 d16, d16
10444 /// vdiv.f32 d16, d17, d16
10445 /// becomes:
10446 /// vcvt.f32.s32 d16, d16, #3
10447 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
10448 const ARMSubtarget *Subtarget) {
10449 if (!Subtarget->hasNEON())
10450 return SDValue();
10451
10452 SDValue Op = N->getOperand(0);
10453 unsigned OpOpcode = Op.getNode()->getOpcode();
10454 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
10455 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
10456 return SDValue();
10457
10458 SDValue ConstVec = N->getOperand(1);
10459 if (!isa<BuildVectorSDNode>(ConstVec))
10460 return SDValue();
10461
10462 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
10463 uint32_t FloatBits = FloatTy.getSizeInBits();
10464 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
10465 uint32_t IntBits = IntTy.getSizeInBits();
10466 unsigned NumLanes = Op.getValueType().getVectorNumElements();
10467 if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
10468 // These instructions only exist converting from i32 to f32. We can handle
10469 // smaller integers by generating an extra extend, but larger ones would
10470 // be lossy. We also can't handle more than 4 lanes, since these instructions
10471 // only support v2i32/v4i32 types.
10472 return SDValue();
10473 }
10474
10475 BitVector UndefElements;
10476 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
10477 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
10478 if (C == -1 || C == 0 || C > 32)
10479 return SDValue();
10480
10481 SDLoc dl(N);
10482 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
10483 SDValue ConvInput = Op.getOperand(0);
10484 if (IntBits < FloatBits)
10485 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
10486 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
10487 ConvInput);
10488
10489 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
10490 Intrinsic::arm_neon_vcvtfxu2fp;
10491 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
10492 Op.getValueType(),
10493 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
10494 ConvInput, DAG.getConstant(C, dl, MVT::i32));
10495 }
10496
10497 /// getVShiftImm - Check if this is a valid build_vector for the immediate
10498 /// operand of a vector shift operation, where all the elements of the
10499 /// build_vector must have the same constant integer value.
10500 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10501 // Ignore bit_converts.
10502 while (Op.getOpcode() == ISD::BITCAST)
10503 Op = Op.getOperand(0);
10504 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
10505 APInt SplatBits, SplatUndef;
10506 unsigned SplatBitSize;
10507 bool HasAnyUndefs;
10508 if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
10509 HasAnyUndefs, ElementBits) ||
10510 SplatBitSize > ElementBits)
10511 return false;
10512 Cnt = SplatBits.getSExtValue();
10513 return true;
10514 }
10515
10516 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
10517 /// operand of a vector shift left operation. That value must be in the range:
10518 /// 0 <= Value < ElementBits for a left shift; or
10519 /// 0 <= Value <= ElementBits for a long left shift.
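/// For example, with 16-bit vector elements a splat shift amount of 15 is
/// accepted here, while 16 is accepted only for the long (widening) form.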
10520 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 10521 assert(VT.isVector() && "vector shift count is not a vector type"); 10522 int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); 10523 if (! getVShiftImm(Op, ElementBits, Cnt)) 10524 return false; 10525 return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); 10526 } 10527 10528 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 10529 /// operand of a vector shift right operation. For a shift opcode, the value 10530 /// is positive, but for an intrinsic the value count must be negative. The 10531 /// absolute value must be in the range: 10532 /// 1 <= |Value| <= ElementBits for a right shift; or 10533 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 10534 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 10535 int64_t &Cnt) { 10536 assert(VT.isVector() && "vector shift count is not a vector type"); 10537 int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); 10538 if (! getVShiftImm(Op, ElementBits, Cnt)) 10539 return false; 10540 if (!isIntrinsic) 10541 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); 10542 if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { 10543 Cnt = -Cnt; 10544 return true; 10545 } 10546 return false; 10547 } 10548 10549 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 10550 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 10551 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 10552 switch (IntNo) { 10553 default: 10554 // Don't do anything for most intrinsics. 10555 break; 10556 10557 // Vector shifts: check for immediate versions and lower them. 10558 // Note: This is done during DAG combining instead of DAG legalizing because 10559 // the build_vectors for 64-bit vector element shift counts are generally 10560 // not legal, and it is hard to see their values after they get legalized to 10561 // loads from a constant pool. 10562 case Intrinsic::arm_neon_vshifts: 10563 case Intrinsic::arm_neon_vshiftu: 10564 case Intrinsic::arm_neon_vrshifts: 10565 case Intrinsic::arm_neon_vrshiftu: 10566 case Intrinsic::arm_neon_vrshiftn: 10567 case Intrinsic::arm_neon_vqshifts: 10568 case Intrinsic::arm_neon_vqshiftu: 10569 case Intrinsic::arm_neon_vqshiftsu: 10570 case Intrinsic::arm_neon_vqshiftns: 10571 case Intrinsic::arm_neon_vqshiftnu: 10572 case Intrinsic::arm_neon_vqshiftnsu: 10573 case Intrinsic::arm_neon_vqrshiftns: 10574 case Intrinsic::arm_neon_vqrshiftnu: 10575 case Intrinsic::arm_neon_vqrshiftnsu: { 10576 EVT VT = N->getOperand(1).getValueType(); 10577 int64_t Cnt; 10578 unsigned VShiftOpc = 0; 10579 10580 switch (IntNo) { 10581 case Intrinsic::arm_neon_vshifts: 10582 case Intrinsic::arm_neon_vshiftu: 10583 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 10584 VShiftOpc = ARMISD::VSHL; 10585 break; 10586 } 10587 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 10588 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? 
10589 ARMISD::VSHRs : ARMISD::VSHRu); 10590 break; 10591 } 10592 return SDValue(); 10593 10594 case Intrinsic::arm_neon_vrshifts: 10595 case Intrinsic::arm_neon_vrshiftu: 10596 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 10597 break; 10598 return SDValue(); 10599 10600 case Intrinsic::arm_neon_vqshifts: 10601 case Intrinsic::arm_neon_vqshiftu: 10602 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 10603 break; 10604 return SDValue(); 10605 10606 case Intrinsic::arm_neon_vqshiftsu: 10607 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 10608 break; 10609 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 10610 10611 case Intrinsic::arm_neon_vrshiftn: 10612 case Intrinsic::arm_neon_vqshiftns: 10613 case Intrinsic::arm_neon_vqshiftnu: 10614 case Intrinsic::arm_neon_vqshiftnsu: 10615 case Intrinsic::arm_neon_vqrshiftns: 10616 case Intrinsic::arm_neon_vqrshiftnu: 10617 case Intrinsic::arm_neon_vqrshiftnsu: 10618 // Narrowing shifts require an immediate right shift. 10619 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 10620 break; 10621 llvm_unreachable("invalid shift count for narrowing vector shift " 10622 "intrinsic"); 10623 10624 default: 10625 llvm_unreachable("unhandled vector shift"); 10626 } 10627 10628 switch (IntNo) { 10629 case Intrinsic::arm_neon_vshifts: 10630 case Intrinsic::arm_neon_vshiftu: 10631 // Opcode already set above. 10632 break; 10633 case Intrinsic::arm_neon_vrshifts: 10634 VShiftOpc = ARMISD::VRSHRs; break; 10635 case Intrinsic::arm_neon_vrshiftu: 10636 VShiftOpc = ARMISD::VRSHRu; break; 10637 case Intrinsic::arm_neon_vrshiftn: 10638 VShiftOpc = ARMISD::VRSHRN; break; 10639 case Intrinsic::arm_neon_vqshifts: 10640 VShiftOpc = ARMISD::VQSHLs; break; 10641 case Intrinsic::arm_neon_vqshiftu: 10642 VShiftOpc = ARMISD::VQSHLu; break; 10643 case Intrinsic::arm_neon_vqshiftsu: 10644 VShiftOpc = ARMISD::VQSHLsu; break; 10645 case Intrinsic::arm_neon_vqshiftns: 10646 VShiftOpc = ARMISD::VQSHRNs; break; 10647 case Intrinsic::arm_neon_vqshiftnu: 10648 VShiftOpc = ARMISD::VQSHRNu; break; 10649 case Intrinsic::arm_neon_vqshiftnsu: 10650 VShiftOpc = ARMISD::VQSHRNsu; break; 10651 case Intrinsic::arm_neon_vqrshiftns: 10652 VShiftOpc = ARMISD::VQRSHRNs; break; 10653 case Intrinsic::arm_neon_vqrshiftnu: 10654 VShiftOpc = ARMISD::VQRSHRNu; break; 10655 case Intrinsic::arm_neon_vqrshiftnsu: 10656 VShiftOpc = ARMISD::VQRSHRNsu; break; 10657 } 10658 10659 SDLoc dl(N); 10660 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 10661 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 10662 } 10663 10664 case Intrinsic::arm_neon_vshiftins: { 10665 EVT VT = N->getOperand(1).getValueType(); 10666 int64_t Cnt; 10667 unsigned VShiftOpc = 0; 10668 10669 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 10670 VShiftOpc = ARMISD::VSLI; 10671 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 10672 VShiftOpc = ARMISD::VSRI; 10673 else { 10674 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 10675 } 10676 10677 SDLoc dl(N); 10678 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 10679 N->getOperand(1), N->getOperand(2), 10680 DAG.getConstant(Cnt, dl, MVT::i32)); 10681 } 10682 10683 case Intrinsic::arm_neon_vqrshifts: 10684 case Intrinsic::arm_neon_vqrshiftu: 10685 // No immediate versions of these to check for. 10686 break; 10687 } 10688 10689 return SDValue(); 10690 } 10691 10692 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 10693 /// lowers them. 
As with the vector shift intrinsics, this is done during DAG 10694 /// combining instead of DAG legalizing because the build_vectors for 64-bit 10695 /// vector element shift counts are generally not legal, and it is hard to see 10696 /// their values after they get legalized to loads from a constant pool. 10697 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, 10698 const ARMSubtarget *ST) { 10699 EVT VT = N->getValueType(0); 10700 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 10701 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 10702 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 10703 SDValue N1 = N->getOperand(1); 10704 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 10705 SDValue N0 = N->getOperand(0); 10706 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 10707 DAG.MaskedValueIsZero(N0.getOperand(0), 10708 APInt::getHighBitsSet(32, 16))) 10709 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 10710 } 10711 } 10712 10713 // Nothing to be done for scalar shifts. 10714 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10715 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 10716 return SDValue(); 10717 10718 assert(ST->hasNEON() && "unexpected vector shift"); 10719 int64_t Cnt; 10720 10721 switch (N->getOpcode()) { 10722 default: llvm_unreachable("unexpected shift opcode"); 10723 10724 case ISD::SHL: 10725 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 10726 SDLoc dl(N); 10727 return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), 10728 DAG.getConstant(Cnt, dl, MVT::i32)); 10729 } 10730 break; 10731 10732 case ISD::SRA: 10733 case ISD::SRL: 10734 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 10735 unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? 10736 ARMISD::VSHRs : ARMISD::VSHRu); 10737 SDLoc dl(N); 10738 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 10739 DAG.getConstant(Cnt, dl, MVT::i32)); 10740 } 10741 } 10742 return SDValue(); 10743 } 10744 10745 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 10746 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 10747 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 10748 const ARMSubtarget *ST) { 10749 SDValue N0 = N->getOperand(0); 10750 10751 // Check for sign- and zero-extensions of vector extract operations of 8- 10752 // and 16-bit vector elements. NEON supports these directly. They are 10753 // handled during DAG combining because type legalization will promote them 10754 // to 32-bit types and it is messy to recognize the operations after that. 
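// Illustrative example: (i32 (sign_extend (extract_vector_elt v8i16 %v, 3)))
// is matched here and becomes ARMISD::VGETLANEs, i.e. roughly a single
// "vmov.s16 r0, d0[3]" rather than a lane extract followed by an extend.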
10755 if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 10756 SDValue Vec = N0.getOperand(0); 10757 SDValue Lane = N0.getOperand(1); 10758 EVT VT = N->getValueType(0); 10759 EVT EltVT = N0.getValueType(); 10760 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10761 10762 if (VT == MVT::i32 && 10763 (EltVT == MVT::i8 || EltVT == MVT::i16) && 10764 TLI.isTypeLegal(Vec.getValueType()) && 10765 isa<ConstantSDNode>(Lane)) { 10766 10767 unsigned Opc = 0; 10768 switch (N->getOpcode()) { 10769 default: llvm_unreachable("unexpected opcode"); 10770 case ISD::SIGN_EXTEND: 10771 Opc = ARMISD::VGETLANEs; 10772 break; 10773 case ISD::ZERO_EXTEND: 10774 case ISD::ANY_EXTEND: 10775 Opc = ARMISD::VGETLANEu; 10776 break; 10777 } 10778 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 10779 } 10780 } 10781 10782 return SDValue(); 10783 } 10784 10785 static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, 10786 APInt &KnownOne) { 10787 if (Op.getOpcode() == ARMISD::BFI) { 10788 // Conservatively, we can recurse down the first operand 10789 // and just mask out all affected bits. 10790 computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); 10791 10792 // The operand to BFI is already a mask suitable for removing the bits it 10793 // sets. 10794 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 10795 const APInt &Mask = CI->getAPIntValue(); 10796 KnownZero &= Mask; 10797 KnownOne &= Mask; 10798 return; 10799 } 10800 if (Op.getOpcode() == ARMISD::CMOV) { 10801 APInt KZ2(KnownZero.getBitWidth(), 0); 10802 APInt KO2(KnownOne.getBitWidth(), 0); 10803 computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); 10804 computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); 10805 10806 KnownZero &= KZ2; 10807 KnownOne &= KO2; 10808 return; 10809 } 10810 return DAG.computeKnownBits(Op, KnownZero, KnownOne); 10811 } 10812 10813 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 10814 // If we have a CMOV, OR and AND combination such as: 10815 // if (x & CN) 10816 // y |= CM; 10817 // 10818 // And: 10819 // * CN is a single bit; 10820 // * All bits covered by CM are known zero in y 10821 // 10822 // Then we can convert this into a sequence of BFI instructions. This will 10823 // always be a win if CM is a single bit, will always be no worse than the 10824 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 10825 // three bits (due to the extra IT instruction). 10826 10827 SDValue Op0 = CMOV->getOperand(0); 10828 SDValue Op1 = CMOV->getOperand(1); 10829 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 10830 auto CC = CCNode->getAPIntValue().getLimitedValue(); 10831 SDValue CmpZ = CMOV->getOperand(4); 10832 10833 // The compare must be against zero. 10834 if (!isNullConstant(CmpZ->getOperand(1))) 10835 return SDValue(); 10836 10837 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 10838 SDValue And = CmpZ->getOperand(0); 10839 if (And->getOpcode() != ISD::AND) 10840 return SDValue(); 10841 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1)); 10842 if (!AndC || !AndC->getAPIntValue().isPowerOf2()) 10843 return SDValue(); 10844 SDValue X = And->getOperand(0); 10845 10846 if (CC == ARMCC::EQ) { 10847 // We're performing an "equal to zero" compare. Swap the operands so we 10848 // canonicalize on a "not equal to zero" compare. 
10849 std::swap(Op0, Op1); 10850 } else { 10851 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 10852 } 10853 10854 if (Op1->getOpcode() != ISD::OR) 10855 return SDValue(); 10856 10857 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 10858 if (!OrC) 10859 return SDValue(); 10860 SDValue Y = Op1->getOperand(0); 10861 10862 if (Op0 != Y) 10863 return SDValue(); 10864 10865 // Now, is it profitable to continue? 10866 APInt OrCI = OrC->getAPIntValue(); 10867 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 10868 if (OrCI.countPopulation() > Heuristic) 10869 return SDValue(); 10870 10871 // Lastly, can we determine that the bits defined by OrCI 10872 // are zero in Y? 10873 APInt KnownZero, KnownOne; 10874 computeKnownBits(DAG, Y, KnownZero, KnownOne); 10875 if ((OrCI & KnownZero) != OrCI) 10876 return SDValue(); 10877 10878 // OK, we can do the combine. 10879 SDValue V = Y; 10880 SDLoc dl(X); 10881 EVT VT = X.getValueType(); 10882 unsigned BitInX = AndC->getAPIntValue().logBase2(); 10883 10884 if (BitInX != 0) { 10885 // We must shift X first. 10886 X = DAG.getNode(ISD::SRL, dl, VT, X, 10887 DAG.getConstant(BitInX, dl, VT)); 10888 } 10889 10890 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 10891 BitInY < NumActiveBits; ++BitInY) { 10892 if (OrCI[BitInY] == 0) 10893 continue; 10894 APInt Mask(VT.getSizeInBits(), 0); 10895 Mask.setBit(BitInY); 10896 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 10897 // Confusingly, the operand is an *inverted* mask. 10898 DAG.getConstant(~Mask, dl, VT)); 10899 } 10900 10901 return V; 10902 } 10903 10904 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 10905 SDValue 10906 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 10907 SDValue Cmp = N->getOperand(4); 10908 if (Cmp.getOpcode() != ARMISD::CMPZ) 10909 // Only looking at NE cases. 10910 return SDValue(); 10911 10912 EVT VT = N->getValueType(0); 10913 SDLoc dl(N); 10914 SDValue LHS = Cmp.getOperand(0); 10915 SDValue RHS = Cmp.getOperand(1); 10916 SDValue Chain = N->getOperand(0); 10917 SDValue BB = N->getOperand(1); 10918 SDValue ARMcc = N->getOperand(2); 10919 ARMCC::CondCodes CC = 10920 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 10921 10922 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 10923 // -> (brcond Chain BB CC CPSR Cmp) 10924 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 10925 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 10926 LHS->getOperand(0)->hasOneUse()) { 10927 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 10928 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 10929 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 10930 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 10931 if ((LHS00C && LHS00C->getZExtValue() == 0) && 10932 (LHS01C && LHS01C->getZExtValue() == 1) && 10933 (LHS1C && LHS1C->getZExtValue() == 1) && 10934 (RHSC && RHSC->getZExtValue() == 0)) { 10935 return DAG.getNode( 10936 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 10937 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 10938 } 10939 } 10940 10941 return SDValue(); 10942 } 10943 10944 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 
10945 SDValue 10946 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 10947 SDValue Cmp = N->getOperand(4); 10948 if (Cmp.getOpcode() != ARMISD::CMPZ) 10949 // Only looking at EQ and NE cases. 10950 return SDValue(); 10951 10952 EVT VT = N->getValueType(0); 10953 SDLoc dl(N); 10954 SDValue LHS = Cmp.getOperand(0); 10955 SDValue RHS = Cmp.getOperand(1); 10956 SDValue FalseVal = N->getOperand(0); 10957 SDValue TrueVal = N->getOperand(1); 10958 SDValue ARMcc = N->getOperand(2); 10959 ARMCC::CondCodes CC = 10960 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 10961 10962 // BFI is only available on V6T2+. 10963 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 10964 SDValue R = PerformCMOVToBFICombine(N, DAG); 10965 if (R) 10966 return R; 10967 } 10968 10969 // Simplify 10970 // mov r1, r0 10971 // cmp r1, x 10972 // mov r0, y 10973 // moveq r0, x 10974 // to 10975 // cmp r0, x 10976 // movne r0, y 10977 // 10978 // mov r1, r0 10979 // cmp r1, x 10980 // mov r0, x 10981 // movne r0, y 10982 // to 10983 // cmp r0, x 10984 // movne r0, y 10985 /// FIXME: Turn this into a target neutral optimization? 10986 SDValue Res; 10987 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 10988 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 10989 N->getOperand(3), Cmp); 10990 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 10991 SDValue ARMcc; 10992 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 10993 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 10994 N->getOperand(3), NewCmp); 10995 } 10996 10997 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 10998 // -> (cmov F T CC CPSR Cmp) 10999 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 11000 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 11001 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 11002 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 11003 if ((LHS0C && LHS0C->getZExtValue() == 0) && 11004 (LHS1C && LHS1C->getZExtValue() == 1) && 11005 (RHSC && RHSC->getZExtValue() == 0)) { 11006 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 11007 LHS->getOperand(2), LHS->getOperand(3), 11008 LHS->getOperand(4)); 11009 } 11010 } 11011 11012 if (Res.getNode()) { 11013 APInt KnownZero, KnownOne; 11014 DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne); 11015 // Capture demanded bits information that would be otherwise lost. 
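// For example, a CMOV that selects between 0 and 1 has KnownZero ==
// 0xfffffffe, so an AssertZext to i1 is attached below and later combines can
// rely on the upper 31 bits being zero.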
11016 if (KnownZero == 0xfffffffe) 11017 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 11018 DAG.getValueType(MVT::i1)); 11019 else if (KnownZero == 0xffffff00) 11020 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 11021 DAG.getValueType(MVT::i8)); 11022 else if (KnownZero == 0xffff0000) 11023 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 11024 DAG.getValueType(MVT::i16)); 11025 } 11026 11027 return Res; 11028 } 11029 11030 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 11031 DAGCombinerInfo &DCI) const { 11032 switch (N->getOpcode()) { 11033 default: break; 11034 case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget); 11035 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 11036 case ISD::SUB: return PerformSUBCombine(N, DCI); 11037 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 11038 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 11039 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 11040 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 11041 case ARMISD::BFI: return PerformBFICombine(N, DCI); 11042 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 11043 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 11044 case ISD::STORE: return PerformSTORECombine(N, DCI); 11045 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 11046 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 11047 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 11048 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 11049 case ISD::FP_TO_SINT: 11050 case ISD::FP_TO_UINT: 11051 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 11052 case ISD::FDIV: 11053 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 11054 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 11055 case ISD::SHL: 11056 case ISD::SRA: 11057 case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); 11058 case ISD::SIGN_EXTEND: 11059 case ISD::ZERO_EXTEND: 11060 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 11061 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 11062 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 11063 case ISD::LOAD: return PerformLOADCombine(N, DCI); 11064 case ARMISD::VLD2DUP: 11065 case ARMISD::VLD3DUP: 11066 case ARMISD::VLD4DUP: 11067 return PerformVLDCombine(N, DCI); 11068 case ARMISD::BUILD_VECTOR: 11069 return PerformARMBUILD_VECTORCombine(N, DCI); 11070 case ISD::INTRINSIC_VOID: 11071 case ISD::INTRINSIC_W_CHAIN: 11072 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 11073 case Intrinsic::arm_neon_vld1: 11074 case Intrinsic::arm_neon_vld2: 11075 case Intrinsic::arm_neon_vld3: 11076 case Intrinsic::arm_neon_vld4: 11077 case Intrinsic::arm_neon_vld2lane: 11078 case Intrinsic::arm_neon_vld3lane: 11079 case Intrinsic::arm_neon_vld4lane: 11080 case Intrinsic::arm_neon_vst1: 11081 case Intrinsic::arm_neon_vst2: 11082 case Intrinsic::arm_neon_vst3: 11083 case Intrinsic::arm_neon_vst4: 11084 case Intrinsic::arm_neon_vst2lane: 11085 case Intrinsic::arm_neon_vst3lane: 11086 case Intrinsic::arm_neon_vst4lane: 11087 return PerformVLDCombine(N, DCI); 11088 default: break; 11089 } 11090 break; 11091 } 11092 return SDValue(); 11093 } 11094 11095 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 11096 EVT VT) const { 11097 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); 11098 } 11099 11100 bool 
ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
11101 unsigned,
11102 unsigned,
11103 bool *Fast) const {
11104 // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
11105 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
11106
11107 switch (VT.getSimpleVT().SimpleTy) {
11108 default:
11109 return false;
11110 case MVT::i8:
11111 case MVT::i16:
11112 case MVT::i32: {
11113 // Unaligned access can use (for example) LDRB, LDRH, LDR.
11114 if (AllowsUnaligned) {
11115 if (Fast)
11116 *Fast = Subtarget->hasV7Ops();
11117 return true;
11118 }
11119 return false;
11120 }
11121 case MVT::f64:
11122 case MVT::v2f64: {
11123 // For any little-endian targets with NEON, we can support unaligned ld/st
11124 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
11125 // A big-endian target may also explicitly support unaligned accesses.
11126 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
11127 if (Fast)
11128 *Fast = true;
11129 return true;
11130 }
11131 return false;
11132 }
11133 }
11134 }
11135
11136 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
11137 unsigned AlignCheck) {
11138 return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
11139 (DstAlign == 0 || DstAlign % AlignCheck == 0));
11140 }
11141
11142 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
11143 unsigned DstAlign, unsigned SrcAlign,
11144 bool IsMemset, bool ZeroMemset,
11145 bool MemcpyStrSrc,
11146 MachineFunction &MF) const {
11147 const Function *F = MF.getFunction();
11148
11149 // See if we can use NEON instructions for this...
11150 if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
11151 !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
11152 bool Fast;
11153 if (Size >= 16 &&
11154 (memOpAlign(SrcAlign, DstAlign, 16) ||
11155 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
11156 return MVT::v2f64;
11157 } else if (Size >= 8 &&
11158 (memOpAlign(SrcAlign, DstAlign, 8) ||
11159 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
11160 Fast))) {
11161 return MVT::f64;
11162 }
11163 }
11164
11165 // Lower to i32/i16 if the size permits.
11166 if (Size >= 4)
11167 return MVT::i32;
11168 else if (Size >= 2)
11169 return MVT::i16;
11170
11171 // Let the target-independent logic figure it out.
11172 return MVT::Other;
11173 }
11174
11175 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
11176 if (Val.getOpcode() != ISD::LOAD)
11177 return false;
11178
11179 EVT VT1 = Val.getValueType();
11180 if (!VT1.isSimple() || !VT1.isInteger() ||
11181 !VT2.isSimple() || !VT2.isInteger())
11182 return false;
11183
11184 switch (VT1.getSimpleVT().SimpleTy) {
11185 default: break;
11186 case MVT::i1:
11187 case MVT::i8:
11188 case MVT::i16:
11189 // 8-bit and 16-bit loads implicitly zero-extend to 32 bits.
11190 return true;
11191 }
11192
11193 return false;
11194 }
11195
11196 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
11197 EVT VT = ExtVal.getValueType();
11198
11199 if (!isTypeLegal(VT))
11200 return false;
11201
11202 // Don't create a loadext if we can fold the extension into a wide/long
11203 // instruction.
11204 // If there's more than one user instruction, the loadext is desirable no
11205 // matter what. There can be two uses by the same instruction.
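// For example, a sign-extending vector load whose only user is an ADD can be
// folded into a widening instruction such as VADDL, so the extending load is
// rejected below in that case.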
11206 if (ExtVal->use_empty() || 11207 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 11208 return true; 11209 11210 SDNode *U = *ExtVal->use_begin(); 11211 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 11212 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) 11213 return false; 11214 11215 return true; 11216 } 11217 11218 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 11219 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 11220 return false; 11221 11222 if (!isTypeLegal(EVT::getEVT(Ty1))) 11223 return false; 11224 11225 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 11226 11227 // Assuming the caller doesn't have a zeroext or signext return parameter, 11228 // truncation all the way down to i1 is valid. 11229 return true; 11230 } 11231 11232 11233 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 11234 if (V < 0) 11235 return false; 11236 11237 unsigned Scale = 1; 11238 switch (VT.getSimpleVT().SimpleTy) { 11239 default: return false; 11240 case MVT::i1: 11241 case MVT::i8: 11242 // Scale == 1; 11243 break; 11244 case MVT::i16: 11245 // Scale == 2; 11246 Scale = 2; 11247 break; 11248 case MVT::i32: 11249 // Scale == 4; 11250 Scale = 4; 11251 break; 11252 } 11253 11254 if ((V & (Scale - 1)) != 0) 11255 return false; 11256 V /= Scale; 11257 return V == (V & ((1LL << 5) - 1)); 11258 } 11259 11260 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 11261 const ARMSubtarget *Subtarget) { 11262 bool isNeg = false; 11263 if (V < 0) { 11264 isNeg = true; 11265 V = - V; 11266 } 11267 11268 switch (VT.getSimpleVT().SimpleTy) { 11269 default: return false; 11270 case MVT::i1: 11271 case MVT::i8: 11272 case MVT::i16: 11273 case MVT::i32: 11274 // + imm12 or - imm8 11275 if (isNeg) 11276 return V == (V & ((1LL << 8) - 1)); 11277 return V == (V & ((1LL << 12) - 1)); 11278 case MVT::f32: 11279 case MVT::f64: 11280 // Same as ARM mode. FIXME: NEON? 11281 if (!Subtarget->hasVFP2()) 11282 return false; 11283 if ((V & 3) != 0) 11284 return false; 11285 V >>= 2; 11286 return V == (V & ((1LL << 8) - 1)); 11287 } 11288 } 11289 11290 /// isLegalAddressImmediate - Return true if the integer value can be used 11291 /// as the offset of the target addressing mode for load / store of the 11292 /// given type. 11293 static bool isLegalAddressImmediate(int64_t V, EVT VT, 11294 const ARMSubtarget *Subtarget) { 11295 if (V == 0) 11296 return true; 11297 11298 if (!VT.isSimple()) 11299 return false; 11300 11301 if (Subtarget->isThumb1Only()) 11302 return isLegalT1AddressImmediate(V, VT); 11303 else if (Subtarget->isThumb2()) 11304 return isLegalT2AddressImmediate(V, VT, Subtarget); 11305 11306 // ARM mode. 11307 if (V < 0) 11308 V = - V; 11309 switch (VT.getSimpleVT().SimpleTy) { 11310 default: return false; 11311 case MVT::i1: 11312 case MVT::i8: 11313 case MVT::i32: 11314 // +- imm12 11315 return V == (V & ((1LL << 12) - 1)); 11316 case MVT::i16: 11317 // +- imm8 11318 return V == (V & ((1LL << 8) - 1)); 11319 case MVT::f32: 11320 case MVT::f64: 11321 if (!Subtarget->hasVFP2()) // FIXME: NEON? 
11322 return false; 11323 if ((V & 3) != 0) 11324 return false; 11325 V >>= 2; 11326 return V == (V & ((1LL << 8) - 1)); 11327 } 11328 } 11329 11330 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 11331 EVT VT) const { 11332 int Scale = AM.Scale; 11333 if (Scale < 0) 11334 return false; 11335 11336 switch (VT.getSimpleVT().SimpleTy) { 11337 default: return false; 11338 case MVT::i1: 11339 case MVT::i8: 11340 case MVT::i16: 11341 case MVT::i32: 11342 if (Scale == 1) 11343 return true; 11344 // r + r << imm 11345 Scale = Scale & ~1; 11346 return Scale == 2 || Scale == 4 || Scale == 8; 11347 case MVT::i64: 11348 // r + r 11349 if (((unsigned)AM.HasBaseReg + Scale) <= 2) 11350 return true; 11351 return false; 11352 case MVT::isVoid: 11353 // Note, we allow "void" uses (basically, uses that aren't loads or 11354 // stores), because arm allows folding a scale into many arithmetic 11355 // operations. This should be made more precise and revisited later. 11356 11357 // Allow r << imm, but the imm has to be a multiple of two. 11358 if (Scale & 1) return false; 11359 return isPowerOf2_32(Scale); 11360 } 11361 } 11362 11363 /// isLegalAddressingMode - Return true if the addressing mode represented 11364 /// by AM is legal for this target, for a load/store of the specified type. 11365 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 11366 const AddrMode &AM, Type *Ty, 11367 unsigned AS) const { 11368 EVT VT = getValueType(DL, Ty, true); 11369 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 11370 return false; 11371 11372 // Can never fold addr of global into load/store. 11373 if (AM.BaseGV) 11374 return false; 11375 11376 switch (AM.Scale) { 11377 case 0: // no scale reg, must be "r+i" or "r", or "i". 11378 break; 11379 case 1: 11380 if (Subtarget->isThumb1Only()) 11381 return false; 11382 // FALL THROUGH. 11383 default: 11384 // ARM doesn't support any R+R*scale+imm addr modes. 11385 if (AM.BaseOffs) 11386 return false; 11387 11388 if (!VT.isSimple()) 11389 return false; 11390 11391 if (Subtarget->isThumb2()) 11392 return isLegalT2ScaledAddressingMode(AM, VT); 11393 11394 int Scale = AM.Scale; 11395 switch (VT.getSimpleVT().SimpleTy) { 11396 default: return false; 11397 case MVT::i1: 11398 case MVT::i8: 11399 case MVT::i32: 11400 if (Scale < 0) Scale = -Scale; 11401 if (Scale == 1) 11402 return true; 11403 // r + r << imm 11404 return isPowerOf2_32(Scale & ~1); 11405 case MVT::i16: 11406 case MVT::i64: 11407 // r + r 11408 if (((unsigned)AM.HasBaseReg + Scale) <= 2) 11409 return true; 11410 return false; 11411 11412 case MVT::isVoid: 11413 // Note, we allow "void" uses (basically, uses that aren't loads or 11414 // stores), because arm allows folding a scale into many arithmetic 11415 // operations. This should be made more precise and revisited later. 11416 11417 // Allow r << imm, but the imm has to be a multiple of two. 11418 if (Scale & 1) return false; 11419 return isPowerOf2_32(Scale); 11420 } 11421 } 11422 return true; 11423 } 11424 11425 /// isLegalICmpImmediate - Return true if the specified immediate is legal 11426 /// icmp immediate, that is the target has icmp instructions which can compare 11427 /// a register against the immediate without having to materialize the 11428 /// immediate into a register. 11429 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 11430 // Thumb2 and ARM modes can use cmn for negative immediates. 
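// For example, "cmp r0, #-42" has no direct encoding, but the equivalent
// "cmn r0, #42" does, which is why std::abs(Imm) is tested against the
// so_imm encoders below.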
11431 if (!Subtarget->isThumb()) 11432 return ARM_AM::getSOImmVal(std::abs(Imm)) != -1; 11433 if (Subtarget->isThumb2()) 11434 return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1; 11435 // Thumb1 doesn't have cmn, and only 8-bit immediates. 11436 return Imm >= 0 && Imm <= 255; 11437 } 11438 11439 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 11440 /// *or sub* immediate, that is the target has add or sub instructions which can 11441 /// add a register with the immediate without having to materialize the 11442 /// immediate into a register. 11443 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 11444 // Same encoding for add/sub, just flip the sign. 11445 int64_t AbsImm = std::abs(Imm); 11446 if (!Subtarget->isThumb()) 11447 return ARM_AM::getSOImmVal(AbsImm) != -1; 11448 if (Subtarget->isThumb2()) 11449 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 11450 // Thumb1 only has 8-bit unsigned immediate. 11451 return AbsImm >= 0 && AbsImm <= 255; 11452 } 11453 11454 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 11455 bool isSEXTLoad, SDValue &Base, 11456 SDValue &Offset, bool &isInc, 11457 SelectionDAG &DAG) { 11458 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 11459 return false; 11460 11461 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 11462 // AddressingMode 3 11463 Base = Ptr->getOperand(0); 11464 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 11465 int RHSC = (int)RHS->getZExtValue(); 11466 if (RHSC < 0 && RHSC > -256) { 11467 assert(Ptr->getOpcode() == ISD::ADD); 11468 isInc = false; 11469 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 11470 return true; 11471 } 11472 } 11473 isInc = (Ptr->getOpcode() == ISD::ADD); 11474 Offset = Ptr->getOperand(1); 11475 return true; 11476 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 11477 // AddressingMode 2 11478 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 11479 int RHSC = (int)RHS->getZExtValue(); 11480 if (RHSC < 0 && RHSC > -0x1000) { 11481 assert(Ptr->getOpcode() == ISD::ADD); 11482 isInc = false; 11483 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 11484 Base = Ptr->getOperand(0); 11485 return true; 11486 } 11487 } 11488 11489 if (Ptr->getOpcode() == ISD::ADD) { 11490 isInc = true; 11491 ARM_AM::ShiftOpc ShOpcVal= 11492 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 11493 if (ShOpcVal != ARM_AM::no_shift) { 11494 Base = Ptr->getOperand(1); 11495 Offset = Ptr->getOperand(0); 11496 } else { 11497 Base = Ptr->getOperand(0); 11498 Offset = Ptr->getOperand(1); 11499 } 11500 return true; 11501 } 11502 11503 isInc = (Ptr->getOpcode() == ISD::ADD); 11504 Base = Ptr->getOperand(0); 11505 Offset = Ptr->getOperand(1); 11506 return true; 11507 } 11508 11509 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 11510 return false; 11511 } 11512 11513 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 11514 bool isSEXTLoad, SDValue &Base, 11515 SDValue &Offset, bool &isInc, 11516 SelectionDAG &DAG) { 11517 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 11518 return false; 11519 11520 Base = Ptr->getOperand(0); 11521 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 11522 int RHSC = (int)RHS->getZExtValue(); 11523 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 
11524 assert(Ptr->getOpcode() == ISD::ADD); 11525 isInc = false; 11526 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 11527 return true; 11528 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 11529 isInc = Ptr->getOpcode() == ISD::ADD; 11530 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 11531 return true; 11532 } 11533 } 11534 11535 return false; 11536 } 11537 11538 /// getPreIndexedAddressParts - returns true by value, base pointer and 11539 /// offset pointer and addressing mode by reference if the node's address 11540 /// can be legally represented as pre-indexed load / store address. 11541 bool 11542 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 11543 SDValue &Offset, 11544 ISD::MemIndexedMode &AM, 11545 SelectionDAG &DAG) const { 11546 if (Subtarget->isThumb1Only()) 11547 return false; 11548 11549 EVT VT; 11550 SDValue Ptr; 11551 bool isSEXTLoad = false; 11552 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 11553 Ptr = LD->getBasePtr(); 11554 VT = LD->getMemoryVT(); 11555 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 11556 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 11557 Ptr = ST->getBasePtr(); 11558 VT = ST->getMemoryVT(); 11559 } else 11560 return false; 11561 11562 bool isInc; 11563 bool isLegal = false; 11564 if (Subtarget->isThumb2()) 11565 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 11566 Offset, isInc, DAG); 11567 else 11568 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 11569 Offset, isInc, DAG); 11570 if (!isLegal) 11571 return false; 11572 11573 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; 11574 return true; 11575 } 11576 11577 /// getPostIndexedAddressParts - returns true by value, base pointer and 11578 /// offset pointer and addressing mode by reference if this node can be 11579 /// combined with a load / store to form a post-indexed load / store. 11580 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 11581 SDValue &Base, 11582 SDValue &Offset, 11583 ISD::MemIndexedMode &AM, 11584 SelectionDAG &DAG) const { 11585 if (Subtarget->isThumb1Only()) 11586 return false; 11587 11588 EVT VT; 11589 SDValue Ptr; 11590 bool isSEXTLoad = false; 11591 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 11592 VT = LD->getMemoryVT(); 11593 Ptr = LD->getBasePtr(); 11594 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 11595 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 11596 VT = ST->getMemoryVT(); 11597 Ptr = ST->getBasePtr(); 11598 } else 11599 return false; 11600 11601 bool isInc; 11602 bool isLegal = false; 11603 if (Subtarget->isThumb2()) 11604 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 11605 isInc, DAG); 11606 else 11607 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 11608 isInc, DAG); 11609 if (!isLegal) 11610 return false; 11611 11612 if (Ptr != Base) { 11613 // Swap base ptr and offset to catch more post-index load / store when 11614 // it's legal. In Thumb2 mode, offset must be an immediate. 11615 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 11616 !Subtarget->isThumb2()) 11617 std::swap(Base, Offset); 11618 11619 // Post-indexed load / store update the base pointer. 11620 if (Ptr != Base) 11621 return false; 11622 } 11623 11624 AM = isInc ? 
ISD::POST_INC : ISD::POST_DEC; 11625 return true; 11626 } 11627 11628 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 11629 APInt &KnownZero, 11630 APInt &KnownOne, 11631 const SelectionDAG &DAG, 11632 unsigned Depth) const { 11633 unsigned BitWidth = KnownOne.getBitWidth(); 11634 KnownZero = KnownOne = APInt(BitWidth, 0); 11635 switch (Op.getOpcode()) { 11636 default: break; 11637 case ARMISD::ADDC: 11638 case ARMISD::ADDE: 11639 case ARMISD::SUBC: 11640 case ARMISD::SUBE: 11641 // These nodes' second result is a boolean 11642 if (Op.getResNo() == 0) 11643 break; 11644 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 11645 break; 11646 case ARMISD::CMOV: { 11647 // Bits are known zero/one if known on the LHS and RHS. 11648 DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); 11649 if (KnownZero == 0 && KnownOne == 0) return; 11650 11651 APInt KnownZeroRHS, KnownOneRHS; 11652 DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); 11653 KnownZero &= KnownZeroRHS; 11654 KnownOne &= KnownOneRHS; 11655 return; 11656 } 11657 case ISD::INTRINSIC_W_CHAIN: { 11658 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 11659 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 11660 switch (IntID) { 11661 default: return; 11662 case Intrinsic::arm_ldaex: 11663 case Intrinsic::arm_ldrex: { 11664 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 11665 unsigned MemBits = VT.getScalarType().getSizeInBits(); 11666 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 11667 return; 11668 } 11669 } 11670 } 11671 } 11672 } 11673 11674 //===----------------------------------------------------------------------===// 11675 // ARM Inline Assembly Support 11676 //===----------------------------------------------------------------------===// 11677 11678 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 11679 // Looking for "rev" which is V6+. 11680 if (!Subtarget->hasV6Ops()) 11681 return false; 11682 11683 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 11684 std::string AsmStr = IA->getAsmString(); 11685 SmallVector<StringRef, 4> AsmPieces; 11686 SplitString(AsmStr, AsmPieces, ";\n"); 11687 11688 switch (AsmPieces.size()) { 11689 default: return false; 11690 case 1: 11691 AsmStr = AsmPieces[0]; 11692 AsmPieces.clear(); 11693 SplitString(AsmStr, AsmPieces, " \t,"); 11694 11695 // rev $0, $1 11696 if (AsmPieces.size() == 3 && 11697 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 11698 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 11699 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11700 if (Ty && Ty->getBitWidth() == 32) 11701 return IntrinsicLowering::LowerToByteSwap(CI); 11702 } 11703 break; 11704 } 11705 11706 return false; 11707 } 11708 11709 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 11710 // At this point, we have to lower this constraint to something else, so we 11711 // lower it to an "r" or "w". However, by doing this we will force the result 11712 // to be in register, while the X constraint is much more permissive. 11713 // 11714 // Although we are correct (we are free to emit anything, without 11715 // constraints), we might break use cases that would expect us to be more 11716 // efficient and emit something else. 
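// For example, an "X" operand holding an f32 value is lowered to "w" below so
// it can live in an S register, while integer operands fall back to "r".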
11717 if (!Subtarget->hasVFP2()) 11718 return "r"; 11719 if (ConstraintVT.isFloatingPoint()) 11720 return "w"; 11721 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 11722 (ConstraintVT.getSizeInBits() == 64 || 11723 ConstraintVT.getSizeInBits() == 128)) 11724 return "w"; 11725 11726 return "r"; 11727 } 11728 11729 /// getConstraintType - Given a constraint letter, return the type of 11730 /// constraint it is for this target. 11731 ARMTargetLowering::ConstraintType 11732 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 11733 if (Constraint.size() == 1) { 11734 switch (Constraint[0]) { 11735 default: break; 11736 case 'l': return C_RegisterClass; 11737 case 'w': return C_RegisterClass; 11738 case 'h': return C_RegisterClass; 11739 case 'x': return C_RegisterClass; 11740 case 't': return C_RegisterClass; 11741 case 'j': return C_Other; // Constant for movw. 11742 // An address with a single base register. Due to the way we 11743 // currently handle addresses it is the same as an 'r' memory constraint. 11744 case 'Q': return C_Memory; 11745 } 11746 } else if (Constraint.size() == 2) { 11747 switch (Constraint[0]) { 11748 default: break; 11749 // All 'U+' constraints are addresses. 11750 case 'U': return C_Memory; 11751 } 11752 } 11753 return TargetLowering::getConstraintType(Constraint); 11754 } 11755 11756 /// Examine constraint type and operand type and determine a weight value. 11757 /// This object must already have been set up with the operand type 11758 /// and the current alternative constraint selected. 11759 TargetLowering::ConstraintWeight 11760 ARMTargetLowering::getSingleConstraintMatchWeight( 11761 AsmOperandInfo &info, const char *constraint) const { 11762 ConstraintWeight weight = CW_Invalid; 11763 Value *CallOperandVal = info.CallOperandVal; 11764 // If we don't have a value, we can't do a match, 11765 // but allow it at the lowest weight. 11766 if (!CallOperandVal) 11767 return CW_Default; 11768 Type *type = CallOperandVal->getType(); 11769 // Look at the constraint type. 11770 switch (*constraint) { 11771 default: 11772 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 11773 break; 11774 case 'l': 11775 if (type->isIntegerTy()) { 11776 if (Subtarget->isThumb()) 11777 weight = CW_SpecificReg; 11778 else 11779 weight = CW_Register; 11780 } 11781 break; 11782 case 'w': 11783 if (type->isFloatingPointTy()) 11784 weight = CW_Register; 11785 break; 11786 } 11787 return weight; 11788 } 11789 11790 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair; 11791 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 11792 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 11793 if (Constraint.size() == 1) { 11794 // GCC ARM Constraint Letters 11795 switch (Constraint[0]) { 11796 case 'l': // Low regs or general regs. 11797 if (Subtarget->isThumb()) 11798 return RCPair(0U, &ARM::tGPRRegClass); 11799 return RCPair(0U, &ARM::GPRRegClass); 11800 case 'h': // High regs or no regs. 
11801 if (Subtarget->isThumb()) 11802 return RCPair(0U, &ARM::hGPRRegClass); 11803 break; 11804 case 'r': 11805 if (Subtarget->isThumb1Only()) 11806 return RCPair(0U, &ARM::tGPRRegClass); 11807 return RCPair(0U, &ARM::GPRRegClass); 11808 case 'w': 11809 if (VT == MVT::Other) 11810 break; 11811 if (VT == MVT::f32) 11812 return RCPair(0U, &ARM::SPRRegClass); 11813 if (VT.getSizeInBits() == 64) 11814 return RCPair(0U, &ARM::DPRRegClass); 11815 if (VT.getSizeInBits() == 128) 11816 return RCPair(0U, &ARM::QPRRegClass); 11817 break; 11818 case 'x': 11819 if (VT == MVT::Other) 11820 break; 11821 if (VT == MVT::f32) 11822 return RCPair(0U, &ARM::SPR_8RegClass); 11823 if (VT.getSizeInBits() == 64) 11824 return RCPair(0U, &ARM::DPR_8RegClass); 11825 if (VT.getSizeInBits() == 128) 11826 return RCPair(0U, &ARM::QPR_8RegClass); 11827 break; 11828 case 't': 11829 if (VT == MVT::f32) 11830 return RCPair(0U, &ARM::SPRRegClass); 11831 break; 11832 } 11833 } 11834 if (StringRef("{cc}").equals_lower(Constraint)) 11835 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 11836 11837 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 11838 } 11839 11840 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 11841 /// vector. If it is invalid, don't add anything to Ops. 11842 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 11843 std::string &Constraint, 11844 std::vector<SDValue>&Ops, 11845 SelectionDAG &DAG) const { 11846 SDValue Result; 11847 11848 // Currently only support length 1 constraints. 11849 if (Constraint.length() != 1) return; 11850 11851 char ConstraintLetter = Constraint[0]; 11852 switch (ConstraintLetter) { 11853 default: break; 11854 case 'j': 11855 case 'I': case 'J': case 'K': case 'L': 11856 case 'M': case 'N': case 'O': 11857 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 11858 if (!C) 11859 return; 11860 11861 int64_t CVal64 = C->getSExtValue(); 11862 int CVal = (int) CVal64; 11863 // None of these constraints allow values larger than 32 bits. Check 11864 // that the value fits in an int. 11865 if (CVal != CVal64) 11866 return; 11867 11868 switch (ConstraintLetter) { 11869 case 'j': 11870 // Constant suitable for movw, must be between 0 and 11871 // 65535. 11872 if (Subtarget->hasV6T2Ops()) 11873 if (CVal >= 0 && CVal <= 65535) 11874 break; 11875 return; 11876 case 'I': 11877 if (Subtarget->isThumb1Only()) { 11878 // This must be a constant between 0 and 255, for ADD 11879 // immediates. 11880 if (CVal >= 0 && CVal <= 255) 11881 break; 11882 } else if (Subtarget->isThumb2()) { 11883 // A constant that can be used as an immediate value in a 11884 // data-processing instruction. 11885 if (ARM_AM::getT2SOImmVal(CVal) != -1) 11886 break; 11887 } else { 11888 // A constant that can be used as an immediate value in a 11889 // data-processing instruction. 11890 if (ARM_AM::getSOImmVal(CVal) != -1) 11891 break; 11892 } 11893 return; 11894 11895 case 'J': 11896 if (Subtarget->isThumb1Only()) { 11897 // This must be a constant between -255 and -1, for negated ADD 11898 // immediates. This can be used in GCC with an "n" modifier that 11899 // prints the negated value, for use with SUB instructions. It is 11900 // not useful otherwise but is implemented for compatibility. 11901 if (CVal >= -255 && CVal <= -1) 11902 break; 11903 } else { 11904 // This must be a constant between -4095 and 4095. It is not clear 11905 // what this constraint is intended for. Implemented for 11906 // compatibility with GCC. 
11907 if (CVal >= -4095 && CVal <= 4095) 11908 break; 11909 } 11910 return; 11911 11912 case 'K': 11913 if (Subtarget->isThumb1Only()) { 11914 // A 32-bit value where only one byte has a nonzero value. Exclude 11915 // zero to match GCC. This constraint is used by GCC internally for 11916 // constants that can be loaded with a move/shift combination. 11917 // It is not useful otherwise but is implemented for compatibility. 11918 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 11919 break; 11920 } else if (Subtarget->isThumb2()) { 11921 // A constant whose bitwise inverse can be used as an immediate 11922 // value in a data-processing instruction. This can be used in GCC 11923 // with a "B" modifier that prints the inverted value, for use with 11924 // BIC and MVN instructions. It is not useful otherwise but is 11925 // implemented for compatibility. 11926 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 11927 break; 11928 } else { 11929 // A constant whose bitwise inverse can be used as an immediate 11930 // value in a data-processing instruction. This can be used in GCC 11931 // with a "B" modifier that prints the inverted value, for use with 11932 // BIC and MVN instructions. It is not useful otherwise but is 11933 // implemented for compatibility. 11934 if (ARM_AM::getSOImmVal(~CVal) != -1) 11935 break; 11936 } 11937 return; 11938 11939 case 'L': 11940 if (Subtarget->isThumb1Only()) { 11941 // This must be a constant between -7 and 7, 11942 // for 3-operand ADD/SUB immediate instructions. 11943 if (CVal >= -7 && CVal < 7) 11944 break; 11945 } else if (Subtarget->isThumb2()) { 11946 // A constant whose negation can be used as an immediate value in a 11947 // data-processing instruction. This can be used in GCC with an "n" 11948 // modifier that prints the negated value, for use with SUB 11949 // instructions. It is not useful otherwise but is implemented for 11950 // compatibility. 11951 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 11952 break; 11953 } else { 11954 // A constant whose negation can be used as an immediate value in a 11955 // data-processing instruction. This can be used in GCC with an "n" 11956 // modifier that prints the negated value, for use with SUB 11957 // instructions. It is not useful otherwise but is implemented for 11958 // compatibility. 11959 if (ARM_AM::getSOImmVal(-CVal) != -1) 11960 break; 11961 } 11962 return; 11963 11964 case 'M': 11965 if (Subtarget->isThumb1Only()) { 11966 // This must be a multiple of 4 between 0 and 1020, for 11967 // ADD sp + immediate. 11968 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 11969 break; 11970 } else { 11971 // A power of two or a constant between 0 and 32. This is used in 11972 // GCC for the shift amount on shifted register operands, but it is 11973 // useful in general for any shift amounts. 11974 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 11975 break; 11976 } 11977 return; 11978 11979 case 'N': 11980 if (Subtarget->isThumb()) { // FIXME thumb2 11981 // This must be a constant between 0 and 31, for shift amounts. 11982 if (CVal >= 0 && CVal <= 31) 11983 break; 11984 } 11985 return; 11986 11987 case 'O': 11988 if (Subtarget->isThumb()) { // FIXME thumb2 11989 // This must be a multiple of 4 between -508 and 508, for 11990 // ADD/SUB sp = sp + immediate. 
11991 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 11992 break; 11993 } 11994 return; 11995 } 11996 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 11997 break; 11998 } 11999 12000 if (Result.getNode()) { 12001 Ops.push_back(Result); 12002 return; 12003 } 12004 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 12005 } 12006 12007 static RTLIB::Libcall getDivRemLibcall( 12008 const SDNode *N, MVT::SimpleValueType SVT) { 12009 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 12010 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 12011 "Unhandled Opcode in getDivRemLibcall"); 12012 bool isSigned = N->getOpcode() == ISD::SDIVREM || 12013 N->getOpcode() == ISD::SREM; 12014 RTLIB::Libcall LC; 12015 switch (SVT) { 12016 default: llvm_unreachable("Unexpected request for libcall!"); 12017 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 12018 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 12019 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 12020 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 12021 } 12022 return LC; 12023 } 12024 12025 static TargetLowering::ArgListTy getDivRemArgList( 12026 const SDNode *N, LLVMContext *Context) { 12027 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 12028 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 12029 "Unhandled Opcode in getDivRemArgList"); 12030 bool isSigned = N->getOpcode() == ISD::SDIVREM || 12031 N->getOpcode() == ISD::SREM; 12032 TargetLowering::ArgListTy Args; 12033 TargetLowering::ArgListEntry Entry; 12034 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 12035 EVT ArgVT = N->getOperand(i).getValueType(); 12036 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 12037 Entry.Node = N->getOperand(i); 12038 Entry.Ty = ArgTy; 12039 Entry.isSExt = isSigned; 12040 Entry.isZExt = !isSigned; 12041 Args.push_back(Entry); 12042 } 12043 return Args; 12044 } 12045 12046 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 12047 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 12048 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI()) && 12049 "Register-based DivRem lowering only"); 12050 unsigned Opcode = Op->getOpcode(); 12051 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 12052 "Invalid opcode for Div/Rem lowering"); 12053 bool isSigned = (Opcode == ISD::SDIVREM); 12054 EVT VT = Op->getValueType(0); 12055 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 12056 12057 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 12058 VT.getSimpleVT().SimpleTy); 12059 SDValue InChain = DAG.getEntryNode(); 12060 12061 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 12062 DAG.getContext()); 12063 12064 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 12065 getPointerTy(DAG.getDataLayout())); 12066 12067 Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); 12068 12069 SDLoc dl(Op); 12070 TargetLowering::CallLoweringInfo CLI(DAG); 12071 CLI.setDebugLoc(dl).setChain(InChain) 12072 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 12073 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 12074 12075 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 12076 return CallInfo.first; 12077 } 12078 12079 // Lowers REM using divmod helpers 12080 // see RTABI section 4.2/4.3 12081 
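// For example, on AEABI targets an i32 srem becomes a call to
// __aeabi_idivmod, which returns the {quotient, remainder} pair (in r0/r1);
// only the remainder half of the result is used here.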
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 12082 // Build return types (div and rem) 12083 std::vector<Type*> RetTyParams; 12084 Type *RetTyElement; 12085 12086 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 12087 default: llvm_unreachable("Unexpected request for libcall!"); 12088 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 12089 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 12090 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 12091 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 12092 } 12093 12094 RetTyParams.push_back(RetTyElement); 12095 RetTyParams.push_back(RetTyElement); 12096 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 12097 Type *RetTy = StructType::get(*DAG.getContext(), ret); 12098 12099 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 12100 SimpleTy); 12101 SDValue InChain = DAG.getEntryNode(); 12102 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext()); 12103 bool isSigned = N->getOpcode() == ISD::SREM; 12104 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 12105 getPointerTy(DAG.getDataLayout())); 12106 12107 // Lower call 12108 CallLoweringInfo CLI(DAG); 12109 CLI.setChain(InChain) 12110 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 12111 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 12112 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 12113 12114 // Return second (rem) result operand (first contains div) 12115 SDNode *ResNode = CallResult.first.getNode(); 12116 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 12117 return ResNode->getOperand(1); 12118 } 12119 12120 SDValue 12121 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 12122 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 12123 SDLoc DL(Op); 12124 12125 // Get the inputs. 
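  // Roughly: the allocation size in bytes is converted to a count of 4-byte
  // words and handed to the Windows stack-probe machinery in R4 via
  // WIN__CHKSTK; the adjusted stack pointer is then read back as the result.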
12126 SDValue Chain = Op.getOperand(0); 12127 SDValue Size = Op.getOperand(1); 12128 12129 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 12130 DAG.getConstant(2, DL, MVT::i32)); 12131 12132 SDValue Flag; 12133 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 12134 Flag = Chain.getValue(1); 12135 12136 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 12137 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 12138 12139 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 12140 Chain = NewSP.getValue(1); 12141 12142 SDValue Ops[2] = { NewSP, Chain }; 12143 return DAG.getMergeValues(Ops, DL); 12144 } 12145 12146 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 12147 assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && 12148 "Unexpected type for custom-lowering FP_EXTEND"); 12149 12150 RTLIB::Libcall LC; 12151 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 12152 12153 SDValue SrcVal = Op.getOperand(0); 12154 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 12155 SDLoc(Op)).first; 12156 } 12157 12158 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 12159 assert(Op.getOperand(0).getValueType() == MVT::f64 && 12160 Subtarget->isFPOnlySP() && 12161 "Unexpected type for custom-lowering FP_ROUND"); 12162 12163 RTLIB::Libcall LC; 12164 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 12165 12166 SDValue SrcVal = Op.getOperand(0); 12167 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 12168 SDLoc(Op)).first; 12169 } 12170 12171 bool 12172 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 12173 // The ARM target isn't yet aware of offsets. 12174 return false; 12175 } 12176 12177 bool ARM::isBitFieldInvertedMask(unsigned v) { 12178 if (v == 0xffffffff) 12179 return false; 12180 12181 // there can be 1's on either or both "outsides", all the "inside" 12182 // bits must be 0's 12183 return isShiftedMask_32(~v); 12184 } 12185 12186 /// isFPImmLegal - Returns true if the target can instruction select the 12187 /// specified FP immediate natively. If false, the legalizer will 12188 /// materialize the FP immediate as a load from a constant pool. 12189 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 12190 if (!Subtarget->hasVFP3()) 12191 return false; 12192 if (VT == MVT::f32) 12193 return ARM_AM::getFP32Imm(Imm) != -1; 12194 if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) 12195 return ARM_AM::getFP64Imm(Imm) != -1; 12196 return false; 12197 } 12198 12199 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 12200 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 12201 /// specified in the intrinsic calls. 12202 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 12203 const CallInst &I, 12204 unsigned Intrinsic) const { 12205 switch (Intrinsic) { 12206 case Intrinsic::arm_neon_vld1: 12207 case Intrinsic::arm_neon_vld2: 12208 case Intrinsic::arm_neon_vld3: 12209 case Intrinsic::arm_neon_vld4: 12210 case Intrinsic::arm_neon_vld2lane: 12211 case Intrinsic::arm_neon_vld3lane: 12212 case Intrinsic::arm_neon_vld4lane: { 12213 Info.opc = ISD::INTRINSIC_W_CHAIN; 12214 // Conservatively set memVT to the entire set of vectors loaded. 
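    // For example, a vld4 returning four <4 x i32> vectors (512 bits in
    // total) is described as v8i64 here.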
12215 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 12216 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 12217 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 12218 Info.ptrVal = I.getArgOperand(0); 12219 Info.offset = 0; 12220 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 12221 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 12222 Info.vol = false; // volatile loads with NEON intrinsics not supported 12223 Info.readMem = true; 12224 Info.writeMem = false; 12225 return true; 12226 } 12227 case Intrinsic::arm_neon_vst1: 12228 case Intrinsic::arm_neon_vst2: 12229 case Intrinsic::arm_neon_vst3: 12230 case Intrinsic::arm_neon_vst4: 12231 case Intrinsic::arm_neon_vst2lane: 12232 case Intrinsic::arm_neon_vst3lane: 12233 case Intrinsic::arm_neon_vst4lane: { 12234 Info.opc = ISD::INTRINSIC_VOID; 12235 // Conservatively set memVT to the entire set of vectors stored. 12236 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 12237 unsigned NumElts = 0; 12238 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 12239 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 12240 if (!ArgTy->isVectorTy()) 12241 break; 12242 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 12243 } 12244 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 12245 Info.ptrVal = I.getArgOperand(0); 12246 Info.offset = 0; 12247 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 12248 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 12249 Info.vol = false; // volatile stores with NEON intrinsics not supported 12250 Info.readMem = false; 12251 Info.writeMem = true; 12252 return true; 12253 } 12254 case Intrinsic::arm_ldaex: 12255 case Intrinsic::arm_ldrex: { 12256 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 12257 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 12258 Info.opc = ISD::INTRINSIC_W_CHAIN; 12259 Info.memVT = MVT::getVT(PtrTy->getElementType()); 12260 Info.ptrVal = I.getArgOperand(0); 12261 Info.offset = 0; 12262 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 12263 Info.vol = true; 12264 Info.readMem = true; 12265 Info.writeMem = false; 12266 return true; 12267 } 12268 case Intrinsic::arm_stlex: 12269 case Intrinsic::arm_strex: { 12270 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 12271 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 12272 Info.opc = ISD::INTRINSIC_W_CHAIN; 12273 Info.memVT = MVT::getVT(PtrTy->getElementType()); 12274 Info.ptrVal = I.getArgOperand(1); 12275 Info.offset = 0; 12276 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 12277 Info.vol = true; 12278 Info.readMem = false; 12279 Info.writeMem = true; 12280 return true; 12281 } 12282 case Intrinsic::arm_stlexd: 12283 case Intrinsic::arm_strexd: { 12284 Info.opc = ISD::INTRINSIC_W_CHAIN; 12285 Info.memVT = MVT::i64; 12286 Info.ptrVal = I.getArgOperand(2); 12287 Info.offset = 0; 12288 Info.align = 8; 12289 Info.vol = true; 12290 Info.readMem = false; 12291 Info.writeMem = true; 12292 return true; 12293 } 12294 case Intrinsic::arm_ldaexd: 12295 case Intrinsic::arm_ldrexd: { 12296 Info.opc = ISD::INTRINSIC_W_CHAIN; 12297 Info.memVT = MVT::i64; 12298 Info.ptrVal = I.getArgOperand(0); 12299 Info.offset = 0; 12300 Info.align = 8; 12301 Info.vol = true; 12302 Info.readMem = true; 12303 Info.writeMem = false; 12304 return true; 12305 } 12306 default: 12307 break; 12308 } 12309 12310 return false; 
}

/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 CPUs can support data barriers with an MCR instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 AtomicOrdering Ord,
                                                 bool IsStore,
                                                 bool IsLoad) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!IsStore)
      return nullptr; // Nothing to do
    /*FALLTHROUGH*/
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  AtomicOrdering Ord,
                                                  bool IsStore,
                                                  bool IsLoad) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}

// Loads and stores less than 64 bits wide are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex-M doesn't have ldrexd/strexd, though, so don't emit
// anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}

// Loads and stores less than 64 bits wide are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex-M doesn't have ldrexd/strexd, though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U))
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
    AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  return getTargetMachine().getOptLevel() != 0;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
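// (Roughly: returning true here makes stack-protector code fetch the guard
// value through the LOAD_STACK_GUARD pseudo, so the MachO-specific access
// sequence can be expanded after instruction selection.)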
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating-point values and vector values map to the same register file.
  // Although we could therefore do a store-and-extract on a vector type, it is
  // better to leave such values as floats, since we have more freedom in the
  // addressing modes for those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ?
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 12517 Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys); 12518 12519 return Builder.CreateTruncOrBitCast( 12520 Builder.CreateCall(Ldrex, Addr), 12521 cast<PointerType>(Addr->getType())->getElementType()); 12522 } 12523 12524 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 12525 IRBuilder<> &Builder) const { 12526 if (!Subtarget->hasV7Ops()) 12527 return; 12528 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 12529 Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 12530 } 12531 12532 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, 12533 Value *Addr, 12534 AtomicOrdering Ord) const { 12535 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 12536 bool IsRelease = isReleaseOrStronger(Ord); 12537 12538 // Since the intrinsics must have legal type, the i64 intrinsics take two 12539 // parameters: "i32, i32". We must marshal Val into the appropriate form 12540 // before the call. 12541 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 12542 Intrinsic::ID Int = 12543 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 12544 Function *Strex = Intrinsic::getDeclaration(M, Int); 12545 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 12546 12547 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 12548 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 12549 if (!Subtarget->isLittle()) 12550 std::swap (Lo, Hi); 12551 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 12552 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 12553 } 12554 12555 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; 12556 Type *Tys[] = { Addr->getType() }; 12557 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 12558 12559 return Builder.CreateCall( 12560 Strex, {Builder.CreateZExtOrBitCast( 12561 Val, Strex->getFunctionType()->getParamType(0)), 12562 Addr}); 12563 } 12564 12565 /// \brief Lower an interleaved load into a vldN intrinsic. 12566 /// 12567 /// E.g. Lower an interleaved load (Factor = 2): 12568 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 12569 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 12570 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 12571 /// 12572 /// Into: 12573 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) 12574 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 12575 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 12576 bool ARMTargetLowering::lowerInterleavedLoad( 12577 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 12578 ArrayRef<unsigned> Indices, unsigned Factor) const { 12579 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 12580 "Invalid interleave factor"); 12581 assert(!Shuffles.empty() && "Empty shufflevector input"); 12582 assert(Shuffles.size() == Indices.size() && 12583 "Unmatched number of shufflevectors and indices"); 12584 12585 VectorType *VecTy = Shuffles[0]->getType(); 12586 Type *EltTy = VecTy->getVectorElementType(); 12587 12588 const DataLayout &DL = LI->getModule()->getDataLayout(); 12589 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 12590 bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; 12591 12592 // Skip if we do not have NEON and skip illegal vector types and vector types 12593 // with i64/f64 elements (vldN doesn't support i64/f64 elements). 
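  // In other words, each of the vectors produced by the vldN call must fit
  // exactly in a single D register (64 bits) or Q register (128 bits).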
12594 if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) 12595 return false; 12596 12597 // A pointer vector can not be the return type of the ldN intrinsics. Need to 12598 // load integer vectors first and then convert to pointer vectors. 12599 if (EltTy->isPointerTy()) 12600 VecTy = 12601 VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); 12602 12603 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, 12604 Intrinsic::arm_neon_vld3, 12605 Intrinsic::arm_neon_vld4}; 12606 12607 IRBuilder<> Builder(LI); 12608 SmallVector<Value *, 2> Ops; 12609 12610 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); 12611 Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); 12612 Ops.push_back(Builder.getInt32(LI->getAlignment())); 12613 12614 Type *Tys[] = { VecTy, Int8Ptr }; 12615 Function *VldnFunc = 12616 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 12617 CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); 12618 12619 // Replace uses of each shufflevector with the corresponding vector loaded 12620 // by ldN. 12621 for (unsigned i = 0; i < Shuffles.size(); i++) { 12622 ShuffleVectorInst *SV = Shuffles[i]; 12623 unsigned Index = Indices[i]; 12624 12625 Value *SubVec = Builder.CreateExtractValue(VldN, Index); 12626 12627 // Convert the integer vector to pointer vector if the element is pointer. 12628 if (EltTy->isPointerTy()) 12629 SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); 12630 12631 SV->replaceAllUsesWith(SubVec); 12632 } 12633 12634 return true; 12635 } 12636 12637 /// \brief Get a mask consisting of sequential integers starting from \p Start. 12638 /// 12639 /// I.e. <Start, Start + 1, ..., Start + NumElts - 1> 12640 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, 12641 unsigned NumElts) { 12642 SmallVector<Constant *, 16> Mask; 12643 for (unsigned i = 0; i < NumElts; i++) 12644 Mask.push_back(Builder.getInt32(Start + i)); 12645 12646 return ConstantVector::get(Mask); 12647 } 12648 12649 /// \brief Lower an interleaved store into a vstN intrinsic. 12650 /// 12651 /// E.g. Lower an interleaved store (Factor = 3): 12652 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 12653 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 12654 /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 12655 /// 12656 /// Into: 12657 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 12658 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 12659 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 12660 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 12661 /// 12662 /// Note that the new shufflevectors will be removed and we'll only generate one 12663 /// vst3 instruction in CodeGen. 
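/// As in the load case, the interleave factor selects between vst2, vst3 and
/// vst4 via the StoreInts table below.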
12664 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, 12665 ShuffleVectorInst *SVI, 12666 unsigned Factor) const { 12667 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 12668 "Invalid interleave factor"); 12669 12670 VectorType *VecTy = SVI->getType(); 12671 assert(VecTy->getVectorNumElements() % Factor == 0 && 12672 "Invalid interleaved store"); 12673 12674 unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; 12675 Type *EltTy = VecTy->getVectorElementType(); 12676 VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); 12677 12678 const DataLayout &DL = SI->getModule()->getDataLayout(); 12679 unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); 12680 bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; 12681 12682 // Skip if we do not have NEON and skip illegal vector types and vector types 12683 // with i64/f64 elements (vstN doesn't support i64/f64 elements). 12684 if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || 12685 EltIs64Bits) 12686 return false; 12687 12688 Value *Op0 = SVI->getOperand(0); 12689 Value *Op1 = SVI->getOperand(1); 12690 IRBuilder<> Builder(SI); 12691 12692 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 12693 // vectors to integer vectors. 12694 if (EltTy->isPointerTy()) { 12695 Type *IntTy = DL.getIntPtrType(EltTy); 12696 12697 // Convert to the corresponding integer vector. 12698 Type *IntVecTy = 12699 VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); 12700 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 12701 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 12702 12703 SubVecTy = VectorType::get(IntTy, NumSubElts); 12704 } 12705 12706 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, 12707 Intrinsic::arm_neon_vst3, 12708 Intrinsic::arm_neon_vst4}; 12709 SmallVector<Value *, 6> Ops; 12710 12711 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); 12712 Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); 12713 12714 Type *Tys[] = { Int8Ptr, SubVecTy }; 12715 Function *VstNFunc = Intrinsic::getDeclaration( 12716 SI->getModule(), StoreInts[Factor - 2], Tys); 12717 12718 // Split the shufflevector operands into sub vectors for the new vstN call. 
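  // For the Factor == 3 example in the comment above, with <4 x i32>
  // sub-vectors, this produces the masks <0,1,2,3>, <4,5,6,7> and <8,9,10,11>.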
12719 for (unsigned i = 0; i < Factor; i++) 12720 Ops.push_back(Builder.CreateShuffleVector( 12721 Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); 12722 12723 Ops.push_back(Builder.getInt32(SI->getAlignment())); 12724 Builder.CreateCall(VstNFunc, Ops); 12725 return true; 12726 } 12727 12728 enum HABaseType { 12729 HA_UNKNOWN = 0, 12730 HA_FLOAT, 12731 HA_DOUBLE, 12732 HA_VECT64, 12733 HA_VECT128 12734 }; 12735 12736 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, 12737 uint64_t &Members) { 12738 if (auto *ST = dyn_cast<StructType>(Ty)) { 12739 for (unsigned i = 0; i < ST->getNumElements(); ++i) { 12740 uint64_t SubMembers = 0; 12741 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) 12742 return false; 12743 Members += SubMembers; 12744 } 12745 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { 12746 uint64_t SubMembers = 0; 12747 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) 12748 return false; 12749 Members += SubMembers * AT->getNumElements(); 12750 } else if (Ty->isFloatTy()) { 12751 if (Base != HA_UNKNOWN && Base != HA_FLOAT) 12752 return false; 12753 Members = 1; 12754 Base = HA_FLOAT; 12755 } else if (Ty->isDoubleTy()) { 12756 if (Base != HA_UNKNOWN && Base != HA_DOUBLE) 12757 return false; 12758 Members = 1; 12759 Base = HA_DOUBLE; 12760 } else if (auto *VT = dyn_cast<VectorType>(Ty)) { 12761 Members = 1; 12762 switch (Base) { 12763 case HA_FLOAT: 12764 case HA_DOUBLE: 12765 return false; 12766 case HA_VECT64: 12767 return VT->getBitWidth() == 64; 12768 case HA_VECT128: 12769 return VT->getBitWidth() == 128; 12770 case HA_UNKNOWN: 12771 switch (VT->getBitWidth()) { 12772 case 64: 12773 Base = HA_VECT64; 12774 return true; 12775 case 128: 12776 Base = HA_VECT128; 12777 return true; 12778 default: 12779 return false; 12780 } 12781 } 12782 } 12783 12784 return (Members > 0 && Members <= 4); 12785 } 12786 12787 /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of 12788 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when 12789 /// passing according to AAPCS rules. 12790 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( 12791 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 12792 if (getEffectiveCallingConv(CallConv, isVarArg) != 12793 CallingConv::ARM_AAPCS_VFP) 12794 return false; 12795 12796 HABaseType Base = HA_UNKNOWN; 12797 uint64_t Members = 0; 12798 bool IsHA = isHomogeneousAggregate(Ty, Base, Members); 12799 DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); 12800 12801 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); 12802 return IsHA || IsIntArray; 12803 } 12804 12805 unsigned ARMTargetLowering::getExceptionPointerRegister( 12806 const Constant *PersonalityFn) const { 12807 // Platforms which do not use SjLj EH may return values in these registers 12808 // via the personality function. 12809 return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; 12810 } 12811 12812 unsigned ARMTargetLowering::getExceptionSelectorRegister( 12813 const Constant *PersonalityFn) const { 12814 // Platforms which do not use SjLj EH may return values in these registers 12815 // via the personality function. 12816 return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; 12817 } 12818 12819 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 12820 // Update IsSplitCSR in ARMFunctionInfo. 
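  // Split CSR is used by calling conventions such as CXX_FAST_TLS: instead of
  // the usual prologue/epilogue spills, callee-saved registers are preserved
  // via explicit copies inserted by insertCopiesSplitCSR below.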
12821 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); 12822 AFI->setIsSplitCSR(true); 12823 } 12824 12825 void ARMTargetLowering::insertCopiesSplitCSR( 12826 MachineBasicBlock *Entry, 12827 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 12828 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 12829 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 12830 if (!IStart) 12831 return; 12832 12833 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 12834 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 12835 MachineBasicBlock::iterator MBBI = Entry->begin(); 12836 for (const MCPhysReg *I = IStart; *I; ++I) { 12837 const TargetRegisterClass *RC = nullptr; 12838 if (ARM::GPRRegClass.contains(*I)) 12839 RC = &ARM::GPRRegClass; 12840 else if (ARM::DPRRegClass.contains(*I)) 12841 RC = &ARM::DPRRegClass; 12842 else 12843 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 12844 12845 unsigned NewVR = MRI->createVirtualRegister(RC); 12846 // Create copy from CSR to a virtual register. 12847 // FIXME: this currently does not emit CFI pseudo-instructions, it works 12848 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 12849 // nounwind. If we want to generalize this later, we may need to emit 12850 // CFI pseudo-instructions. 12851 assert(Entry->getParent()->getFunction()->hasFnAttribute( 12852 Attribute::NoUnwind) && 12853 "Function should be nounwind in insertCopiesSplitCSR!"); 12854 Entry->addLiveIn(*I); 12855 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 12856 .addReg(*I); 12857 12858 // Insert the copy-back instructions right before the terminator. 12859 for (auto *Exit : Exits) 12860 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 12861 TII->get(TargetOpcode::COPY), *I) 12862 .addReg(NewVR); 12863 } 12864 } 12865