1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the AArch64TargetLowering class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AArch64ISelLowering.h" 15 #include "AArch64CallingConvention.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PerfectShuffle.h" 18 #include "AArch64Subtarget.h" 19 #include "AArch64TargetMachine.h" 20 #include "AArch64TargetObjectFile.h" 21 #include "MCTargetDesc/AArch64AddressingModes.h" 22 #include "llvm/ADT/Statistic.h" 23 #include "llvm/CodeGen/CallingConvLower.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineRegisterInfo.h" 27 #include "llvm/IR/Function.h" 28 #include "llvm/IR/GetElementPtrTypeIterator.h" 29 #include "llvm/IR/Intrinsics.h" 30 #include "llvm/IR/Type.h" 31 #include "llvm/Support/CommandLine.h" 32 #include "llvm/Support/Debug.h" 33 #include "llvm/Support/ErrorHandling.h" 34 #include "llvm/Support/raw_ostream.h" 35 #include "llvm/Target/TargetOptions.h" 36 using namespace llvm; 37 38 #define DEBUG_TYPE "aarch64-lower" 39 40 STATISTIC(NumTailCalls, "Number of tail calls"); 41 STATISTIC(NumShiftInserts, "Number of vector shift inserts"); 42 43 static cl::opt<bool> 44 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, 45 cl::desc("Allow AArch64 SLI/SRI formation"), 46 cl::init(false)); 47 48 // FIXME: The necessary dtprel relocations don't seem to be supported 49 // well in the GNU bfd and gold linkers at the moment. Therefore, by 50 // default, for now, fall back to GeneralDynamic code generation. 51 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( 52 "aarch64-elf-ldtls-generation", cl::Hidden, 53 cl::desc("Allow AArch64 Local Dynamic TLS code generation"), 54 cl::init(false)); 55 56 /// Value type used for condition codes. 57 static const MVT MVT_CC = MVT::i32; 58 59 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 60 const AArch64Subtarget &STI) 61 : TargetLowering(TM), Subtarget(&STI) { 62 63 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so 64 // we have to make something up. Arbitrarily, choose ZeroOrOne. 65 setBooleanContents(ZeroOrOneBooleanContent); 66 // When comparing vectors the result sets the different elements in the 67 // vector to all-one or all-zero. 68 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 69 70 // Set up the register classes. 71 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); 72 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); 73 74 if (Subtarget->hasFPARMv8()) { 75 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); 76 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); 77 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); 78 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); 79 } 80 81 if (Subtarget->hasNEON()) { 82 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); 83 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); 84 // Someone set us up the NEON. 
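// Each 64-bit vector type below goes into the D (FPR64) register class and
// each 128-bit type into the Q (FPR128) register class; addDRTypeForNEON and
// addQRTypeForNEON (defined further down) also apply the shared NEON
// operation actions via addTypeForNEON.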
85 addDRTypeForNEON(MVT::v2f32); 86 addDRTypeForNEON(MVT::v8i8); 87 addDRTypeForNEON(MVT::v4i16); 88 addDRTypeForNEON(MVT::v2i32); 89 addDRTypeForNEON(MVT::v1i64); 90 addDRTypeForNEON(MVT::v1f64); 91 addDRTypeForNEON(MVT::v4f16); 92 93 addQRTypeForNEON(MVT::v4f32); 94 addQRTypeForNEON(MVT::v2f64); 95 addQRTypeForNEON(MVT::v16i8); 96 addQRTypeForNEON(MVT::v8i16); 97 addQRTypeForNEON(MVT::v4i32); 98 addQRTypeForNEON(MVT::v2i64); 99 addQRTypeForNEON(MVT::v8f16); 100 } 101 102 // Compute derived properties from the register classes 103 computeRegisterProperties(Subtarget->getRegisterInfo()); 104 105 // Provide all sorts of operation actions 106 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 107 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 108 setOperationAction(ISD::SETCC, MVT::i32, Custom); 109 setOperationAction(ISD::SETCC, MVT::i64, Custom); 110 setOperationAction(ISD::SETCC, MVT::f32, Custom); 111 setOperationAction(ISD::SETCC, MVT::f64, Custom); 112 setOperationAction(ISD::BRCOND, MVT::Other, Expand); 113 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 114 setOperationAction(ISD::BR_CC, MVT::i64, Custom); 115 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 116 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 117 setOperationAction(ISD::SELECT, MVT::i32, Custom); 118 setOperationAction(ISD::SELECT, MVT::i64, Custom); 119 setOperationAction(ISD::SELECT, MVT::f32, Custom); 120 setOperationAction(ISD::SELECT, MVT::f64, Custom); 121 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 122 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); 123 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 124 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 125 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 126 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 127 128 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); 129 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); 130 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); 131 132 setOperationAction(ISD::FREM, MVT::f32, Expand); 133 setOperationAction(ISD::FREM, MVT::f64, Expand); 134 setOperationAction(ISD::FREM, MVT::f80, Expand); 135 136 // Custom lowering hooks are needed for XOR 137 // to fold it into CSINC/CSINV. 138 setOperationAction(ISD::XOR, MVT::i32, Custom); 139 setOperationAction(ISD::XOR, MVT::i64, Custom); 140 141 // Virtually no operation on f128 is legal, but LLVM can't expand them when 142 // there's a valid register class, so we need custom operations in most cases. 
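// As a rough guide (not spelled out here): the Custom f128 entries below are
// typically turned into quad-precision libcalls, e.g. an f128 FADD becomes a
// call such as __addtf3, while the Expand entries are left to the generic
// legalizer.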
143 setOperationAction(ISD::FABS, MVT::f128, Expand); 144 setOperationAction(ISD::FADD, MVT::f128, Custom); 145 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); 146 setOperationAction(ISD::FCOS, MVT::f128, Expand); 147 setOperationAction(ISD::FDIV, MVT::f128, Custom); 148 setOperationAction(ISD::FMA, MVT::f128, Expand); 149 setOperationAction(ISD::FMUL, MVT::f128, Custom); 150 setOperationAction(ISD::FNEG, MVT::f128, Expand); 151 setOperationAction(ISD::FPOW, MVT::f128, Expand); 152 setOperationAction(ISD::FREM, MVT::f128, Expand); 153 setOperationAction(ISD::FRINT, MVT::f128, Expand); 154 setOperationAction(ISD::FSIN, MVT::f128, Expand); 155 setOperationAction(ISD::FSINCOS, MVT::f128, Expand); 156 setOperationAction(ISD::FSQRT, MVT::f128, Expand); 157 setOperationAction(ISD::FSUB, MVT::f128, Custom); 158 setOperationAction(ISD::FTRUNC, MVT::f128, Expand); 159 setOperationAction(ISD::SETCC, MVT::f128, Custom); 160 setOperationAction(ISD::BR_CC, MVT::f128, Custom); 161 setOperationAction(ISD::SELECT, MVT::f128, Custom); 162 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); 163 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); 164 165 // Lowering for many of the conversions is actually specified by the non-f128 166 // type. The LowerXXX function will be trivial when f128 isn't involved. 167 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 168 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 169 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); 170 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 171 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 172 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); 173 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 174 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 175 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); 176 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 177 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 178 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); 179 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 180 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); 181 182 // Variable arguments. 183 setOperationAction(ISD::VASTART, MVT::Other, Custom); 184 setOperationAction(ISD::VAARG, MVT::Other, Custom); 185 setOperationAction(ISD::VACOPY, MVT::Other, Custom); 186 setOperationAction(ISD::VAEND, MVT::Other, Expand); 187 188 // Variable-sized objects. 189 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 190 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 191 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 192 193 // Constant pool entries 194 setOperationAction(ISD::ConstantPool, MVT::i64, Custom); 195 196 // BlockAddress 197 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 198 199 // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 200 setOperationAction(ISD::ADDC, MVT::i32, Custom); 201 setOperationAction(ISD::ADDE, MVT::i32, Custom); 202 setOperationAction(ISD::SUBC, MVT::i32, Custom); 203 setOperationAction(ISD::SUBE, MVT::i32, Custom); 204 setOperationAction(ISD::ADDC, MVT::i64, Custom); 205 setOperationAction(ISD::ADDE, MVT::i64, Custom); 206 setOperationAction(ISD::SUBC, MVT::i64, Custom); 207 setOperationAction(ISD::SUBE, MVT::i64, Custom); 208 209 // AArch64 lacks both left-rotate and popcount instructions. 
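// A left-rotate can still be selected efficiently after expansion since, for
// example, an i32 (rotl x, n) is equivalent to (rotr x, (32 - n) & 31) and
// ROTR maps onto ROR; scalar CTPOP is custom-lowered below, roughly by moving
// the value into a NEON register and using CNT plus an add-across-vector.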
210 setOperationAction(ISD::ROTL, MVT::i32, Expand); 211 setOperationAction(ISD::ROTL, MVT::i64, Expand); 212 for (MVT VT : MVT::vector_valuetypes()) { 213 setOperationAction(ISD::ROTL, VT, Expand); 214 setOperationAction(ISD::ROTR, VT, Expand); 215 } 216 217 // AArch64 doesn't have {U|S}MUL_LOHI. 218 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 219 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 220 221 222 setOperationAction(ISD::CTPOP, MVT::i32, Custom); 223 setOperationAction(ISD::CTPOP, MVT::i64, Custom); 224 225 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 226 setOperationAction(ISD::SDIVREM, MVT::i64, Expand); 227 for (MVT VT : MVT::vector_valuetypes()) { 228 setOperationAction(ISD::SDIVREM, VT, Expand); 229 setOperationAction(ISD::UDIVREM, VT, Expand); 230 } 231 setOperationAction(ISD::SREM, MVT::i32, Expand); 232 setOperationAction(ISD::SREM, MVT::i64, Expand); 233 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 234 setOperationAction(ISD::UDIVREM, MVT::i64, Expand); 235 setOperationAction(ISD::UREM, MVT::i32, Expand); 236 setOperationAction(ISD::UREM, MVT::i64, Expand); 237 238 // Custom lower Add/Sub/Mul with overflow. 239 setOperationAction(ISD::SADDO, MVT::i32, Custom); 240 setOperationAction(ISD::SADDO, MVT::i64, Custom); 241 setOperationAction(ISD::UADDO, MVT::i32, Custom); 242 setOperationAction(ISD::UADDO, MVT::i64, Custom); 243 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 244 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 245 setOperationAction(ISD::USUBO, MVT::i32, Custom); 246 setOperationAction(ISD::USUBO, MVT::i64, Custom); 247 setOperationAction(ISD::SMULO, MVT::i32, Custom); 248 setOperationAction(ISD::SMULO, MVT::i64, Custom); 249 setOperationAction(ISD::UMULO, MVT::i32, Custom); 250 setOperationAction(ISD::UMULO, MVT::i64, Custom); 251 252 setOperationAction(ISD::FSIN, MVT::f32, Expand); 253 setOperationAction(ISD::FSIN, MVT::f64, Expand); 254 setOperationAction(ISD::FCOS, MVT::f32, Expand); 255 setOperationAction(ISD::FCOS, MVT::f64, Expand); 256 setOperationAction(ISD::FPOW, MVT::f32, Expand); 257 setOperationAction(ISD::FPOW, MVT::f64, Expand); 258 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 259 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 260 261 // f16 is a storage-only type, always promote it to f32. 
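// Promote means the f16 operation is carried out in f32: the inputs are
// extended, the f32 operation is used, and the result is truncated back.
// For example, an f16 fadd becomes roughly:
//   fcvt s0, h0
//   fcvt s1, h1
//   fadd s0, s0, s1
//   fcvt h0, s0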
262 setOperationAction(ISD::SETCC, MVT::f16, Promote); 263 setOperationAction(ISD::BR_CC, MVT::f16, Promote); 264 setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); 265 setOperationAction(ISD::SELECT, MVT::f16, Promote); 266 setOperationAction(ISD::FADD, MVT::f16, Promote); 267 setOperationAction(ISD::FSUB, MVT::f16, Promote); 268 setOperationAction(ISD::FMUL, MVT::f16, Promote); 269 setOperationAction(ISD::FDIV, MVT::f16, Promote); 270 setOperationAction(ISD::FREM, MVT::f16, Promote); 271 setOperationAction(ISD::FMA, MVT::f16, Promote); 272 setOperationAction(ISD::FNEG, MVT::f16, Promote); 273 setOperationAction(ISD::FABS, MVT::f16, Promote); 274 setOperationAction(ISD::FCEIL, MVT::f16, Promote); 275 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); 276 setOperationAction(ISD::FCOS, MVT::f16, Promote); 277 setOperationAction(ISD::FFLOOR, MVT::f16, Promote); 278 setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); 279 setOperationAction(ISD::FPOW, MVT::f16, Promote); 280 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 281 setOperationAction(ISD::FRINT, MVT::f16, Promote); 282 setOperationAction(ISD::FSIN, MVT::f16, Promote); 283 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 284 setOperationAction(ISD::FSQRT, MVT::f16, Promote); 285 setOperationAction(ISD::FEXP, MVT::f16, Promote); 286 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 287 setOperationAction(ISD::FLOG, MVT::f16, Promote); 288 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 289 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 290 setOperationAction(ISD::FROUND, MVT::f16, Promote); 291 setOperationAction(ISD::FTRUNC, MVT::f16, Promote); 292 setOperationAction(ISD::FMINNUM, MVT::f16, Promote); 293 setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); 294 setOperationAction(ISD::FMINNAN, MVT::f16, Promote); 295 setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); 296 297 // v4f16 is also a storage-only type, so promote it to v4f32 when that is 298 // known to be safe. 299 setOperationAction(ISD::FADD, MVT::v4f16, Promote); 300 setOperationAction(ISD::FSUB, MVT::v4f16, Promote); 301 setOperationAction(ISD::FMUL, MVT::v4f16, Promote); 302 setOperationAction(ISD::FDIV, MVT::v4f16, Promote); 303 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); 304 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); 305 AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); 306 AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); 307 AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); 308 AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); 309 AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); 310 AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); 311 312 // Expand all other v4f16 operations. 
313 // FIXME: We could generate better code by promoting some operations to 314 // a pair of v4f32s 315 setOperationAction(ISD::FABS, MVT::v4f16, Expand); 316 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); 317 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); 318 setOperationAction(ISD::FCOS, MVT::v4f16, Expand); 319 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); 320 setOperationAction(ISD::FMA, MVT::v4f16, Expand); 321 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); 322 setOperationAction(ISD::FNEG, MVT::v4f16, Expand); 323 setOperationAction(ISD::FPOW, MVT::v4f16, Expand); 324 setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); 325 setOperationAction(ISD::FREM, MVT::v4f16, Expand); 326 setOperationAction(ISD::FROUND, MVT::v4f16, Expand); 327 setOperationAction(ISD::FRINT, MVT::v4f16, Expand); 328 setOperationAction(ISD::FSIN, MVT::v4f16, Expand); 329 setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); 330 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); 331 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); 332 setOperationAction(ISD::SETCC, MVT::v4f16, Expand); 333 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); 334 setOperationAction(ISD::SELECT, MVT::v4f16, Expand); 335 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); 336 setOperationAction(ISD::FEXP, MVT::v4f16, Expand); 337 setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); 338 setOperationAction(ISD::FLOG, MVT::v4f16, Expand); 339 setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); 340 setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); 341 342 343 // v8f16 is also a storage-only type, so expand it. 344 setOperationAction(ISD::FABS, MVT::v8f16, Expand); 345 setOperationAction(ISD::FADD, MVT::v8f16, Expand); 346 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); 347 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); 348 setOperationAction(ISD::FCOS, MVT::v8f16, Expand); 349 setOperationAction(ISD::FDIV, MVT::v8f16, Expand); 350 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); 351 setOperationAction(ISD::FMA, MVT::v8f16, Expand); 352 setOperationAction(ISD::FMUL, MVT::v8f16, Expand); 353 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); 354 setOperationAction(ISD::FNEG, MVT::v8f16, Expand); 355 setOperationAction(ISD::FPOW, MVT::v8f16, Expand); 356 setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); 357 setOperationAction(ISD::FREM, MVT::v8f16, Expand); 358 setOperationAction(ISD::FROUND, MVT::v8f16, Expand); 359 setOperationAction(ISD::FRINT, MVT::v8f16, Expand); 360 setOperationAction(ISD::FSIN, MVT::v8f16, Expand); 361 setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); 362 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); 363 setOperationAction(ISD::FSUB, MVT::v8f16, Expand); 364 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); 365 setOperationAction(ISD::SETCC, MVT::v8f16, Expand); 366 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); 367 setOperationAction(ISD::SELECT, MVT::v8f16, Expand); 368 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); 369 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); 370 setOperationAction(ISD::FEXP, MVT::v8f16, Expand); 371 setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); 372 setOperationAction(ISD::FLOG, MVT::v8f16, Expand); 373 setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); 374 setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); 375 376 // AArch64 has implementations of a lot of rounding-like FP operations. 
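// These generally map one-to-one onto instructions: FFLOOR -> FRINTM,
// FCEIL -> FRINTP, FTRUNC -> FRINTZ, FROUND -> FRINTA, FRINT -> FRINTX,
// FNEARBYINT -> FRINTI, and FMINNUM/FMAXNUM/FMINNAN/FMAXNAN ->
// FMINNM/FMAXNM/FMIN/FMAX.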
377 for (MVT Ty : {MVT::f32, MVT::f64}) { 378 setOperationAction(ISD::FFLOOR, Ty, Legal); 379 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 380 setOperationAction(ISD::FCEIL, Ty, Legal); 381 setOperationAction(ISD::FRINT, Ty, Legal); 382 setOperationAction(ISD::FTRUNC, Ty, Legal); 383 setOperationAction(ISD::FROUND, Ty, Legal); 384 setOperationAction(ISD::FMINNUM, Ty, Legal); 385 setOperationAction(ISD::FMAXNUM, Ty, Legal); 386 setOperationAction(ISD::FMINNAN, Ty, Legal); 387 setOperationAction(ISD::FMAXNAN, Ty, Legal); 388 } 389 390 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 391 392 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); 393 394 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. 395 // This requires the Performance Monitors extension. 396 if (Subtarget->hasPerfMon()) 397 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 398 399 if (Subtarget->isTargetMachO()) { 400 // For iOS, we don't want the normal expansion of a libcall to 401 // sincos. We want to issue a libcall to __sincos_stret to avoid memory 402 // traffic. 403 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 404 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 405 } else { 406 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 407 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 408 } 409 410 // Make floating-point constants legal for the large code model, so they don't 411 // become loads from the constant pool. 412 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { 413 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 414 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 415 } 416 417 // AArch64 does not have floating-point extending loads, i1 sign-extending 418 // load, floating-point truncating stores, or v2i32->v2i16 truncating store. 419 for (MVT VT : MVT::fp_valuetypes()) { 420 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 421 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 422 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); 423 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); 424 } 425 for (MVT VT : MVT::integer_valuetypes()) 426 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); 427 428 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 429 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 430 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 431 setTruncStoreAction(MVT::f128, MVT::f80, Expand); 432 setTruncStoreAction(MVT::f128, MVT::f64, Expand); 433 setTruncStoreAction(MVT::f128, MVT::f32, Expand); 434 setTruncStoreAction(MVT::f128, MVT::f16, Expand); 435 436 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 437 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 438 439 // Indexed loads and stores are supported. 440 for (unsigned im = (unsigned)ISD::PRE_INC; 441 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 442 setIndexedLoadAction(im, MVT::i8, Legal); 443 setIndexedLoadAction(im, MVT::i16, Legal); 444 setIndexedLoadAction(im, MVT::i32, Legal); 445 setIndexedLoadAction(im, MVT::i64, Legal); 446 setIndexedLoadAction(im, MVT::f64, Legal); 447 setIndexedLoadAction(im, MVT::f32, Legal); 448 setIndexedLoadAction(im, MVT::f16, Legal); 449 setIndexedStoreAction(im, MVT::i8, Legal); 450 setIndexedStoreAction(im, MVT::i16, Legal); 451 setIndexedStoreAction(im, MVT::i32, Legal); 452 setIndexedStoreAction(im, MVT::i64, Legal); 453 setIndexedStoreAction(im, MVT::f64, Legal); 454 setIndexedStoreAction(im, MVT::f32, Legal); 455 setIndexedStoreAction(im, MVT::f16, Legal); 456 } 457 458 // Trap.
459 setOperationAction(ISD::TRAP, MVT::Other, Legal); 460 461 // We combine OR nodes for bitfield operations. 462 setTargetDAGCombine(ISD::OR); 463 464 // Vector add and sub nodes may conceal a high-half opportunity. 465 // Also, try to fold ADD into CSINC/CSINV.. 466 setTargetDAGCombine(ISD::ADD); 467 setTargetDAGCombine(ISD::SUB); 468 setTargetDAGCombine(ISD::SRL); 469 setTargetDAGCombine(ISD::XOR); 470 setTargetDAGCombine(ISD::SINT_TO_FP); 471 setTargetDAGCombine(ISD::UINT_TO_FP); 472 473 setTargetDAGCombine(ISD::FP_TO_SINT); 474 setTargetDAGCombine(ISD::FP_TO_UINT); 475 setTargetDAGCombine(ISD::FDIV); 476 477 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 478 479 setTargetDAGCombine(ISD::ANY_EXTEND); 480 setTargetDAGCombine(ISD::ZERO_EXTEND); 481 setTargetDAGCombine(ISD::SIGN_EXTEND); 482 setTargetDAGCombine(ISD::BITCAST); 483 setTargetDAGCombine(ISD::CONCAT_VECTORS); 484 setTargetDAGCombine(ISD::STORE); 485 if (Subtarget->supportsAddressTopByteIgnored()) 486 setTargetDAGCombine(ISD::LOAD); 487 488 setTargetDAGCombine(ISD::MUL); 489 490 setTargetDAGCombine(ISD::SELECT); 491 setTargetDAGCombine(ISD::VSELECT); 492 493 setTargetDAGCombine(ISD::INTRINSIC_VOID); 494 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 495 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 496 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 497 498 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; 499 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; 500 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; 501 502 setStackPointerRegisterToSaveRestore(AArch64::SP); 503 504 setSchedulingPreference(Sched::Hybrid); 505 506 // Enable TBZ/TBNZ 507 MaskAndBranchFoldingIsLegal = true; 508 EnableExtLdPromotion = true; 509 510 // Set required alignment. 511 setMinFunctionAlignment(2); 512 // Set preferred alignments. 
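// (The alignment values used here are log2, so the minimum function alignment
// of 2 above corresponds to 4-byte-aligned functions.)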
513 setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); 514 setPrefLoopAlignment(STI.getPrefLoopAlignment()); 515 516 setHasExtractBitsInsn(true); 517 518 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 519 520 if (Subtarget->hasNEON()) { 521 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to 522 // silliness like this: 523 setOperationAction(ISD::FABS, MVT::v1f64, Expand); 524 setOperationAction(ISD::FADD, MVT::v1f64, Expand); 525 setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); 526 setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); 527 setOperationAction(ISD::FCOS, MVT::v1f64, Expand); 528 setOperationAction(ISD::FDIV, MVT::v1f64, Expand); 529 setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); 530 setOperationAction(ISD::FMA, MVT::v1f64, Expand); 531 setOperationAction(ISD::FMUL, MVT::v1f64, Expand); 532 setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); 533 setOperationAction(ISD::FNEG, MVT::v1f64, Expand); 534 setOperationAction(ISD::FPOW, MVT::v1f64, Expand); 535 setOperationAction(ISD::FREM, MVT::v1f64, Expand); 536 setOperationAction(ISD::FROUND, MVT::v1f64, Expand); 537 setOperationAction(ISD::FRINT, MVT::v1f64, Expand); 538 setOperationAction(ISD::FSIN, MVT::v1f64, Expand); 539 setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); 540 setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); 541 setOperationAction(ISD::FSUB, MVT::v1f64, Expand); 542 setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); 543 setOperationAction(ISD::SETCC, MVT::v1f64, Expand); 544 setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); 545 setOperationAction(ISD::SELECT, MVT::v1f64, Expand); 546 setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); 547 setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); 548 549 setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); 550 setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); 551 setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); 552 setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); 553 setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); 554 555 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 556 557 // AArch64 doesn't have a direct vector ->f32 conversion instructions for 558 // elements smaller than i32, so promote the input to i32 first. 559 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); 560 setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); 561 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); 562 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); 563 // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16 564 // -> v8f16 conversions. 565 setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote); 566 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote); 567 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); 568 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote); 569 // Similarly, there is no direct i32 -> f64 vector conversion instruction. 570 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 571 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); 572 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); 573 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); 574 // Or, direct i32 -> f16 vector conversion. 
Set it so custom, so the 575 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 576 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); 577 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); 578 579 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 580 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 581 582 setOperationAction(ISD::CTTZ, MVT::v2i8, Expand); 583 setOperationAction(ISD::CTTZ, MVT::v4i16, Expand); 584 setOperationAction(ISD::CTTZ, MVT::v2i32, Expand); 585 setOperationAction(ISD::CTTZ, MVT::v1i64, Expand); 586 setOperationAction(ISD::CTTZ, MVT::v16i8, Expand); 587 setOperationAction(ISD::CTTZ, MVT::v8i16, Expand); 588 setOperationAction(ISD::CTTZ, MVT::v4i32, Expand); 589 setOperationAction(ISD::CTTZ, MVT::v2i64, Expand); 590 591 // AArch64 doesn't have MUL.2d: 592 setOperationAction(ISD::MUL, MVT::v2i64, Expand); 593 // Custom handling for some quad-vector types to detect MULL. 594 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 595 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 596 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 597 598 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); 599 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); 600 // Likewise, narrowing and extending vector loads/stores aren't handled 601 // directly. 602 for (MVT VT : MVT::vector_valuetypes()) { 603 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 604 605 setOperationAction(ISD::MULHS, VT, Expand); 606 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 607 setOperationAction(ISD::MULHU, VT, Expand); 608 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 609 610 setOperationAction(ISD::BSWAP, VT, Expand); 611 612 for (MVT InnerVT : MVT::vector_valuetypes()) { 613 setTruncStoreAction(VT, InnerVT, Expand); 614 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 615 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 616 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 617 } 618 } 619 620 // AArch64 has implementations of a lot of rounding-like FP operations. 621 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { 622 setOperationAction(ISD::FFLOOR, Ty, Legal); 623 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 624 setOperationAction(ISD::FCEIL, Ty, Legal); 625 setOperationAction(ISD::FRINT, Ty, Legal); 626 setOperationAction(ISD::FTRUNC, Ty, Legal); 627 setOperationAction(ISD::FROUND, Ty, Legal); 628 } 629 } 630 631 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); 632 } 633 634 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { 635 if (VT == MVT::v2f32 || VT == MVT::v4f16) { 636 setOperationAction(ISD::LOAD, VT, Promote); 637 AddPromotedToType(ISD::LOAD, VT, MVT::v2i32); 638 639 setOperationAction(ISD::STORE, VT, Promote); 640 AddPromotedToType(ISD::STORE, VT, MVT::v2i32); 641 } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { 642 setOperationAction(ISD::LOAD, VT, Promote); 643 AddPromotedToType(ISD::LOAD, VT, MVT::v2i64); 644 645 setOperationAction(ISD::STORE, VT, Promote); 646 AddPromotedToType(ISD::STORE, VT, MVT::v2i64); 647 } 648 649 // Mark vector float intrinsics as expand. 
650 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { 651 setOperationAction(ISD::FSIN, VT, Expand); 652 setOperationAction(ISD::FCOS, VT, Expand); 653 setOperationAction(ISD::FPOWI, VT, Expand); 654 setOperationAction(ISD::FPOW, VT, Expand); 655 setOperationAction(ISD::FLOG, VT, Expand); 656 setOperationAction(ISD::FLOG2, VT, Expand); 657 setOperationAction(ISD::FLOG10, VT, Expand); 658 setOperationAction(ISD::FEXP, VT, Expand); 659 setOperationAction(ISD::FEXP2, VT, Expand); 660 661 // But we do support custom-lowering for FCOPYSIGN. 662 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 663 } 664 665 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 666 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 667 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 668 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 669 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 670 setOperationAction(ISD::SRA, VT, Custom); 671 setOperationAction(ISD::SRL, VT, Custom); 672 setOperationAction(ISD::SHL, VT, Custom); 673 setOperationAction(ISD::AND, VT, Custom); 674 setOperationAction(ISD::OR, VT, Custom); 675 setOperationAction(ISD::SETCC, VT, Custom); 676 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 677 678 setOperationAction(ISD::SELECT, VT, Expand); 679 setOperationAction(ISD::SELECT_CC, VT, Expand); 680 setOperationAction(ISD::VSELECT, VT, Expand); 681 for (MVT InnerVT : MVT::all_valuetypes()) 682 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 683 684 // CNT supports only B element sizes. 685 if (VT != MVT::v8i8 && VT != MVT::v16i8) 686 setOperationAction(ISD::CTPOP, VT, Expand); 687 688 setOperationAction(ISD::UDIV, VT, Expand); 689 setOperationAction(ISD::SDIV, VT, Expand); 690 setOperationAction(ISD::UREM, VT, Expand); 691 setOperationAction(ISD::SREM, VT, Expand); 692 setOperationAction(ISD::FREM, VT, Expand); 693 694 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 695 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 696 697 // [SU][MIN|MAX] are available for all NEON types apart from i64. 698 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) 699 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 700 setOperationAction(Opcode, VT, Legal); 701 702 // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). 703 if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) 704 for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, 705 ISD::FMINNUM, ISD::FMAXNUM}) 706 setOperationAction(Opcode, VT, Legal); 707 708 if (Subtarget->isLittleEndian()) { 709 for (unsigned im = (unsigned)ISD::PRE_INC; 710 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 711 setIndexedLoadAction(im, VT, Legal); 712 setIndexedStoreAction(im, VT, Legal); 713 } 714 } 715 } 716 717 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { 718 addRegisterClass(VT, &AArch64::FPR64RegClass); 719 addTypeForNEON(VT, MVT::v2i32); 720 } 721 722 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { 723 addRegisterClass(VT, &AArch64::FPR128RegClass); 724 addTypeForNEON(VT, MVT::v4i32); 725 } 726 727 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, 728 EVT VT) const { 729 if (!VT.isVector()) 730 return MVT::i32; 731 return VT.changeVectorElementTypeToInteger(); 732 } 733 734 /// computeKnownBitsForTargetNode - Determine which of the bits specified in 735 /// Mask are known to be either zero or one and return them in the 736 /// KnownZero/KnownOne bitsets. 
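/// For example, the known bits of a CSEL node are the intersection of the
/// known bits of its two value operands, and for the ldxr/ldaxr and
/// umaxv/uminv intrinsics handled below every bit above the loaded or reduced
/// element width is known to be zero.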
737 void AArch64TargetLowering::computeKnownBitsForTargetNode( 738 const SDValue Op, APInt &KnownZero, APInt &KnownOne, 739 const SelectionDAG &DAG, unsigned Depth) const { 740 switch (Op.getOpcode()) { 741 default: 742 break; 743 case AArch64ISD::CSEL: { 744 APInt KnownZero2, KnownOne2; 745 DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1); 746 DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1); 747 KnownZero &= KnownZero2; 748 KnownOne &= KnownOne2; 749 break; 750 } 751 case ISD::INTRINSIC_W_CHAIN: { 752 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 753 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 754 switch (IntID) { 755 default: return; 756 case Intrinsic::aarch64_ldaxr: 757 case Intrinsic::aarch64_ldxr: { 758 unsigned BitWidth = KnownOne.getBitWidth(); 759 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 760 unsigned MemBits = VT.getScalarType().getSizeInBits(); 761 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 762 return; 763 } 764 } 765 break; 766 } 767 case ISD::INTRINSIC_WO_CHAIN: 768 case ISD::INTRINSIC_VOID: { 769 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 770 switch (IntNo) { 771 default: 772 break; 773 case Intrinsic::aarch64_neon_umaxv: 774 case Intrinsic::aarch64_neon_uminv: { 775 // Figure out the datatype of the vector operand. The UMINV instruction 776 // will zero extend the result, so we can mark as known zero all the 777 // bits larger than the element datatype. 32-bit or larger doesn't need 778 // this as those are legal types and will be handled by isel directly. 779 MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); 780 unsigned BitWidth = KnownZero.getBitWidth(); 781 if (VT == MVT::v8i8 || VT == MVT::v16i8) { 782 assert(BitWidth >= 8 && "Unexpected width!"); 783 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); 784 KnownZero |= Mask; 785 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { 786 assert(BitWidth >= 16 && "Unexpected width!"); 787 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 788 KnownZero |= Mask; 789 } 790 break; 791 } break; 792 } 793 } 794 } 795 } 796 797 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, 798 EVT) const { 799 return MVT::i64; 800 } 801 802 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 803 unsigned AddrSpace, 804 unsigned Align, 805 bool *Fast) const { 806 if (Subtarget->requiresStrictAlign()) 807 return false; 808 809 if (Fast) { 810 // Some CPUs are fine with unaligned stores except for 128-bit ones. 811 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || 812 // See comments in performSTORECombine() for more details about 813 // these conditions. 814 815 // Code that uses clang vector extensions can mark that it 816 // wants unaligned accesses to be treated as fast by 817 // underspecifying alignment to be 1 or 2. 818 Align <= 2 || 819 820 // Disregard v2i64. Memcpy lowering produces those and splitting 821 // them regresses performance on micro-benchmarks and olden/bh.
822 VT == MVT::v2i64; 823 } 824 return true; 825 } 826 827 FastISel * 828 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 829 const TargetLibraryInfo *libInfo) const { 830 return AArch64::createFastISel(funcInfo, libInfo); 831 } 832 833 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 834 switch ((AArch64ISD::NodeType)Opcode) { 835 case AArch64ISD::FIRST_NUMBER: break; 836 case AArch64ISD::CALL: return "AArch64ISD::CALL"; 837 case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; 838 case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; 839 case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; 840 case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; 841 case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; 842 case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; 843 case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; 844 case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; 845 case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; 846 case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; 847 case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; 848 case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; 849 case AArch64ISD::ADC: return "AArch64ISD::ADC"; 850 case AArch64ISD::SBC: return "AArch64ISD::SBC"; 851 case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; 852 case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; 853 case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; 854 case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; 855 case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; 856 case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; 857 case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; 858 case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; 859 case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; 860 case AArch64ISD::DUP: return "AArch64ISD::DUP"; 861 case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; 862 case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; 863 case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; 864 case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; 865 case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; 866 case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; 867 case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; 868 case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; 869 case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; 870 case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; 871 case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; 872 case AArch64ISD::BICi: return "AArch64ISD::BICi"; 873 case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; 874 case AArch64ISD::BSL: return "AArch64ISD::BSL"; 875 case AArch64ISD::NEG: return "AArch64ISD::NEG"; 876 case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; 877 case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; 878 case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; 879 case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; 880 case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; 881 case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; 882 case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; 883 case AArch64ISD::REV16: return "AArch64ISD::REV16"; 884 case AArch64ISD::REV32: return "AArch64ISD::REV32"; 885 case AArch64ISD::REV64: return "AArch64ISD::REV64"; 886 case AArch64ISD::EXT: return "AArch64ISD::EXT"; 887 case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; 888 case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; 889 case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; 890 case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; 891 case 
AArch64ISD::CMGE: return "AArch64ISD::CMGE"; 892 case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; 893 case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; 894 case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; 895 case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; 896 case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; 897 case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; 898 case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; 899 case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; 900 case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; 901 case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; 902 case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; 903 case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; 904 case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; 905 case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; 906 case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; 907 case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; 908 case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; 909 case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; 910 case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; 911 case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; 912 case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; 913 case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; 914 case AArch64ISD::NOT: return "AArch64ISD::NOT"; 915 case AArch64ISD::BIT: return "AArch64ISD::BIT"; 916 case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; 917 case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; 918 case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; 919 case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; 920 case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; 921 case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; 922 case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; 923 case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; 924 case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; 925 case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; 926 case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; 927 case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; 928 case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; 929 case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; 930 case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; 931 case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; 932 case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; 933 case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; 934 case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; 935 case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; 936 case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; 937 case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; 938 case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; 939 case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; 940 case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; 941 case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; 942 case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; 943 case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; 944 case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; 945 case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; 946 case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; 947 case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; 948 case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; 949 case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; 950 case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; 951 case 
AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; 952 case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; 953 case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; 954 case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; 955 case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; 956 case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; 957 case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; 958 } 959 return nullptr; 960 } 961 962 MachineBasicBlock * 963 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, 964 MachineBasicBlock *MBB) const { 965 // We materialise the F128CSEL pseudo-instruction as some control flow and a 966 // phi node: 967 968 // OrigBB: 969 // [... previous instrs leading to comparison ...] 970 // b.ne TrueBB 971 // b EndBB 972 // TrueBB: 973 // ; Fallthrough 974 // EndBB: 975 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] 976 977 MachineFunction *MF = MBB->getParent(); 978 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 979 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 980 DebugLoc DL = MI.getDebugLoc(); 981 MachineFunction::iterator It = ++MBB->getIterator(); 982 983 unsigned DestReg = MI.getOperand(0).getReg(); 984 unsigned IfTrueReg = MI.getOperand(1).getReg(); 985 unsigned IfFalseReg = MI.getOperand(2).getReg(); 986 unsigned CondCode = MI.getOperand(3).getImm(); 987 bool NZCVKilled = MI.getOperand(4).isKill(); 988 989 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 990 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 991 MF->insert(It, TrueBB); 992 MF->insert(It, EndBB); 993 994 // Transfer rest of current basic-block to EndBB 995 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), 996 MBB->end()); 997 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 998 999 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); 1000 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); 1001 MBB->addSuccessor(TrueBB); 1002 MBB->addSuccessor(EndBB); 1003 1004 // TrueBB falls through to the end. 1005 TrueBB->addSuccessor(EndBB); 1006 1007 if (!NZCVKilled) { 1008 TrueBB->addLiveIn(AArch64::NZCV); 1009 EndBB->addLiveIn(AArch64::NZCV); 1010 } 1011 1012 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) 1013 .addReg(IfTrueReg) 1014 .addMBB(TrueBB) 1015 .addReg(IfFalseReg) 1016 .addMBB(MBB); 1017 1018 MI.eraseFromParent(); 1019 return EndBB; 1020 } 1021 1022 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( 1023 MachineInstr &MI, MachineBasicBlock *BB) const { 1024 switch (MI.getOpcode()) { 1025 default: 1026 #ifndef NDEBUG 1027 MI.dump(); 1028 #endif 1029 llvm_unreachable("Unexpected instruction for custom inserter!"); 1030 1031 case AArch64::F128CSEL: 1032 return EmitF128CSEL(MI, BB); 1033 1034 case TargetOpcode::STACKMAP: 1035 case TargetOpcode::PATCHPOINT: 1036 return emitPatchPoint(MI, BB); 1037 } 1038 } 1039 1040 //===----------------------------------------------------------------------===// 1041 // AArch64 Lowering private implementation. 
1042 //===----------------------------------------------------------------------===// 1043 1044 //===----------------------------------------------------------------------===// 1045 // Lowering Code 1046 //===----------------------------------------------------------------------===// 1047 1048 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 1049 /// CC 1050 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { 1051 switch (CC) { 1052 default: 1053 llvm_unreachable("Unknown condition code!"); 1054 case ISD::SETNE: 1055 return AArch64CC::NE; 1056 case ISD::SETEQ: 1057 return AArch64CC::EQ; 1058 case ISD::SETGT: 1059 return AArch64CC::GT; 1060 case ISD::SETGE: 1061 return AArch64CC::GE; 1062 case ISD::SETLT: 1063 return AArch64CC::LT; 1064 case ISD::SETLE: 1065 return AArch64CC::LE; 1066 case ISD::SETUGT: 1067 return AArch64CC::HI; 1068 case ISD::SETUGE: 1069 return AArch64CC::HS; 1070 case ISD::SETULT: 1071 return AArch64CC::LO; 1072 case ISD::SETULE: 1073 return AArch64CC::LS; 1074 } 1075 } 1076 1077 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 1078 static void changeFPCCToAArch64CC(ISD::CondCode CC, 1079 AArch64CC::CondCode &CondCode, 1080 AArch64CC::CondCode &CondCode2) { 1081 CondCode2 = AArch64CC::AL; 1082 switch (CC) { 1083 default: 1084 llvm_unreachable("Unknown FP condition!"); 1085 case ISD::SETEQ: 1086 case ISD::SETOEQ: 1087 CondCode = AArch64CC::EQ; 1088 break; 1089 case ISD::SETGT: 1090 case ISD::SETOGT: 1091 CondCode = AArch64CC::GT; 1092 break; 1093 case ISD::SETGE: 1094 case ISD::SETOGE: 1095 CondCode = AArch64CC::GE; 1096 break; 1097 case ISD::SETOLT: 1098 CondCode = AArch64CC::MI; 1099 break; 1100 case ISD::SETOLE: 1101 CondCode = AArch64CC::LS; 1102 break; 1103 case ISD::SETONE: 1104 CondCode = AArch64CC::MI; 1105 CondCode2 = AArch64CC::GT; 1106 break; 1107 case ISD::SETO: 1108 CondCode = AArch64CC::VC; 1109 break; 1110 case ISD::SETUO: 1111 CondCode = AArch64CC::VS; 1112 break; 1113 case ISD::SETUEQ: 1114 CondCode = AArch64CC::EQ; 1115 CondCode2 = AArch64CC::VS; 1116 break; 1117 case ISD::SETUGT: 1118 CondCode = AArch64CC::HI; 1119 break; 1120 case ISD::SETUGE: 1121 CondCode = AArch64CC::PL; 1122 break; 1123 case ISD::SETLT: 1124 case ISD::SETULT: 1125 CondCode = AArch64CC::LT; 1126 break; 1127 case ISD::SETLE: 1128 case ISD::SETULE: 1129 CondCode = AArch64CC::LE; 1130 break; 1131 case ISD::SETNE: 1132 case ISD::SETUNE: 1133 CondCode = AArch64CC::NE; 1134 break; 1135 } 1136 } 1137 1138 /// Convert a DAG fp condition code to an AArch64 CC. 1139 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that 1140 /// should be AND'ed instead of OR'ed. 1141 static void changeFPCCToANDAArch64CC(ISD::CondCode CC, 1142 AArch64CC::CondCode &CondCode, 1143 AArch64CC::CondCode &CondCode2) { 1144 CondCode2 = AArch64CC::AL; 1145 switch (CC) { 1146 default: 1147 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1148 assert(CondCode2 == AArch64CC::AL); 1149 break; 1150 case ISD::SETONE: 1151 // (a one b) 1152 // == ((a olt b) || (a ogt b)) 1153 // == ((a ord b) && (a une b)) 1154 CondCode = AArch64CC::VC; 1155 CondCode2 = AArch64CC::NE; 1156 break; 1157 case ISD::SETUEQ: 1158 // (a ueq b) 1159 // == ((a uno b) || (a oeq b)) 1160 // == ((a ule b) && (a uge b)) 1161 CondCode = AArch64CC::PL; 1162 CondCode2 = AArch64CC::LE; 1163 break; 1164 } 1165 } 1166 1167 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 1168 /// CC usable with the vector instructions. 
Fewer operations are available 1169 /// without a real NZCV register, so we have to use less efficient combinations 1170 /// to get the same effect. 1171 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, 1172 AArch64CC::CondCode &CondCode, 1173 AArch64CC::CondCode &CondCode2, 1174 bool &Invert) { 1175 Invert = false; 1176 switch (CC) { 1177 default: 1178 // Mostly the scalar mappings work fine. 1179 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1180 break; 1181 case ISD::SETUO: 1182 Invert = true; // Fallthrough 1183 case ISD::SETO: 1184 CondCode = AArch64CC::MI; 1185 CondCode2 = AArch64CC::GE; 1186 break; 1187 case ISD::SETUEQ: 1188 case ISD::SETULT: 1189 case ISD::SETULE: 1190 case ISD::SETUGT: 1191 case ISD::SETUGE: 1192 // All of the compare-mask comparisons are ordered, but we can switch 1193 // between the two by a double inversion. E.g. ULE == !OGT. 1194 Invert = true; 1195 changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); 1196 break; 1197 } 1198 } 1199 1200 static bool isLegalArithImmed(uint64_t C) { 1201 // Matches AArch64DAGToDAGISel::SelectArithImmed(). 1202 return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); 1203 } 1204 1205 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1206 const SDLoc &dl, SelectionDAG &DAG) { 1207 EVT VT = LHS.getValueType(); 1208 1209 if (VT.isFloatingPoint()) { 1210 assert(VT != MVT::f128); 1211 if (VT == MVT::f16) { 1212 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); 1213 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); 1214 VT = MVT::f32; 1215 } 1216 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); 1217 } 1218 1219 // The CMP instruction is just an alias for SUBS, and representing it as 1220 // SUBS means that it's possible to get CSE with subtract operations. 1221 // A later phase can perform the optimization of setting the destination 1222 // register to WZR/XZR if it ends up being unused. 1223 unsigned Opcode = AArch64ISD::SUBS; 1224 1225 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && 1226 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 1227 // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on 1228 // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags 1229 // can be set differently by this operation. It comes down to whether 1230 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then 1231 // everything is fine. If not then the optimization is wrong. Thus general 1232 // comparisons are only valid if op2 != 0. 1233 1234 // So, finally, the only LLVM-native comparisons that don't mention C and V 1235 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in 1236 // the absence of information about op2. 1237 Opcode = AArch64ISD::ADDS; 1238 RHS = RHS.getOperand(1); 1239 } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && 1240 !isUnsignedIntSetCC(CC)) { 1241 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST 1242 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one 1243 // of the signed comparisons. 1244 Opcode = AArch64ISD::ANDS; 1245 RHS = LHS.getOperand(1); 1246 LHS = LHS.getOperand(0); 1247 } 1248 1249 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) 1250 .getValue(1); 1251 } 1252 1253 /// \defgroup AArch64CCMP CMP;CCMP matching 1254 /// 1255 /// These functions deal with the formation of CMP;CCMP;... sequences. 
1256 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of 1257 /// a comparison. They set the NZCV flags to a predefined value if their 1258 /// predicate is false. This allows us to express arbitrary conjunctions, for 1259 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" can be 1260 /// expressed as: 1261 /// cmp A 1262 /// ccmp B, inv(CB), CA 1263 /// check for CB flags 1264 /// 1265 /// In general we can create code for arbitrary "... (and (and A B) C)" 1266 /// sequences. We can also implement some "or" expressions, because "(or A B)" 1267 /// is equivalent to "not (and (not A) (not B))" and we can implement some 1268 /// negation operations: 1269 /// We can negate the results of a single comparison by inverting the flags 1270 /// used when the predicate fails and inverting the flags tested in the next 1271 /// instruction; we can also negate the results of the whole previous 1272 /// conditional compare sequence by inverting the flags tested in the next 1273 /// instruction. However, there is no way to negate the result of a partial 1274 /// sequence. 1275 /// 1276 /// Therefore on encountering an "or" expression we can negate the subtree on 1277 /// one side and have to be able to push the negate to the leaves of the subtree 1278 /// on the other side (see also the comments in code). As a complete example: 1279 /// "or (or (setCA (cmp A)) (setCB (cmp B))) 1280 /// (and (setCC (cmp C)) (setCD (cmp D)))" 1281 /// is transformed to 1282 /// "not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) 1283 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))" 1284 /// and implemented as: 1285 /// cmp C 1286 /// ccmp D, inv(CD), CC 1287 /// ccmp A, CA, inv(CD) 1288 /// ccmp B, CB, inv(CA) 1289 /// check for CB flags 1290 /// A counterexample is "or (and A B) (and C D)" which cannot be implemented 1291 /// by conditional compare sequences. 1292 /// @{ 1293 1294 /// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate. 1295 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, 1296 ISD::CondCode CC, SDValue CCOp, 1297 AArch64CC::CondCode Predicate, 1298 AArch64CC::CondCode OutCC, 1299 const SDLoc &DL, SelectionDAG &DAG) { 1300 unsigned Opcode = 0; 1301 if (LHS.getValueType().isFloatingPoint()) { 1302 assert(LHS.getValueType() != MVT::f128); 1303 if (LHS.getValueType() == MVT::f16) { 1304 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); 1305 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); 1306 } 1307 Opcode = AArch64ISD::FCCMP; 1308 } else if (RHS.getOpcode() == ISD::SUB) { 1309 SDValue SubOp0 = RHS.getOperand(0); 1310 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 1311 // See emitComparison() on why we can only do this for SETEQ and SETNE. 1312 Opcode = AArch64ISD::CCMN; 1313 RHS = RHS.getOperand(1); 1314 } 1315 } 1316 if (Opcode == 0) 1317 Opcode = AArch64ISD::CCMP; 1318 1319 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); 1320 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); 1321 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); 1322 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); 1323 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); 1324 } 1325 1326 /// Returns true if @p Val is a tree of AND/OR/SETCC operations. 1327 /// CanNegate is set to true if we can push a negate operation through 1328 /// the tree in a way that we are left with AND operations and negate operations 1329 /// at the leaves only, i.e.
"not (or (or x y) z)" can be changed to 1330 /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be 1331 /// brought into such a form. 1332 static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, 1333 unsigned Depth = 0) { 1334 if (!Val.hasOneUse()) 1335 return false; 1336 unsigned Opcode = Val->getOpcode(); 1337 if (Opcode == ISD::SETCC) { 1338 if (Val->getOperand(0).getValueType() == MVT::f128) 1339 return false; 1340 CanNegate = true; 1341 return true; 1342 } 1343 // Protect against exponential runtime and stack overflow. 1344 if (Depth > 6) 1345 return false; 1346 if (Opcode == ISD::AND || Opcode == ISD::OR) { 1347 SDValue O0 = Val->getOperand(0); 1348 SDValue O1 = Val->getOperand(1); 1349 bool CanNegateL; 1350 if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1)) 1351 return false; 1352 bool CanNegateR; 1353 if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1)) 1354 return false; 1355 1356 if (Opcode == ISD::OR) { 1357 // For an OR expression we need to be able to negate at least one side or 1358 // we cannot do the transformation at all. 1359 if (!CanNegateL && !CanNegateR) 1360 return false; 1361 // We can however change a (not (or x y)) to (and (not x) (not y)) if we 1362 // can negate the x and y subtrees. 1363 CanNegate = CanNegateL && CanNegateR; 1364 } else { 1365 // If the operands are OR expressions then we finally need to negate their 1366 // outputs, we can only do that for the operand with emitted last by 1367 // negating OutCC, not for both operands. 1368 bool NeedsNegOutL = O0->getOpcode() == ISD::OR; 1369 bool NeedsNegOutR = O1->getOpcode() == ISD::OR; 1370 if (NeedsNegOutL && NeedsNegOutR) 1371 return false; 1372 // We cannot negate an AND operation (it would become an OR), 1373 CanNegate = false; 1374 } 1375 return true; 1376 } 1377 return false; 1378 } 1379 1380 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain 1381 /// of CCMP/CFCMP ops. See @ref AArch64CCMP. 1382 /// Tries to transform the given i1 producing node @p Val to a series compare 1383 /// and conditional compare operations. @returns an NZCV flags producing node 1384 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if 1385 /// transformation was not possible. 1386 /// On recursive invocations @p PushNegate may be set to true to have negation 1387 /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate 1388 /// for the comparisons in the current subtree; @p Depth limits the search 1389 /// depth to avoid stack overflow. 1390 static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, 1391 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, 1392 AArch64CC::CondCode Predicate) { 1393 // We're at a tree leaf, produce a conditional comparison operation. 1394 unsigned Opcode = Val->getOpcode(); 1395 if (Opcode == ISD::SETCC) { 1396 SDValue LHS = Val->getOperand(0); 1397 SDValue RHS = Val->getOperand(1); 1398 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); 1399 bool isInteger = LHS.getValueType().isInteger(); 1400 if (Negate) 1401 CC = getSetCCInverse(CC, isInteger); 1402 SDLoc DL(Val); 1403 // Determine OutCC and handle FP special case. 1404 if (isInteger) { 1405 OutCC = changeIntCCToAArch64CC(CC); 1406 } else { 1407 assert(LHS.getValueType().isFloatingPoint()); 1408 AArch64CC::CondCode ExtraCC; 1409 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); 1410 // Some floating point conditions can't be tested with a single condition 1411 // code. 
Construct an additional comparison in this case. 1412 if (ExtraCC != AArch64CC::AL) { 1413 SDValue ExtraCmp; 1414 if (!CCOp.getNode()) 1415 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); 1416 else 1417 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, 1418 ExtraCC, DL, DAG); 1419 CCOp = ExtraCmp; 1420 Predicate = ExtraCC; 1421 } 1422 } 1423 1424 // Produce a normal comparison if we are first in the chain 1425 if (!CCOp) 1426 return emitComparison(LHS, RHS, CC, DL, DAG); 1427 // Otherwise produce a ccmp. 1428 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, 1429 DAG); 1430 } 1431 assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) && 1432 "Valid conjunction/disjunction tree"); 1433 1434 // Check if both sides can be transformed. 1435 SDValue LHS = Val->getOperand(0); 1436 SDValue RHS = Val->getOperand(1); 1437 1438 // In case of an OR we need to negate our operands and the result. 1439 // (A v B) <=> not(not(A) ^ not(B)) 1440 bool NegateOpsAndResult = Opcode == ISD::OR; 1441 // We can negate the results of all previous operations by inverting the 1442 // predicate flags giving us a free negation for one side. The other side 1443 // must be negatable by itself. 1444 if (NegateOpsAndResult) { 1445 // See which side we can negate. 1446 bool CanNegateL; 1447 bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL); 1448 assert(isValidL && "Valid conjunction/disjunction tree"); 1449 (void)isValidL; 1450 1451 #ifndef NDEBUG 1452 bool CanNegateR; 1453 bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR); 1454 assert(isValidR && "Valid conjunction/disjunction tree"); 1455 assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); 1456 #endif 1457 1458 // Order the side which we cannot negate to RHS so we can emit it first. 1459 if (!CanNegateL) 1460 std::swap(LHS, RHS); 1461 } else { 1462 bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; 1463 assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) && 1464 "Valid conjunction/disjunction tree"); 1465 // Order the side where we need to negate the output flags to RHS so it 1466 // gets emitted first. 1467 if (NeedsNegOutL) 1468 std::swap(LHS, RHS); 1469 } 1470 1471 // Emit RHS. If we want to negate the tree we only need to push a negate 1472 // through if we are already in a PushNegate case, otherwise we can negate 1473 // the "flags to test" afterwards. 1474 AArch64CC::CondCode RHSCC; 1475 SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate, 1476 CCOp, Predicate); 1477 if (NegateOpsAndResult && !Negate) 1478 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 1479 // Emit LHS. We may need to negate it. 1480 SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC, 1481 NegateOpsAndResult, CmpR, 1482 RHSCC); 1483 // If we transformed an OR to and AND then we have to negate the result 1484 // (or absorb the Negate parameter). 1485 if (NegateOpsAndResult && !Negate) 1486 OutCC = AArch64CC::getInvertedCondCode(OutCC); 1487 return CmpL; 1488 } 1489 1490 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain 1491 /// of CCMP/CFCMP ops. See @ref AArch64CCMP. 1492 /// \see emitConjunctionDisjunctionTreeRec(). 
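/// For example (an illustrative sketch): for an i1 value of the form
/// "and (setcc A), (setcc B)" this helper emits a CMP for one leaf and a CCMP
/// for the other, and sets OutCC to the condition the caller must test on the
/// final CCMP's flags, typically by feeding them into a CSEL or a conditional
/// branch.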
1493 static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, 1494 AArch64CC::CondCode &OutCC) { 1495 bool CanNegate; 1496 if (!isConjunctionDisjunctionTree(Val, CanNegate)) 1497 return SDValue(); 1498 1499 return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(), 1500 AArch64CC::AL); 1501 } 1502 1503 /// @} 1504 1505 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1506 SDValue &AArch64cc, SelectionDAG &DAG, 1507 const SDLoc &dl) { 1508 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1509 EVT VT = RHS.getValueType(); 1510 uint64_t C = RHSC->getZExtValue(); 1511 if (!isLegalArithImmed(C)) { 1512 // Constant does not fit, try adjusting it by one? 1513 switch (CC) { 1514 default: 1515 break; 1516 case ISD::SETLT: 1517 case ISD::SETGE: 1518 if ((VT == MVT::i32 && C != 0x80000000 && 1519 isLegalArithImmed((uint32_t)(C - 1))) || 1520 (VT == MVT::i64 && C != 0x80000000ULL && 1521 isLegalArithImmed(C - 1ULL))) { 1522 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1523 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1524 RHS = DAG.getConstant(C, dl, VT); 1525 } 1526 break; 1527 case ISD::SETULT: 1528 case ISD::SETUGE: 1529 if ((VT == MVT::i32 && C != 0 && 1530 isLegalArithImmed((uint32_t)(C - 1))) || 1531 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 1532 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1533 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1534 RHS = DAG.getConstant(C, dl, VT); 1535 } 1536 break; 1537 case ISD::SETLE: 1538 case ISD::SETGT: 1539 if ((VT == MVT::i32 && C != INT32_MAX && 1540 isLegalArithImmed((uint32_t)(C + 1))) || 1541 (VT == MVT::i64 && C != INT64_MAX && 1542 isLegalArithImmed(C + 1ULL))) { 1543 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1544 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1545 RHS = DAG.getConstant(C, dl, VT); 1546 } 1547 break; 1548 case ISD::SETULE: 1549 case ISD::SETUGT: 1550 if ((VT == MVT::i32 && C != UINT32_MAX && 1551 isLegalArithImmed((uint32_t)(C + 1))) || 1552 (VT == MVT::i64 && C != UINT64_MAX && 1553 isLegalArithImmed(C + 1ULL))) { 1554 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1555 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1556 RHS = DAG.getConstant(C, dl, VT); 1557 } 1558 break; 1559 } 1560 } 1561 } 1562 SDValue Cmp; 1563 AArch64CC::CondCode AArch64CC; 1564 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 1565 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); 1566 1567 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 1568 // For the i8 operand, the largest immediate is 255, so this can be easily 1569 // encoded in the compare instruction. For the i16 operand, however, the 1570 // largest immediate cannot be encoded in the compare. 1571 // Therefore, use a sign extending load and cmn to avoid materializing the 1572 // -1 constant. For example, 1573 // movz w1, #65535 1574 // ldrh w0, [x0, #0] 1575 // cmp w0, w1 1576 // > 1577 // ldrsh w0, [x0, #0] 1578 // cmn w0, #1 1579 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 1580 // if and only if (sext LHS) == (sext RHS). The checks are in place to 1581 // ensure both the LHS and RHS are truly zero extended and to make sure the 1582 // transformation is profitable. 
1583 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 1584 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 1585 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 1586 LHS.getNode()->hasNUsesOfValue(1, 0)) { 1587 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 1588 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 1589 SDValue SExt = 1590 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 1591 DAG.getValueType(MVT::i16)); 1592 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 1593 RHS.getValueType()), 1594 CC, dl, DAG); 1595 AArch64CC = changeIntCCToAArch64CC(CC); 1596 } 1597 } 1598 1599 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { 1600 if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { 1601 if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) 1602 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 1603 } 1604 } 1605 } 1606 1607 if (!Cmp) { 1608 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 1609 AArch64CC = changeIntCCToAArch64CC(CC); 1610 } 1611 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 1612 return Cmp; 1613 } 1614 1615 static std::pair<SDValue, SDValue> 1616 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 1617 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 1618 "Unsupported value type"); 1619 SDValue Value, Overflow; 1620 SDLoc DL(Op); 1621 SDValue LHS = Op.getOperand(0); 1622 SDValue RHS = Op.getOperand(1); 1623 unsigned Opc = 0; 1624 switch (Op.getOpcode()) { 1625 default: 1626 llvm_unreachable("Unknown overflow instruction!"); 1627 case ISD::SADDO: 1628 Opc = AArch64ISD::ADDS; 1629 CC = AArch64CC::VS; 1630 break; 1631 case ISD::UADDO: 1632 Opc = AArch64ISD::ADDS; 1633 CC = AArch64CC::HS; 1634 break; 1635 case ISD::SSUBO: 1636 Opc = AArch64ISD::SUBS; 1637 CC = AArch64CC::VS; 1638 break; 1639 case ISD::USUBO: 1640 Opc = AArch64ISD::SUBS; 1641 CC = AArch64CC::LO; 1642 break; 1643 // Multiply needs a little bit extra work. 1644 case ISD::SMULO: 1645 case ISD::UMULO: { 1646 CC = AArch64CC::NE; 1647 bool IsSigned = Op.getOpcode() == ISD::SMULO; 1648 if (Op.getValueType() == MVT::i32) { 1649 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1650 // For a 32 bit multiply with overflow check we want the instruction 1651 // selector to generate a widening multiply (SMADDL/UMADDL). For that we 1652 // need to generate the following pattern: 1653 // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) 1654 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 1655 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 1656 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1657 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, 1658 DAG.getConstant(0, DL, MVT::i64)); 1659 // On AArch64 the upper 32 bits are always zero extended for a 32 bit 1660 // operation. We need to clear out the upper 32 bits, because we used a 1661 // widening multiply that wrote all 64 bits. In the end this should be a 1662 // noop. 1663 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); 1664 if (IsSigned) { 1665 // The signed overflow check requires more than just a simple check for 1666 // any bit set in the upper 32 bits of the result. These bits could be 1667 // just the sign bits of a negative number. To perform the overflow 1668 // check we have to arithmetic shift right the 32nd bit of the result by 1669 // 31 bits. Then we compare the result to the upper 32 bits. 
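// For example (illustrative): a widened product of 0x000000007FFFFFFF has
// upper word 0 and a lower-word sign replication of 0, so the SUBS sees equal
// operands and no overflow is reported; a product of 0x0000000100000000 has
// upper word 1 against a sign replication of 0, so the comparison is non-equal
// and the NE condition signals overflow.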
1670 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, 1671 DAG.getConstant(32, DL, MVT::i64)); 1672 UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); 1673 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, 1674 DAG.getConstant(31, DL, MVT::i64)); 1675 // It is important that LowerBits is last, otherwise the arithmetic 1676 // shift will not be folded into the compare (SUBS). 1677 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); 1678 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1679 .getValue(1); 1680 } else { 1681 // The overflow check for unsigned multiply is easy. We only need to 1682 // check if any of the upper 32 bits are set. This can be done with a 1683 // CMP (shifted register). For that we need to generate the following 1684 // pattern: 1685 // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) 1686 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, 1687 DAG.getConstant(32, DL, MVT::i64)); 1688 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1689 Overflow = 1690 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 1691 DAG.getConstant(0, DL, MVT::i64), 1692 UpperBits).getValue(1); 1693 } 1694 break; 1695 } 1696 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 1697 // For the 64 bit multiply 1698 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1699 if (IsSigned) { 1700 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 1701 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 1702 DAG.getConstant(63, DL, MVT::i64)); 1703 // It is important that LowerBits is last, otherwise the arithmetic 1704 // shift will not be folded into the compare (SUBS). 1705 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1706 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1707 .getValue(1); 1708 } else { 1709 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 1710 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1711 Overflow = 1712 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 1713 DAG.getConstant(0, DL, MVT::i64), 1714 UpperBits).getValue(1); 1715 } 1716 break; 1717 } 1718 } // switch (...) 1719 1720 if (Opc) { 1721 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 1722 1723 // Emit the AArch64 operation with overflow check. 1724 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 1725 Overflow = Value.getValue(1); 1726 } 1727 return std::make_pair(Value, Overflow); 1728 } 1729 1730 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, 1731 RTLIB::Libcall Call) const { 1732 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 1733 return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; 1734 } 1735 1736 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { 1737 SDValue Sel = Op.getOperand(0); 1738 SDValue Other = Op.getOperand(1); 1739 1740 // If neither operand is a SELECT_CC, give up. 1741 if (Sel.getOpcode() != ISD::SELECT_CC) 1742 std::swap(Sel, Other); 1743 if (Sel.getOpcode() != ISD::SELECT_CC) 1744 return Op; 1745 1746 // The folding we want to perform is: 1747 // (xor x, (select_cc a, b, cc, 0, -1) ) 1748 // --> 1749 // (csel x, (xor x, -1), cc ...) 1750 // 1751 // The latter will get matched to a CSINV instruction. 
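// For example (illustrative): with x in w0 and the select comparing w1 to w2,
// this lowers to "cmp w1, w2; csinv w0, w0, w0, eq", keeping x when the
// condition holds and producing ~x otherwise, without materializing the
// all-ones constant.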
1752 1753 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 1754 SDValue LHS = Sel.getOperand(0); 1755 SDValue RHS = Sel.getOperand(1); 1756 SDValue TVal = Sel.getOperand(2); 1757 SDValue FVal = Sel.getOperand(3); 1758 SDLoc dl(Sel); 1759 1760 // FIXME: This could be generalized to non-integer comparisons. 1761 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 1762 return Op; 1763 1764 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 1765 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 1766 1767 // The values aren't constants, this isn't the pattern we're looking for. 1768 if (!CFVal || !CTVal) 1769 return Op; 1770 1771 // We can commute the SELECT_CC by inverting the condition. This 1772 // might be needed to make this fit into a CSINV pattern. 1773 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { 1774 std::swap(TVal, FVal); 1775 std::swap(CTVal, CFVal); 1776 CC = ISD::getSetCCInverse(CC, true); 1777 } 1778 1779 // If the constants line up, perform the transform! 1780 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { 1781 SDValue CCVal; 1782 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 1783 1784 FVal = Other; 1785 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 1786 DAG.getConstant(-1ULL, dl, Other.getValueType())); 1787 1788 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 1789 CCVal, Cmp); 1790 } 1791 1792 return Op; 1793 } 1794 1795 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 1796 EVT VT = Op.getValueType(); 1797 1798 // Let legalize expand this if it isn't a legal type yet. 1799 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 1800 return SDValue(); 1801 1802 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 1803 1804 unsigned Opc; 1805 bool ExtraOp = false; 1806 switch (Op.getOpcode()) { 1807 default: 1808 llvm_unreachable("Invalid code"); 1809 case ISD::ADDC: 1810 Opc = AArch64ISD::ADDS; 1811 break; 1812 case ISD::SUBC: 1813 Opc = AArch64ISD::SUBS; 1814 break; 1815 case ISD::ADDE: 1816 Opc = AArch64ISD::ADCS; 1817 ExtraOp = true; 1818 break; 1819 case ISD::SUBE: 1820 Opc = AArch64ISD::SBCS; 1821 ExtraOp = true; 1822 break; 1823 } 1824 1825 if (!ExtraOp) 1826 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); 1827 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), 1828 Op.getOperand(2)); 1829 } 1830 1831 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 1832 // Let legalize expand this if it isn't a legal type yet. 1833 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 1834 return SDValue(); 1835 1836 SDLoc dl(Op); 1837 AArch64CC::CondCode CC; 1838 // The actual operation that sets the overflow or carry flag. 1839 SDValue Value, Overflow; 1840 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 1841 1842 // We use 0 and 1 as false and true values. 1843 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 1844 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 1845 1846 // We use an inverted condition, because the conditional select is inverted 1847 // too. This will allow it to be selected to a single instruction: 1848 // CSINC Wd, WZR, WZR, invert(cond). 
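// For example (illustrative): for a UADDO whose overflow condition is HS, the
// boolean result becomes "csinc w0, wzr, wzr, lo", i.e. w0 = 1 when the carry
// is set and 0 otherwise.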
1849 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 1850 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 1851 CCVal, Overflow); 1852 1853 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 1854 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 1855 } 1856 1857 // Prefetch operands are: 1858 // 1: Address to prefetch 1859 // 2: bool isWrite 1860 // 3: int locality (0 = no locality ... 3 = extreme locality) 1861 // 4: bool isDataCache 1862 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 1863 SDLoc DL(Op); 1864 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 1865 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 1866 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 1867 1868 bool IsStream = !Locality; 1869 // When the locality number is set 1870 if (Locality) { 1871 // The front-end should have filtered out the out-of-range values 1872 assert(Locality <= 3 && "Prefetch locality out-of-range"); 1873 // The locality degree is the opposite of the cache speed. 1874 // Put the number the other way around. 1875 // The encoding starts at 0 for level 1 1876 Locality = 3 - Locality; 1877 } 1878 1879 // built the mask value encoding the expected behavior. 1880 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 1881 (!IsData << 3) | // IsDataCache bit 1882 (Locality << 1) | // Cache level bits 1883 (unsigned)IsStream; // Stream bit 1884 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 1885 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); 1886 } 1887 1888 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 1889 SelectionDAG &DAG) const { 1890 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1891 1892 RTLIB::Libcall LC; 1893 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1894 1895 return LowerF128Call(Op, DAG, LC); 1896 } 1897 1898 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 1899 SelectionDAG &DAG) const { 1900 if (Op.getOperand(0).getValueType() != MVT::f128) { 1901 // It's legal except when f128 is involved 1902 return Op; 1903 } 1904 1905 RTLIB::Libcall LC; 1906 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1907 1908 // FP_ROUND node has a second operand indicating whether it is known to be 1909 // precise. That doesn't take part in the LibCall so we can't directly use 1910 // LowerF128Call. 1911 SDValue SrcVal = Op.getOperand(0); 1912 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 1913 SDLoc(Op)).first; 1914 } 1915 1916 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 1917 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1918 // Any additional optimization in this function should be recorded 1919 // in the cost tables. 1920 EVT InVT = Op.getOperand(0).getValueType(); 1921 EVT VT = Op.getValueType(); 1922 unsigned NumElts = InVT.getVectorNumElements(); 1923 1924 // f16 vectors are promoted to f32 before a conversion. 
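// For example (illustrative): an FP_TO_SINT of v4f16 to v4i16 is rewritten
// here as an FP_TO_SINT of (FP_EXTEND v4f16 to v4f32), which can then take the
// narrowing path below.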
1925 if (InVT.getVectorElementType() == MVT::f16) { 1926 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); 1927 SDLoc dl(Op); 1928 return DAG.getNode( 1929 Op.getOpcode(), dl, Op.getValueType(), 1930 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); 1931 } 1932 1933 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1934 SDLoc dl(Op); 1935 SDValue Cv = 1936 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 1937 Op.getOperand(0)); 1938 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 1939 } 1940 1941 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1942 SDLoc dl(Op); 1943 MVT ExtVT = 1944 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 1945 VT.getVectorNumElements()); 1946 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 1947 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 1948 } 1949 1950 // Type changing conversions are illegal. 1951 return Op; 1952 } 1953 1954 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 1955 SelectionDAG &DAG) const { 1956 if (Op.getOperand(0).getValueType().isVector()) 1957 return LowerVectorFP_TO_INT(Op, DAG); 1958 1959 // f16 conversions are promoted to f32. 1960 if (Op.getOperand(0).getValueType() == MVT::f16) { 1961 SDLoc dl(Op); 1962 return DAG.getNode( 1963 Op.getOpcode(), dl, Op.getValueType(), 1964 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); 1965 } 1966 1967 if (Op.getOperand(0).getValueType() != MVT::f128) { 1968 // It's legal except when f128 is involved 1969 return Op; 1970 } 1971 1972 RTLIB::Libcall LC; 1973 if (Op.getOpcode() == ISD::FP_TO_SINT) 1974 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1975 else 1976 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1977 1978 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 1979 return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; 1980 } 1981 1982 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 1983 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1984 // Any additional optimization in this function should be recorded 1985 // in the cost tables. 1986 EVT VT = Op.getValueType(); 1987 SDLoc dl(Op); 1988 SDValue In = Op.getOperand(0); 1989 EVT InVT = In.getValueType(); 1990 1991 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1992 MVT CastVT = 1993 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 1994 InVT.getVectorNumElements()); 1995 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); 1996 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); 1997 } 1998 1999 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 2000 unsigned CastOpc = 2001 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2002 EVT CastVT = VT.changeVectorElementTypeToInteger(); 2003 In = DAG.getNode(CastOpc, dl, CastVT, In); 2004 return DAG.getNode(Op.getOpcode(), dl, VT, In); 2005 } 2006 2007 return Op; 2008 } 2009 2010 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 2011 SelectionDAG &DAG) const { 2012 if (Op.getValueType().isVector()) 2013 return LowerVectorINT_TO_FP(Op, DAG); 2014 2015 // f16 conversions are promoted to f32. 2016 if (Op.getValueType() == MVT::f16) { 2017 SDLoc dl(Op); 2018 return DAG.getNode( 2019 ISD::FP_ROUND, dl, MVT::f16, 2020 DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), 2021 DAG.getIntPtrConstant(0, dl)); 2022 } 2023 2024 // i128 conversions are libcalls. 
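// e.g. (illustrative) a SINT_TO_FP from i128 to f64 is not handled here and is
// instead expanded to a runtime call such as __floattidf.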
2025 if (Op.getOperand(0).getValueType() == MVT::i128) 2026 return SDValue(); 2027 2028 // Other conversions are legal, unless it's to the completely software-based 2029 // fp128. 2030 if (Op.getValueType() != MVT::f128) 2031 return Op; 2032 2033 RTLIB::Libcall LC; 2034 if (Op.getOpcode() == ISD::SINT_TO_FP) 2035 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2036 else 2037 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2038 2039 return LowerF128Call(Op, DAG, LC); 2040 } 2041 2042 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 2043 SelectionDAG &DAG) const { 2044 // For iOS, we want to call an alternative entry point: __sincos_stret, 2045 // which returns the values in two S / D registers. 2046 SDLoc dl(Op); 2047 SDValue Arg = Op.getOperand(0); 2048 EVT ArgVT = Arg.getValueType(); 2049 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 2050 2051 ArgListTy Args; 2052 ArgListEntry Entry; 2053 2054 Entry.Node = Arg; 2055 Entry.Ty = ArgTy; 2056 Entry.isSExt = false; 2057 Entry.isZExt = false; 2058 Args.push_back(Entry); 2059 2060 const char *LibcallName = 2061 (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; 2062 SDValue Callee = 2063 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); 2064 2065 StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 2066 TargetLowering::CallLoweringInfo CLI(DAG); 2067 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 2068 .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); 2069 2070 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2071 return CallResult.first; 2072 } 2073 2074 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 2075 if (Op.getValueType() != MVT::f16) 2076 return SDValue(); 2077 2078 assert(Op.getOperand(0).getValueType() == MVT::i16); 2079 SDLoc DL(Op); 2080 2081 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 2082 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 2083 return SDValue( 2084 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, 2085 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 2086 0); 2087 } 2088 2089 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 2090 if (OrigVT.getSizeInBits() >= 64) 2091 return OrigVT; 2092 2093 assert(OrigVT.isSimple() && "Expecting a simple value type"); 2094 2095 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 2096 switch (OrigSimpleTy) { 2097 default: llvm_unreachable("Unexpected Vector Type"); 2098 case MVT::v2i8: 2099 case MVT::v2i16: 2100 return MVT::v2i32; 2101 case MVT::v4i8: 2102 return MVT::v4i16; 2103 } 2104 } 2105 2106 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 2107 const EVT &OrigTy, 2108 const EVT &ExtTy, 2109 unsigned ExtOpcode) { 2110 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 2111 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 2112 // 64-bits we need to insert a new extension so that it will be 64-bits. 2113 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 2114 if (OrigTy.getSizeInBits() >= 64) 2115 return N; 2116 2117 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
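// For example (illustrative): a v4i8 operand that was sign-extended to v4i32
// is re-extended to v4i16 here, so the eventual SMULL sees a 64-bit vector
// input as required.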
2118 EVT NewVT = getExtensionTo64Bits(OrigTy); 2119 2120 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 2121 } 2122 2123 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 2124 bool isSigned) { 2125 EVT VT = N->getValueType(0); 2126 2127 if (N->getOpcode() != ISD::BUILD_VECTOR) 2128 return false; 2129 2130 for (const SDValue &Elt : N->op_values()) { 2131 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 2132 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 2133 unsigned HalfSize = EltSize / 2; 2134 if (isSigned) { 2135 if (!isIntN(HalfSize, C->getSExtValue())) 2136 return false; 2137 } else { 2138 if (!isUIntN(HalfSize, C->getZExtValue())) 2139 return false; 2140 } 2141 continue; 2142 } 2143 return false; 2144 } 2145 2146 return true; 2147 } 2148 2149 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 2150 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 2151 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 2152 N->getOperand(0)->getValueType(0), 2153 N->getValueType(0), 2154 N->getOpcode()); 2155 2156 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 2157 EVT VT = N->getValueType(0); 2158 SDLoc dl(N); 2159 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 2160 unsigned NumElts = VT.getVectorNumElements(); 2161 MVT TruncVT = MVT::getIntegerVT(EltSize); 2162 SmallVector<SDValue, 8> Ops; 2163 for (unsigned i = 0; i != NumElts; ++i) { 2164 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 2165 const APInt &CInt = C->getAPIntValue(); 2166 // Element types smaller than 32 bits are not legal, so use i32 elements. 2167 // The values are implicitly truncated so sext vs. zext doesn't matter. 2168 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 2169 } 2170 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 2171 } 2172 2173 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 2174 if (N->getOpcode() == ISD::SIGN_EXTEND) 2175 return true; 2176 if (isExtendedBUILD_VECTOR(N, DAG, true)) 2177 return true; 2178 return false; 2179 } 2180 2181 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 2182 if (N->getOpcode() == ISD::ZERO_EXTEND) 2183 return true; 2184 if (isExtendedBUILD_VECTOR(N, DAG, false)) 2185 return true; 2186 return false; 2187 } 2188 2189 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 2190 unsigned Opcode = N->getOpcode(); 2191 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2192 SDNode *N0 = N->getOperand(0).getNode(); 2193 SDNode *N1 = N->getOperand(1).getNode(); 2194 return N0->hasOneUse() && N1->hasOneUse() && 2195 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 2196 } 2197 return false; 2198 } 2199 2200 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 2201 unsigned Opcode = N->getOpcode(); 2202 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2203 SDNode *N0 = N->getOperand(0).getNode(); 2204 SDNode *N1 = N->getOperand(1).getNode(); 2205 return N0->hasOneUse() && N1->hasOneUse() && 2206 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 2207 } 2208 return false; 2209 } 2210 2211 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 2212 // Multiplications are only custom-lowered for 128-bit vectors so that 2213 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
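// For example (illustrative): "mul (sext v2i32 %a to v2i64), (sext v2i32 %b to
// v2i64)" is recognized below and emitted as a single AArch64ISD::SMULL of the
// two narrow operands instead of being scalarized.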
2214 EVT VT = Op.getValueType(); 2215 assert(VT.is128BitVector() && VT.isInteger() && 2216 "unexpected type for custom-lowering ISD::MUL"); 2217 SDNode *N0 = Op.getOperand(0).getNode(); 2218 SDNode *N1 = Op.getOperand(1).getNode(); 2219 unsigned NewOpc = 0; 2220 bool isMLA = false; 2221 bool isN0SExt = isSignExtended(N0, DAG); 2222 bool isN1SExt = isSignExtended(N1, DAG); 2223 if (isN0SExt && isN1SExt) 2224 NewOpc = AArch64ISD::SMULL; 2225 else { 2226 bool isN0ZExt = isZeroExtended(N0, DAG); 2227 bool isN1ZExt = isZeroExtended(N1, DAG); 2228 if (isN0ZExt && isN1ZExt) 2229 NewOpc = AArch64ISD::UMULL; 2230 else if (isN1SExt || isN1ZExt) { 2231 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 2232 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 2233 if (isN1SExt && isAddSubSExt(N0, DAG)) { 2234 NewOpc = AArch64ISD::SMULL; 2235 isMLA = true; 2236 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 2237 NewOpc = AArch64ISD::UMULL; 2238 isMLA = true; 2239 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 2240 std::swap(N0, N1); 2241 NewOpc = AArch64ISD::UMULL; 2242 isMLA = true; 2243 } 2244 } 2245 2246 if (!NewOpc) { 2247 if (VT == MVT::v2i64) 2248 // Fall through to expand this. It is not legal. 2249 return SDValue(); 2250 else 2251 // Other vector multiplications are legal. 2252 return Op; 2253 } 2254 } 2255 2256 // Legalize to a S/UMULL instruction 2257 SDLoc DL(Op); 2258 SDValue Op0; 2259 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 2260 if (!isMLA) { 2261 Op0 = skipExtensionForVectorMULL(N0, DAG); 2262 assert(Op0.getValueType().is64BitVector() && 2263 Op1.getValueType().is64BitVector() && 2264 "unexpected types for extended operands to VMULL"); 2265 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 2266 } 2267 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 2268 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 2269 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 2270 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 2271 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 2272 EVT Op1VT = Op1.getValueType(); 2273 return DAG.getNode(N0->getOpcode(), DL, VT, 2274 DAG.getNode(NewOpc, DL, VT, 2275 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 2276 DAG.getNode(NewOpc, DL, VT, 2277 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 2278 } 2279 2280 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 2281 SelectionDAG &DAG) const { 2282 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2283 SDLoc dl(Op); 2284 switch (IntNo) { 2285 default: return SDValue(); // Don't custom lower most intrinsics. 
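// For example (illustrative): Intrinsic::thread_pointer below becomes an
// AArch64ISD::THREAD_POINTER node (a read of TPIDR_EL0), and the NEON
// smax/umax/smin/umin intrinsics are rewritten to the generic
// ISD::SMAX/UMAX/SMIN/UMIN nodes so later combines treat them uniformly.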
2286 case Intrinsic::thread_pointer: { 2287 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2288 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); 2289 } 2290 case Intrinsic::aarch64_neon_smax: 2291 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), 2292 Op.getOperand(1), Op.getOperand(2)); 2293 case Intrinsic::aarch64_neon_umax: 2294 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), 2295 Op.getOperand(1), Op.getOperand(2)); 2296 case Intrinsic::aarch64_neon_smin: 2297 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), 2298 Op.getOperand(1), Op.getOperand(2)); 2299 case Intrinsic::aarch64_neon_umin: 2300 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), 2301 Op.getOperand(1), Op.getOperand(2)); 2302 } 2303 } 2304 2305 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 2306 SelectionDAG &DAG) const { 2307 switch (Op.getOpcode()) { 2308 default: 2309 llvm_unreachable("unimplemented operand"); 2310 return SDValue(); 2311 case ISD::BITCAST: 2312 return LowerBITCAST(Op, DAG); 2313 case ISD::GlobalAddress: 2314 return LowerGlobalAddress(Op, DAG); 2315 case ISD::GlobalTLSAddress: 2316 return LowerGlobalTLSAddress(Op, DAG); 2317 case ISD::SETCC: 2318 return LowerSETCC(Op, DAG); 2319 case ISD::BR_CC: 2320 return LowerBR_CC(Op, DAG); 2321 case ISD::SELECT: 2322 return LowerSELECT(Op, DAG); 2323 case ISD::SELECT_CC: 2324 return LowerSELECT_CC(Op, DAG); 2325 case ISD::JumpTable: 2326 return LowerJumpTable(Op, DAG); 2327 case ISD::ConstantPool: 2328 return LowerConstantPool(Op, DAG); 2329 case ISD::BlockAddress: 2330 return LowerBlockAddress(Op, DAG); 2331 case ISD::VASTART: 2332 return LowerVASTART(Op, DAG); 2333 case ISD::VACOPY: 2334 return LowerVACOPY(Op, DAG); 2335 case ISD::VAARG: 2336 return LowerVAARG(Op, DAG); 2337 case ISD::ADDC: 2338 case ISD::ADDE: 2339 case ISD::SUBC: 2340 case ISD::SUBE: 2341 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 2342 case ISD::SADDO: 2343 case ISD::UADDO: 2344 case ISD::SSUBO: 2345 case ISD::USUBO: 2346 case ISD::SMULO: 2347 case ISD::UMULO: 2348 return LowerXALUO(Op, DAG); 2349 case ISD::FADD: 2350 return LowerF128Call(Op, DAG, RTLIB::ADD_F128); 2351 case ISD::FSUB: 2352 return LowerF128Call(Op, DAG, RTLIB::SUB_F128); 2353 case ISD::FMUL: 2354 return LowerF128Call(Op, DAG, RTLIB::MUL_F128); 2355 case ISD::FDIV: 2356 return LowerF128Call(Op, DAG, RTLIB::DIV_F128); 2357 case ISD::FP_ROUND: 2358 return LowerFP_ROUND(Op, DAG); 2359 case ISD::FP_EXTEND: 2360 return LowerFP_EXTEND(Op, DAG); 2361 case ISD::FRAMEADDR: 2362 return LowerFRAMEADDR(Op, DAG); 2363 case ISD::RETURNADDR: 2364 return LowerRETURNADDR(Op, DAG); 2365 case ISD::INSERT_VECTOR_ELT: 2366 return LowerINSERT_VECTOR_ELT(Op, DAG); 2367 case ISD::EXTRACT_VECTOR_ELT: 2368 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2369 case ISD::BUILD_VECTOR: 2370 return LowerBUILD_VECTOR(Op, DAG); 2371 case ISD::VECTOR_SHUFFLE: 2372 return LowerVECTOR_SHUFFLE(Op, DAG); 2373 case ISD::EXTRACT_SUBVECTOR: 2374 return LowerEXTRACT_SUBVECTOR(Op, DAG); 2375 case ISD::SRA: 2376 case ISD::SRL: 2377 case ISD::SHL: 2378 return LowerVectorSRA_SRL_SHL(Op, DAG); 2379 case ISD::SHL_PARTS: 2380 return LowerShiftLeftParts(Op, DAG); 2381 case ISD::SRL_PARTS: 2382 case ISD::SRA_PARTS: 2383 return LowerShiftRightParts(Op, DAG); 2384 case ISD::CTPOP: 2385 return LowerCTPOP(Op, DAG); 2386 case ISD::FCOPYSIGN: 2387 return LowerFCOPYSIGN(Op, DAG); 2388 case ISD::AND: 2389 return LowerVectorAND(Op, DAG); 2390 case ISD::OR: 2391 return LowerVectorOR(Op, DAG); 2392 case ISD::XOR: 2393 return LowerXOR(Op, DAG); 2394 case 
ISD::PREFETCH: 2395 return LowerPREFETCH(Op, DAG); 2396 case ISD::SINT_TO_FP: 2397 case ISD::UINT_TO_FP: 2398 return LowerINT_TO_FP(Op, DAG); 2399 case ISD::FP_TO_SINT: 2400 case ISD::FP_TO_UINT: 2401 return LowerFP_TO_INT(Op, DAG); 2402 case ISD::FSINCOS: 2403 return LowerFSINCOS(Op, DAG); 2404 case ISD::MUL: 2405 return LowerMUL(Op, DAG); 2406 case ISD::INTRINSIC_WO_CHAIN: 2407 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 2408 } 2409 } 2410 2411 //===----------------------------------------------------------------------===// 2412 // Calling Convention Implementation 2413 //===----------------------------------------------------------------------===// 2414 2415 #include "AArch64GenCallingConv.inc" 2416 2417 /// Selects the correct CCAssignFn for a given CallingConvention value. 2418 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 2419 bool IsVarArg) const { 2420 switch (CC) { 2421 default: 2422 llvm_unreachable("Unsupported calling convention."); 2423 case CallingConv::WebKit_JS: 2424 return CC_AArch64_WebKit_JS; 2425 case CallingConv::GHC: 2426 return CC_AArch64_GHC; 2427 case CallingConv::C: 2428 case CallingConv::Fast: 2429 case CallingConv::PreserveMost: 2430 case CallingConv::CXX_FAST_TLS: 2431 if (!Subtarget->isTargetDarwin()) 2432 return CC_AArch64_AAPCS; 2433 return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; 2434 } 2435 } 2436 2437 SDValue AArch64TargetLowering::LowerFormalArguments( 2438 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2439 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 2440 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2441 MachineFunction &MF = DAG.getMachineFunction(); 2442 MachineFrameInfo *MFI = MF.getFrameInfo(); 2443 2444 // Assign locations to all of the incoming arguments. 2445 SmallVector<CCValAssign, 16> ArgLocs; 2446 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2447 *DAG.getContext()); 2448 2449 // At this point, Ins[].VT may already be promoted to i32. To correctly 2450 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2451 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2452 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 2453 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 2454 // LocVT. 2455 unsigned NumArgs = Ins.size(); 2456 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 2457 unsigned CurArgIdx = 0; 2458 for (unsigned i = 0; i != NumArgs; ++i) { 2459 MVT ValVT = Ins[i].VT; 2460 if (Ins[i].isOrigArg()) { 2461 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); 2462 CurArgIdx = Ins[i].getOrigArgIndex(); 2463 2464 // Get type of the original argument. 2465 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), 2466 /*AllowUnknown*/ true); 2467 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; 2468 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
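// For example (illustrative): a bool or i8 parameter reaches this point with
// Ins[i].VT already promoted to i32; recording it as i8 lets the calling
// convention code assign it a one-byte stack slot rather than a full word.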
2469 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 2470 ValVT = MVT::i8; 2471 else if (ActualMVT == MVT::i16) 2472 ValVT = MVT::i16; 2473 } 2474 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2475 bool Res = 2476 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 2477 assert(!Res && "Call operand has unhandled type"); 2478 (void)Res; 2479 } 2480 assert(ArgLocs.size() == Ins.size()); 2481 SmallVector<SDValue, 16> ArgValues; 2482 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2483 CCValAssign &VA = ArgLocs[i]; 2484 2485 if (Ins[i].Flags.isByVal()) { 2486 // Byval is used for HFAs in the PCS, but the system should work in a 2487 // non-compliant manner for larger structs. 2488 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2489 int Size = Ins[i].Flags.getByValSize(); 2490 unsigned NumRegs = (Size + 7) / 8; 2491 2492 // FIXME: This works on big-endian for composite byvals, which are the common 2493 // case. It should also work for fundamental types too. 2494 unsigned FrameIdx = 2495 MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 2496 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); 2497 InVals.push_back(FrameIdxN); 2498 2499 continue; 2500 } 2501 2502 if (VA.isRegLoc()) { 2503 // Arguments stored in registers. 2504 EVT RegVT = VA.getLocVT(); 2505 2506 SDValue ArgValue; 2507 const TargetRegisterClass *RC; 2508 2509 if (RegVT == MVT::i32) 2510 RC = &AArch64::GPR32RegClass; 2511 else if (RegVT == MVT::i64) 2512 RC = &AArch64::GPR64RegClass; 2513 else if (RegVT == MVT::f16) 2514 RC = &AArch64::FPR16RegClass; 2515 else if (RegVT == MVT::f32) 2516 RC = &AArch64::FPR32RegClass; 2517 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 2518 RC = &AArch64::FPR64RegClass; 2519 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 2520 RC = &AArch64::FPR128RegClass; 2521 else 2522 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 2523 2524 // Transform the arguments in physical registers into virtual ones. 2525 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2526 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 2527 2528 // If this is an 8, 16 or 32-bit value, it is really passed promoted 2529 // to 64 bits. Insert an assert[sz]ext to capture this, then 2530 // truncate to the right size. 2531 switch (VA.getLocInfo()) { 2532 default: 2533 llvm_unreachable("Unknown loc info!"); 2534 case CCValAssign::Full: 2535 break; 2536 case CCValAssign::BCvt: 2537 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 2538 break; 2539 case CCValAssign::AExt: 2540 case CCValAssign::SExt: 2541 case CCValAssign::ZExt: 2542 // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt 2543 // nodes after our lowering. 2544 assert(RegVT == Ins[i].VT && "incorrect register location selected"); 2545 break; 2546 } 2547 2548 InVals.push_back(ArgValue); 2549 2550 } else { // VA.isRegLoc() 2551 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 2552 unsigned ArgOffset = VA.getLocMemOffset(); 2553 unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; 2554 2555 uint32_t BEAlign = 0; 2556 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 2557 !Ins[i].Flags.isInConsecutiveRegs()) 2558 BEAlign = 8 - ArgSize; 2559 2560 int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 2561 2562 // Create load nodes to retrieve arguments from the stack. 
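// For example (illustrative): an i8 argument passed on the stack is fetched
// with an extending load whose memory type is i8 and whose result type is the
// promoted LocVT (i32), using SEXTLOAD or ZEXTLOAD as recorded in
// VA.getLocInfo().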
2563 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 2564 SDValue ArgValue; 2565 2566 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 2567 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 2568 MVT MemVT = VA.getValVT(); 2569 2570 switch (VA.getLocInfo()) { 2571 default: 2572 break; 2573 case CCValAssign::BCvt: 2574 MemVT = VA.getLocVT(); 2575 break; 2576 case CCValAssign::SExt: 2577 ExtType = ISD::SEXTLOAD; 2578 break; 2579 case CCValAssign::ZExt: 2580 ExtType = ISD::ZEXTLOAD; 2581 break; 2582 case CCValAssign::AExt: 2583 ExtType = ISD::EXTLOAD; 2584 break; 2585 } 2586 2587 ArgValue = DAG.getExtLoad( 2588 ExtType, DL, VA.getLocVT(), Chain, FIN, 2589 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 2590 MemVT, false, false, false, 0); 2591 2592 InVals.push_back(ArgValue); 2593 } 2594 } 2595 2596 // varargs 2597 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2598 if (isVarArg) { 2599 if (!Subtarget->isTargetDarwin()) { 2600 // The AAPCS variadic function ABI is identical to the non-variadic 2601 // one. As a result there may be more arguments in registers and we should 2602 // save them for future reference. 2603 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 2604 } 2605 2606 // This will point to the next argument passed via stack. 2607 unsigned StackOffset = CCInfo.getNextStackOffset(); 2608 // We currently pass all varargs at 8-byte alignment. 2609 StackOffset = ((StackOffset + 7) & ~7); 2610 FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); 2611 } 2612 2613 unsigned StackArgSize = CCInfo.getNextStackOffset(); 2614 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2615 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 2616 // This is a non-standard ABI so by fiat I say we're allowed to make full 2617 // use of the stack area to be popped, which must be aligned to 16 bytes in 2618 // any case: 2619 StackArgSize = alignTo(StackArgSize, 16); 2620 2621 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 2622 // a multiple of 16. 2623 FuncInfo->setArgumentStackToRestore(StackArgSize); 2624 2625 // This realignment carries over to the available bytes below. Our own 2626 // callers will guarantee the space is free by giving an aligned value to 2627 // CALLSEQ_START. 2628 } 2629 // Even if we're not expected to free up the space, it's useful to know how 2630 // much is there while considering tail calls (because we can reuse it). 
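// For example (illustrative): under GuaranteedTailCallOpt, a fastcc function
// that received 24 bytes of stack arguments records 32 here (rounded up to a
// 16-byte multiple), and tail calls it makes can reuse that area without
// growing the stack.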
2631 FuncInfo->setBytesInStackArgArea(StackArgSize); 2632 2633 return Chain; 2634 } 2635 2636 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 2637 SelectionDAG &DAG, 2638 const SDLoc &DL, 2639 SDValue &Chain) const { 2640 MachineFunction &MF = DAG.getMachineFunction(); 2641 MachineFrameInfo *MFI = MF.getFrameInfo(); 2642 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2643 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2644 2645 SmallVector<SDValue, 8> MemOps; 2646 2647 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 2648 AArch64::X3, AArch64::X4, AArch64::X5, 2649 AArch64::X6, AArch64::X7 }; 2650 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); 2651 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); 2652 2653 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 2654 int GPRIdx = 0; 2655 if (GPRSaveSize != 0) { 2656 GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); 2657 2658 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); 2659 2660 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 2661 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 2662 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 2663 SDValue Store = DAG.getStore( 2664 Val.getValue(1), DL, Val, FIN, 2665 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, 2666 false, 0); 2667 MemOps.push_back(Store); 2668 FIN = 2669 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); 2670 } 2671 } 2672 FuncInfo->setVarArgsGPRIndex(GPRIdx); 2673 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 2674 2675 if (Subtarget->hasFPARMv8()) { 2676 static const MCPhysReg FPRArgRegs[] = { 2677 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 2678 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 2679 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); 2680 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); 2681 2682 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 2683 int FPRIdx = 0; 2684 if (FPRSaveSize != 0) { 2685 FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); 2686 2687 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); 2688 2689 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 2690 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 2691 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 2692 2693 SDValue Store = DAG.getStore( 2694 Val.getValue(1), DL, Val, FIN, 2695 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), 2696 false, false, 0); 2697 MemOps.push_back(Store); 2698 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, 2699 DAG.getConstant(16, DL, PtrVT)); 2700 } 2701 } 2702 FuncInfo->setVarArgsFPRIndex(FPRIdx); 2703 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 2704 } 2705 2706 if (!MemOps.empty()) { 2707 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 2708 } 2709 } 2710 2711 /// LowerCallResult - Lower the result values of a call into the 2712 /// appropriate copies out of appropriate physical registers. 2713 SDValue AArch64TargetLowering::LowerCallResult( 2714 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2715 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 2716 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2717 SDValue ThisVal) const { 2718 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 2719 ? 
RetCC_AArch64_WebKit_JS 2720 : RetCC_AArch64_AAPCS; 2721 // Assign locations to each value returned by this call. 2722 SmallVector<CCValAssign, 16> RVLocs; 2723 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2724 *DAG.getContext()); 2725 CCInfo.AnalyzeCallResult(Ins, RetCC); 2726 2727 // Copy all of the result registers out of their specified physreg. 2728 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2729 CCValAssign VA = RVLocs[i]; 2730 2731 // Pass 'this' value directly from the argument to return value, to avoid 2732 // reg unit interference 2733 if (i == 0 && isThisReturn) { 2734 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 2735 "unexpected return calling convention register assignment"); 2736 InVals.push_back(ThisVal); 2737 continue; 2738 } 2739 2740 SDValue Val = 2741 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 2742 Chain = Val.getValue(1); 2743 InFlag = Val.getValue(2); 2744 2745 switch (VA.getLocInfo()) { 2746 default: 2747 llvm_unreachable("Unknown loc info!"); 2748 case CCValAssign::Full: 2749 break; 2750 case CCValAssign::BCvt: 2751 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 2752 break; 2753 } 2754 2755 InVals.push_back(Val); 2756 } 2757 2758 return Chain; 2759 } 2760 2761 bool AArch64TargetLowering::isEligibleForTailCallOptimization( 2762 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2763 const SmallVectorImpl<ISD::OutputArg> &Outs, 2764 const SmallVectorImpl<SDValue> &OutVals, 2765 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 2766 // For CallingConv::C this function knows whether the ABI needs 2767 // changing. That's not true for other conventions so they will have to opt in 2768 // manually. 2769 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 2770 return false; 2771 2772 MachineFunction &MF = DAG.getMachineFunction(); 2773 const Function *CallerF = MF.getFunction(); 2774 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2775 bool CCMatch = CallerCC == CalleeCC; 2776 2777 // Byval parameters hand the function a pointer directly into the stack area 2778 // we want to reuse during a tail call. Working around this *is* possible (see 2779 // X86) but less efficient and uglier in LowerCall. 2780 for (Function::const_arg_iterator i = CallerF->arg_begin(), 2781 e = CallerF->arg_end(); 2782 i != e; ++i) 2783 if (i->hasByValAttr()) 2784 return false; 2785 2786 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2787 return IsTailCallConvention(CalleeCC) && CCMatch; 2788 } 2789 2790 // Externally-defined functions with weak linkage should not be 2791 // tail-called on AArch64 when the OS does not support dynamic 2792 // pre-emption of symbols, as the AAELF spec requires normal calls 2793 // to undefined weak functions to be replaced with a NOP or jump to the 2794 // next instruction. The behaviour of branch instructions in this 2795 // situation (as used for tail calls) is implementation-defined, so we 2796 // cannot rely on the linker replacing the tail call with a return. 2797 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2798 const GlobalValue *GV = G->getGlobal(); 2799 const Triple &TT = getTargetMachine().getTargetTriple(); 2800 if (GV->hasExternalWeakLinkage() && 2801 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2802 return false; 2803 } 2804 2805 // Now we search for cases where we can use a tail call without changing the 2806 // ABI. 
Sibcall is used in some places (particularly gcc) to refer to this 2807 // concept. 2808 2809 // I want anyone implementing a new calling convention to think long and hard 2810 // about this assert. 2811 assert((!isVarArg || CalleeCC == CallingConv::C) && 2812 "Unexpected variadic calling convention"); 2813 2814 LLVMContext &C = *DAG.getContext(); 2815 if (isVarArg && !Outs.empty()) { 2816 // At least two cases here: if caller is fastcc then we can't have any 2817 // memory arguments (we'd be expected to clean up the stack afterwards). If 2818 // caller is C then we could potentially use its argument area. 2819 2820 // FIXME: for now we take the most conservative of these in both cases: 2821 // disallow all variadic memory operands. 2822 SmallVector<CCValAssign, 16> ArgLocs; 2823 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2824 2825 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); 2826 for (const CCValAssign &ArgLoc : ArgLocs) 2827 if (!ArgLoc.isRegLoc()) 2828 return false; 2829 } 2830 2831 // Check that the call results are passed in the same way. 2832 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2833 CCAssignFnForCall(CalleeCC, isVarArg), 2834 CCAssignFnForCall(CallerCC, isVarArg))) 2835 return false; 2836 // The callee has to preserve all registers the caller needs to preserve. 2837 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 2838 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2839 if (!CCMatch) { 2840 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2841 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2842 return false; 2843 } 2844 2845 // Nothing more to check if the callee is taking no arguments 2846 if (Outs.empty()) 2847 return true; 2848 2849 SmallVector<CCValAssign, 16> ArgLocs; 2850 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2851 2852 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2853 2854 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2855 2856 // If the stack arguments for this call do not fit into our own save area then 2857 // the call cannot be made tail. 2858 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) 2859 return false; 2860 2861 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2862 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2863 return false; 2864 2865 return true; 2866 } 2867 2868 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 2869 SelectionDAG &DAG, 2870 MachineFrameInfo *MFI, 2871 int ClobberedFI) const { 2872 SmallVector<SDValue, 8> ArgChains; 2873 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); 2874 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; 2875 2876 // Include the original chain at the beginning of the list. When this is 2877 // used by target LowerCall hooks, this helps legalize find the 2878 // CALLSEQ_BEGIN node. 
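// For example (illustrative): if this tail call writes an outgoing argument
// into the fixed stack slot holding one of our own incoming arguments, any
// earlier load from that slot contributes its chain here, so the clobbering
// store cannot be scheduled ahead of that load.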
2879 ArgChains.push_back(Chain); 2880 2881 // Add a chain value for each stack argument corresponding 2882 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 2883 UE = DAG.getEntryNode().getNode()->use_end(); 2884 U != UE; ++U) 2885 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 2886 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 2887 if (FI->getIndex() < 0) { 2888 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); 2889 int64_t InLastByte = InFirstByte; 2890 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; 2891 2892 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 2893 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 2894 ArgChains.push_back(SDValue(L, 1)); 2895 } 2896 2897 // Build a tokenfactor for all the chains. 2898 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 2899 } 2900 2901 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 2902 bool TailCallOpt) const { 2903 return CallCC == CallingConv::Fast && TailCallOpt; 2904 } 2905 2906 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { 2907 return CallCC == CallingConv::Fast || 2908 CallCC == CallingConv::PreserveMost; 2909 } 2910 2911 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, 2912 /// and add input and output parameter nodes. 2913 SDValue 2914 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 2915 SmallVectorImpl<SDValue> &InVals) const { 2916 SelectionDAG &DAG = CLI.DAG; 2917 SDLoc &DL = CLI.DL; 2918 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 2919 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 2920 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 2921 SDValue Chain = CLI.Chain; 2922 SDValue Callee = CLI.Callee; 2923 bool &IsTailCall = CLI.IsTailCall; 2924 CallingConv::ID CallConv = CLI.CallConv; 2925 bool IsVarArg = CLI.IsVarArg; 2926 2927 MachineFunction &MF = DAG.getMachineFunction(); 2928 bool IsThisReturn = false; 2929 2930 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2931 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2932 bool IsSibCall = false; 2933 2934 if (IsTailCall) { 2935 // Check if it's really possible to do a tail call. 2936 IsTailCall = isEligibleForTailCallOptimization( 2937 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); 2938 if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) 2939 report_fatal_error("failed to perform tail call elimination on a call " 2940 "site marked musttail"); 2941 2942 // A sibling call is one where we're under the usual C ABI and not planning 2943 // to change that but can still do a tail call: 2944 if (!TailCallOpt && IsTailCall) 2945 IsSibCall = true; 2946 2947 if (IsTailCall) 2948 ++NumTailCalls; 2949 } 2950 2951 // Analyze operands of the call, assigning locations to each operand. 2952 SmallVector<CCValAssign, 16> ArgLocs; 2953 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, 2954 *DAG.getContext()); 2955 2956 if (IsVarArg) { 2957 // Handle fixed and variable vector arguments differently. 2958 // Variable vector arguments always go into memory. 
2959 unsigned NumArgs = Outs.size(); 2960 2961 for (unsigned i = 0; i != NumArgs; ++i) { 2962 MVT ArgVT = Outs[i].VT; 2963 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 2964 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, 2965 /*IsVarArg=*/ !Outs[i].IsFixed); 2966 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); 2967 assert(!Res && "Call operand has unhandled type"); 2968 (void)Res; 2969 } 2970 } else { 2971 // At this point, Outs[].VT may already be promoted to i32. To correctly 2972 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2973 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2974 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here 2975 // we use a special version of AnalyzeCallOperands to pass in ValVT and 2976 // LocVT. 2977 unsigned NumArgs = Outs.size(); 2978 for (unsigned i = 0; i != NumArgs; ++i) { 2979 MVT ValVT = Outs[i].VT; 2980 // Get type of the original argument. 2981 EVT ActualVT = getValueType(DAG.getDataLayout(), 2982 CLI.getArgs()[Outs[i].OrigArgIndex].Ty, 2983 /*AllowUnknown*/ true); 2984 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; 2985 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 2986 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 2987 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 2988 ValVT = MVT::i8; 2989 else if (ActualMVT == MVT::i16) 2990 ValVT = MVT::i16; 2991 2992 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2993 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); 2994 assert(!Res && "Call operand has unhandled type"); 2995 (void)Res; 2996 } 2997 } 2998 2999 // Get a count of how many bytes are to be pushed on the stack. 3000 unsigned NumBytes = CCInfo.getNextStackOffset(); 3001 3002 if (IsSibCall) { 3003 // Since we're not changing the ABI to make this a tail call, the memory 3004 // operands are already available in the caller's incoming argument space. 3005 NumBytes = 0; 3006 } 3007 3008 // FPDiff is the byte offset of the call's argument area from the callee's. 3009 // Stores to callee stack arguments will be placed in FixedStackSlots offset 3010 // by this amount for a tail call. In a sibling call it must be 0 because the 3011 // caller will deallocate the entire stack and the callee still expects its 3012 // arguments to begin at SP+0. Completely unused for non-tail calls. 3013 int FPDiff = 0; 3014 3015 if (IsTailCall && !IsSibCall) { 3016 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 3017 3018 // Since callee will pop argument stack as a tail call, we must keep the 3019 // popped size 16-byte aligned. 3020 NumBytes = alignTo(NumBytes, 16); 3021 3022 // FPDiff will be negative if this tail call requires more space than we 3023 // would automatically have in our incoming argument space. Positive if we 3024 // can actually shrink the stack. 3025 FPDiff = NumReusableBytes - NumBytes; 3026 3027 // The stack pointer must be 16-byte aligned at all times it's used for a 3028 // memory operation, which in practice means at *all* times and in 3029 // particular across call boundaries. Therefore our own arguments started at 3030 // a 16-byte aligned SP and the delta applied for the tail call should 3031 // satisfy the same constraint. 3032 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 3033 } 3034 3035 // Adjust the stack pointer for the new arguments... 
3036 // These operations are automatically eliminated by the prolog/epilog pass 3037 if (!IsSibCall) 3038 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, 3039 true), 3040 DL); 3041 3042 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 3043 getPointerTy(DAG.getDataLayout())); 3044 3045 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 3046 SmallVector<SDValue, 8> MemOpChains; 3047 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3048 3049 // Walk the register/memloc assignments, inserting copies/loads. 3050 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; 3051 ++i, ++realArgIdx) { 3052 CCValAssign &VA = ArgLocs[i]; 3053 SDValue Arg = OutVals[realArgIdx]; 3054 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 3055 3056 // Promote the value if needed. 3057 switch (VA.getLocInfo()) { 3058 default: 3059 llvm_unreachable("Unknown loc info!"); 3060 case CCValAssign::Full: 3061 break; 3062 case CCValAssign::SExt: 3063 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3064 break; 3065 case CCValAssign::ZExt: 3066 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3067 break; 3068 case CCValAssign::AExt: 3069 if (Outs[realArgIdx].ArgVT == MVT::i1) { 3070 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 3071 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 3072 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 3073 } 3074 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3075 break; 3076 case CCValAssign::BCvt: 3077 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3078 break; 3079 case CCValAssign::FPExt: 3080 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 3081 break; 3082 } 3083 3084 if (VA.isRegLoc()) { 3085 if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { 3086 assert(VA.getLocVT() == MVT::i64 && 3087 "unexpected calling convention register assignment"); 3088 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 3089 "unexpected use of 'returned'"); 3090 IsThisReturn = true; 3091 } 3092 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 3093 } else { 3094 assert(VA.isMemLoc()); 3095 3096 SDValue DstAddr; 3097 MachinePointerInfo DstInfo; 3098 3099 // FIXME: This works on big-endian for composite byvals, which are the 3100 // common case. It should also work for fundamental types too. 3101 uint32_t BEAlign = 0; 3102 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 3103 : VA.getValVT().getSizeInBits(); 3104 OpSize = (OpSize + 7) / 8; 3105 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 3106 !Flags.isInConsecutiveRegs()) { 3107 if (OpSize < 8) 3108 BEAlign = 8 - OpSize; 3109 } 3110 unsigned LocMemOffset = VA.getLocMemOffset(); 3111 int32_t Offset = LocMemOffset + BEAlign; 3112 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3113 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3114 3115 if (IsTailCall) { 3116 Offset = Offset + FPDiff; 3117 int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 3118 3119 DstAddr = DAG.getFrameIndex(FI, PtrVT); 3120 DstInfo = 3121 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 3122 3123 // Make sure any stack arguments overlapping with where we're storing 3124 // are loaded before this eventual operation. Otherwise they'll be 3125 // clobbered. 
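          // A typical case is a function that forwards one of its own
          // stack-passed arguments to the same slot of the tail callee: the
          // incoming load and the outgoing store touch the same bytes, so the
          // load must be chained ahead of the store.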
3126 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 3127 } else { 3128 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3129 3130 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3131 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), 3132 LocMemOffset); 3133 } 3134 3135 if (Outs[i].Flags.isByVal()) { 3136 SDValue SizeNode = 3137 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 3138 SDValue Cpy = DAG.getMemcpy( 3139 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), 3140 /*isVol = */ false, /*AlwaysInline = */ false, 3141 /*isTailCall = */ false, 3142 DstInfo, MachinePointerInfo()); 3143 3144 MemOpChains.push_back(Cpy); 3145 } else { 3146 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 3147 // promoted to a legal register type i32, we should truncate Arg back to 3148 // i1/i8/i16. 3149 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 3150 VA.getValVT() == MVT::i16) 3151 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 3152 3153 SDValue Store = 3154 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); 3155 MemOpChains.push_back(Store); 3156 } 3157 } 3158 } 3159 3160 if (!MemOpChains.empty()) 3161 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 3162 3163 // Build a sequence of copy-to-reg nodes chained together with token chain 3164 // and flag operands which copy the outgoing args into the appropriate regs. 3165 SDValue InFlag; 3166 for (auto &RegToPass : RegsToPass) { 3167 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 3168 RegToPass.second, InFlag); 3169 InFlag = Chain.getValue(1); 3170 } 3171 3172 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 3173 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 3174 // node so that legalize doesn't hack it. 3175 if (getTargetMachine().getCodeModel() == CodeModel::Large && 3176 Subtarget->isTargetMachO()) { 3177 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3178 const GlobalValue *GV = G->getGlobal(); 3179 bool InternalLinkage = GV->hasInternalLinkage(); 3180 if (InternalLinkage) 3181 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 3182 else { 3183 Callee = 3184 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); 3185 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 3186 } 3187 } else if (ExternalSymbolSDNode *S = 3188 dyn_cast<ExternalSymbolSDNode>(Callee)) { 3189 const char *Sym = S->getSymbol(); 3190 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); 3191 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 3192 } 3193 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3194 const GlobalValue *GV = G->getGlobal(); 3195 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 3196 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 3197 const char *Sym = S->getSymbol(); 3198 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); 3199 } 3200 3201 // We don't usually want to end the call-sequence here because we would tidy 3202 // the frame up *after* the call, however in the ABI-changing tail-call case 3203 // we've carefully laid out the parameters so that when sp is reset they'll be 3204 // in the correct location. 
3205 if (IsTailCall && !IsSibCall) {
3206 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3207 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3208 InFlag = Chain.getValue(1);
3209 }
3210
3211 std::vector<SDValue> Ops;
3212 Ops.push_back(Chain);
3213 Ops.push_back(Callee);
3214
3215 if (IsTailCall) {
3216 // Each tail call may have to adjust the stack by a different amount, so
3217 // this information must travel along with the operation for eventual
3218 // consumption by emitEpilogue.
3219 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3220 }
3221
3222 // Add argument registers to the end of the list so that they are known live
3223 // into the call.
3224 for (auto &RegToPass : RegsToPass)
3225 Ops.push_back(DAG.getRegister(RegToPass.first,
3226 RegToPass.second.getValueType()));
3227
3228 // Add a register mask operand representing the call-preserved registers.
3229 const uint32_t *Mask;
3230 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3231 if (IsThisReturn) {
3232 // For 'this' returns, use the X0-preserving mask if applicable
3233 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3234 if (!Mask) {
3235 IsThisReturn = false;
3236 Mask = TRI->getCallPreservedMask(MF, CallConv);
3237 }
3238 } else
3239 Mask = TRI->getCallPreservedMask(MF, CallConv);
3240
3241 assert(Mask && "Missing call preserved mask for calling convention");
3242 Ops.push_back(DAG.getRegisterMask(Mask));
3243
3244 if (InFlag.getNode())
3245 Ops.push_back(InFlag);
3246
3247 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3248
3249 // If we're doing a tail call, use a TC_RETURN here rather than an
3250 // actual call instruction.
3251 if (IsTailCall) {
3252 MF.getFrameInfo()->setHasTailCall();
3253 return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3254 }
3255
3256 // Returns a chain and a flag for retval copy to use.
3257 Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3258 InFlag = Chain.getValue(1);
3259
3260 uint64_t CalleePopBytes =
3261 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3262
3263 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3264 DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3265 InFlag, DL);
3266 if (!Ins.empty())
3267 InFlag = Chain.getValue(1);
3268
3269 // Handle result values, copying them out of physregs into vregs that we
3270 // return.
3271 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3272 InVals, IsThisReturn,
3273 IsThisReturn ? OutVals[0] : SDValue());
3274 }
3275
3276 bool AArch64TargetLowering::CanLowerReturn(
3277 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3278 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3279 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3280 ? RetCC_AArch64_WebKit_JS
3281 : RetCC_AArch64_AAPCS;
3282 SmallVector<CCValAssign, 16> RVLocs;
3283 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3284 return CCInfo.CheckReturn(Outs, RetCC);
3285 }
3286
3287 SDValue
3288 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3289 bool isVarArg,
3290 const SmallVectorImpl<ISD::OutputArg> &Outs,
3291 const SmallVectorImpl<SDValue> &OutVals,
3292 const SDLoc &DL, SelectionDAG &DAG) const {
3293 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3294 ? RetCC_AArch64_WebKit_JS
3295 : RetCC_AArch64_AAPCS;
3296 SmallVector<CCValAssign, 16> RVLocs;
3297 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3298 *DAG.getContext());
3299 CCInfo.AnalyzeReturn(Outs, RetCC);
3300
3301 // Copy the result values into the output registers.
3302 SDValue Flag;
3303 SmallVector<SDValue, 4> RetOps(1, Chain);
3304 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3305 ++i, ++realRVLocIdx) {
3306 CCValAssign &VA = RVLocs[i];
3307 assert(VA.isRegLoc() && "Can only return in registers!");
3308 SDValue Arg = OutVals[realRVLocIdx];
3309
3310 switch (VA.getLocInfo()) {
3311 default:
3312 llvm_unreachable("Unknown loc info!");
3313 case CCValAssign::Full:
3314 if (Outs[i].ArgVT == MVT::i1) {
3315 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3316 // value. This is strictly redundant on Darwin (which uses "zeroext
3317 // i1"), but will be optimised out before ISel.
3318 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3319 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3320 }
3321 break;
3322 case CCValAssign::BCvt:
3323 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3324 break;
3325 }
3326
3327 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3328 Flag = Chain.getValue(1);
3329 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3330 }
3331 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3332 const MCPhysReg *I =
3333 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3334 if (I) {
3335 for (; *I; ++I) {
3336 if (AArch64::GPR64RegClass.contains(*I))
3337 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3338 else if (AArch64::FPR64RegClass.contains(*I))
3339 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3340 else
3341 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3342 }
3343 }
3344
3345 RetOps[0] = Chain; // Update chain.
3346
3347 // Add the flag if we have it.
3348 if (Flag.getNode())
3349 RetOps.push_back(Flag);
3350
3351 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3352 }
3353
3354 //===----------------------------------------------------------------------===//
3355 // Other Lowering Code
3356 //===----------------------------------------------------------------------===//
3357
3358 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
3359 SelectionDAG &DAG) const {
3360 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3361 SDLoc DL(Op);
3362 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
3363 const GlobalValue *GV = GN->getGlobal();
3364 unsigned char OpFlags =
3365 Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
3366
3367 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
3368 "unexpected offset in global node");
3369
3370 // This also catches the large code model case for Darwin.
3371 if ((OpFlags & AArch64II::MO_GOT) != 0) {
3372 SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
3373 // FIXME: Once remat is capable of dealing with instructions with register
3374 // operands, expand this into two nodes instead of using a wrapper node.
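    // A LOADgot node is eventually matched to a GOT-relative access, roughly:
    //   adrp x0, :got:var
    //   ldr  x0, [x0, :got_lo12:var]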
3375 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); 3376 } 3377 3378 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 3379 const unsigned char MO_NC = AArch64II::MO_NC; 3380 return DAG.getNode( 3381 AArch64ISD::WrapperLarge, DL, PtrVT, 3382 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), 3383 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), 3384 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), 3385 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); 3386 } else { 3387 // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and 3388 // the only correct model on Darwin. 3389 SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 3390 OpFlags | AArch64II::MO_PAGE); 3391 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; 3392 SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); 3393 3394 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 3395 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 3396 } 3397 } 3398 3399 /// \brief Convert a TLS address reference into the correct sequence of loads 3400 /// and calls to compute the variable's address (for Darwin, currently) and 3401 /// return an SDValue containing the final node. 3402 3403 /// Darwin only has one TLS scheme which must be capable of dealing with the 3404 /// fully general situation, in the worst case. This means: 3405 /// + "extern __thread" declaration. 3406 /// + Defined in a possibly unknown dynamic library. 3407 /// 3408 /// The general system is that each __thread variable has a [3 x i64] descriptor 3409 /// which contains information used by the runtime to calculate the address. The 3410 /// only part of this the compiler needs to know about is the first xword, which 3411 /// contains a function pointer that must be called with the address of the 3412 /// entire descriptor in "x0". 3413 /// 3414 /// Since this descriptor may be in a different unit, in general even the 3415 /// descriptor must be accessed via an indirect load. The "ideal" code sequence 3416 /// is: 3417 /// adrp x0, _var@TLVPPAGE 3418 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor 3419 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, 3420 /// ; the function pointer 3421 /// blr x1 ; Uses descriptor address in x0 3422 /// ; Address of _var is now in x0. 3423 /// 3424 /// If the address of _var's descriptor *is* known to the linker, then it can 3425 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 3426 /// a slight efficiency gain. 3427 SDValue 3428 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 3429 SelectionDAG &DAG) const { 3430 assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); 3431 3432 SDLoc DL(Op); 3433 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 3434 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3435 3436 SDValue TLVPAddr = 3437 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3438 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 3439 3440 // The first entry in the descriptor is a function pointer that we must call 3441 // to obtain the address of the variable. 
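  // This load is the "ldr x1, [x0]" step of the sequence documented above:
  // it fetches the resolver function pointer from the start of the
  // descriptor whose address LOADgot just produced.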
3442 SDValue Chain = DAG.getEntryNode(); 3443 SDValue FuncTLVGet = 3444 DAG.getLoad(MVT::i64, DL, Chain, DescAddr, 3445 MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, 3446 true, true, 8); 3447 Chain = FuncTLVGet.getValue(1); 3448 3449 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3450 MFI->setAdjustsStack(true); 3451 3452 // TLS calls preserve all registers except those that absolutely must be 3453 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3454 // silly). 3455 const uint32_t *Mask = 3456 Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); 3457 3458 // Finally, we can make the call. This is just a degenerate version of a 3459 // normal AArch64 call node: x0 takes the address of the descriptor, and 3460 // returns the address of the variable in this thread. 3461 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 3462 Chain = 3463 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3464 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 3465 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3466 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 3467 } 3468 3469 /// When accessing thread-local variables under either the general-dynamic or 3470 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will 3471 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 3472 /// is a function pointer to carry out the resolution. 3473 /// 3474 /// The sequence is: 3475 /// adrp x0, :tlsdesc:var 3476 /// ldr x1, [x0, #:tlsdesc_lo12:var] 3477 /// add x0, x0, #:tlsdesc_lo12:var 3478 /// .tlsdesccall var 3479 /// blr x1 3480 /// (TPIDR_EL0 offset now in x0) 3481 /// 3482 /// The above sequence must be produced unscheduled, to enable the linker to 3483 /// optimize/relax this sequence. 3484 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 3485 /// above sequence, and expanded really late in the compilation flow, to ensure 3486 /// the sequence is produced as per above. 3487 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, 3488 const SDLoc &DL, 3489 SelectionDAG &DAG) const { 3490 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3491 3492 SDValue Chain = DAG.getEntryNode(); 3493 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3494 3495 Chain = 3496 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); 3497 SDValue Glue = Chain.getValue(1); 3498 3499 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 3500 } 3501 3502 SDValue 3503 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, 3504 SelectionDAG &DAG) const { 3505 assert(Subtarget->isTargetELF() && "This function expects an ELF target"); 3506 assert(getTargetMachine().getCodeModel() == CodeModel::Small && 3507 "ELF TLS only supported in small memory model"); 3508 // Different choices can be made for the maximum size of the TLS area for a 3509 // module. For the small address model, the default TLS size is 16MiB and the 3510 // maximum TLS size is 4GiB. 3511 // FIXME: add -mtls-size command line option and make it control the 16MiB 3512 // vs. 4GiB code sequence generation. 
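  // For reference, the 16MiB local-exec form emitted below needs only two
  // ADD instructions on top of the thread pointer (a HI12 piece followed by
  // a LO12 piece, matching the MO_HI12 and MO_PAGEOFF operand flags used
  // for LocalExec below); covering a full 4GiB offset would need a longer
  // materialization sequence instead.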
3513 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3514 3515 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 3516 3517 if (DAG.getTarget().Options.EmulatedTLS) 3518 return LowerToTLSEmulatedModel(GA, DAG); 3519 3520 if (!EnableAArch64ELFLocalDynamicTLSGeneration) { 3521 if (Model == TLSModel::LocalDynamic) 3522 Model = TLSModel::GeneralDynamic; 3523 } 3524 3525 SDValue TPOff; 3526 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3527 SDLoc DL(Op); 3528 const GlobalValue *GV = GA->getGlobal(); 3529 3530 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 3531 3532 if (Model == TLSModel::LocalExec) { 3533 SDValue HiVar = DAG.getTargetGlobalAddress( 3534 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 3535 SDValue LoVar = DAG.getTargetGlobalAddress( 3536 GV, DL, PtrVT, 0, 3537 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3538 3539 SDValue TPWithOff_lo = 3540 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 3541 HiVar, 3542 DAG.getTargetConstant(0, DL, MVT::i32)), 3543 0); 3544 SDValue TPWithOff = 3545 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, 3546 LoVar, 3547 DAG.getTargetConstant(0, DL, MVT::i32)), 3548 0); 3549 return TPWithOff; 3550 } else if (Model == TLSModel::InitialExec) { 3551 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3552 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); 3553 } else if (Model == TLSModel::LocalDynamic) { 3554 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 3555 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 3556 // the beginning of the module's TLS region, followed by a DTPREL offset 3557 // calculation. 3558 3559 // These accesses will need deduplicating if there's more than one. 3560 AArch64FunctionInfo *MFI = 3561 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 3562 MFI->incNumLocalDynamicTLSAccesses(); 3563 3564 // The call needs a relocation too for linker relaxation. It doesn't make 3565 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 3566 // the address. 3567 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 3568 AArch64II::MO_TLS); 3569 3570 // Now we can calculate the offset from TPIDR_EL0 to this module's 3571 // thread-local area. 3572 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 3573 3574 // Now use :dtprel_whatever: operations to calculate this variable's offset 3575 // in its thread-storage area. 3576 SDValue HiVar = DAG.getTargetGlobalAddress( 3577 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 3578 SDValue LoVar = DAG.getTargetGlobalAddress( 3579 GV, DL, MVT::i64, 0, 3580 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3581 3582 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, 3583 DAG.getTargetConstant(0, DL, MVT::i32)), 3584 0); 3585 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, 3586 DAG.getTargetConstant(0, DL, MVT::i32)), 3587 0); 3588 } else if (Model == TLSModel::GeneralDynamic) { 3589 // The call needs a relocation too for linker relaxation. It doesn't make 3590 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 3591 // the address. 3592 SDValue SymAddr = 3593 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3594 3595 // Finally we can make a call to calculate the offset from tpidr_el0. 
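    // The TLSDESC call sequence leaves the variable's offset from the thread
    // pointer in x0; the ADD at the end of this function combines that
    // offset with the THREAD_POINTER value to form the final address.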
3596 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 3597 } else 3598 llvm_unreachable("Unsupported ELF TLS access model"); 3599 3600 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 3601 } 3602 3603 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 3604 SelectionDAG &DAG) const { 3605 if (Subtarget->isTargetDarwin()) 3606 return LowerDarwinGlobalTLSAddress(Op, DAG); 3607 else if (Subtarget->isTargetELF()) 3608 return LowerELFGlobalTLSAddress(Op, DAG); 3609 3610 llvm_unreachable("Unexpected platform trying to use TLS"); 3611 } 3612 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 3613 SDValue Chain = Op.getOperand(0); 3614 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 3615 SDValue LHS = Op.getOperand(2); 3616 SDValue RHS = Op.getOperand(3); 3617 SDValue Dest = Op.getOperand(4); 3618 SDLoc dl(Op); 3619 3620 // Handle f128 first, since lowering it will result in comparing the return 3621 // value of a libcall against zero, which is just what the rest of LowerBR_CC 3622 // is expecting to deal with. 3623 if (LHS.getValueType() == MVT::f128) { 3624 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3625 3626 // If softenSetCCOperands returned a scalar, we need to compare the result 3627 // against zero to select between true and false values. 3628 if (!RHS.getNode()) { 3629 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 3630 CC = ISD::SETNE; 3631 } 3632 } 3633 3634 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 3635 // instruction. 3636 unsigned Opc = LHS.getOpcode(); 3637 if (LHS.getResNo() == 1 && isOneConstant(RHS) && 3638 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 3639 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 3640 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && 3641 "Unexpected condition code."); 3642 // Only lower legal XALUO ops. 3643 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 3644 return SDValue(); 3645 3646 // The actual operation with overflow check. 3647 AArch64CC::CondCode OFCC; 3648 SDValue Value, Overflow; 3649 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); 3650 3651 if (CC == ISD::SETNE) 3652 OFCC = getInvertedCondCode(OFCC); 3653 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); 3654 3655 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 3656 Overflow); 3657 } 3658 3659 if (LHS.getValueType().isInteger()) { 3660 assert((LHS.getValueType() == RHS.getValueType()) && 3661 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 3662 3663 // If the RHS of the comparison is zero, we can potentially fold this 3664 // to a specialized branch. 3665 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); 3666 if (RHSC && RHSC->getZExtValue() == 0) { 3667 if (CC == ISD::SETEQ) { 3668 // See if we can use a TBZ to fold in an AND as well. 3669 // TBZ has a smaller branch displacement than CBZ. If the offset is 3670 // out of bounds, a late MI-layer pass rewrites branches. 3671 // 403.gcc is an example that hits this case. 
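        // For example, (brcond (seteq (and x, 8), 0), dest) becomes a single
        //   tbz x, #3, dest
        // since testing equality with zero under a power-of-two mask is just
        // a test of that one bit.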
3672 if (LHS.getOpcode() == ISD::AND && 3673 isa<ConstantSDNode>(LHS.getOperand(1)) && 3674 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 3675 SDValue Test = LHS.getOperand(0); 3676 uint64_t Mask = LHS.getConstantOperandVal(1); 3677 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, 3678 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 3679 Dest); 3680 } 3681 3682 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); 3683 } else if (CC == ISD::SETNE) { 3684 // See if we can use a TBZ to fold in an AND as well. 3685 // TBZ has a smaller branch displacement than CBZ. If the offset is 3686 // out of bounds, a late MI-layer pass rewrites branches. 3687 // 403.gcc is an example that hits this case. 3688 if (LHS.getOpcode() == ISD::AND && 3689 isa<ConstantSDNode>(LHS.getOperand(1)) && 3690 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 3691 SDValue Test = LHS.getOperand(0); 3692 uint64_t Mask = LHS.getConstantOperandVal(1); 3693 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, 3694 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 3695 Dest); 3696 } 3697 3698 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); 3699 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { 3700 // Don't combine AND since emitComparison converts the AND to an ANDS 3701 // (a.k.a. TST) and the test in the test bit and branch instruction 3702 // becomes redundant. This would also increase register pressure. 3703 uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; 3704 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, 3705 DAG.getConstant(Mask, dl, MVT::i64), Dest); 3706 } 3707 } 3708 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && 3709 LHS.getOpcode() != ISD::AND) { 3710 // Don't combine AND since emitComparison converts the AND to an ANDS 3711 // (a.k.a. TST) and the test in the test bit and branch instruction 3712 // becomes redundant. This would also increase register pressure. 3713 uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; 3714 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, 3715 DAG.getConstant(Mask, dl, MVT::i64), Dest); 3716 } 3717 3718 SDValue CCVal; 3719 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 3720 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 3721 Cmp); 3722 } 3723 3724 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 3725 3726 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 3727 // clean. Some of them require two branches to implement. 
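  // For example, an ordered-and-not-equal (SETONE) condition has no single
  // AArch64 condition code; changeFPCCToAArch64CC reports a second condition
  // code (CC2 != AL) and we emit one conditional branch per code, with the
  // two branches effectively OR'ed together.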
3728 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 3729 AArch64CC::CondCode CC1, CC2; 3730 changeFPCCToAArch64CC(CC, CC1, CC2); 3731 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 3732 SDValue BR1 = 3733 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); 3734 if (CC2 != AArch64CC::AL) { 3735 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 3736 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, 3737 Cmp); 3738 } 3739 3740 return BR1; 3741 } 3742 3743 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, 3744 SelectionDAG &DAG) const { 3745 EVT VT = Op.getValueType(); 3746 SDLoc DL(Op); 3747 3748 SDValue In1 = Op.getOperand(0); 3749 SDValue In2 = Op.getOperand(1); 3750 EVT SrcVT = In2.getValueType(); 3751 3752 if (SrcVT.bitsLT(VT)) 3753 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); 3754 else if (SrcVT.bitsGT(VT)) 3755 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); 3756 3757 EVT VecVT; 3758 EVT EltVT; 3759 uint64_t EltMask; 3760 SDValue VecVal1, VecVal2; 3761 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { 3762 EltVT = MVT::i32; 3763 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); 3764 EltMask = 0x80000000ULL; 3765 3766 if (!VT.isVector()) { 3767 VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, 3768 DAG.getUNDEF(VecVT), In1); 3769 VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, 3770 DAG.getUNDEF(VecVT), In2); 3771 } else { 3772 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 3773 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 3774 } 3775 } else if (VT == MVT::f64 || VT == MVT::v2f64) { 3776 EltVT = MVT::i64; 3777 VecVT = MVT::v2i64; 3778 3779 // We want to materialize a mask with the high bit set, but the AdvSIMD 3780 // immediate moves cannot materialize that in a single instruction for 3781 // 64-bit elements. Instead, materialize zero and then negate it. 3782 EltMask = 0; 3783 3784 if (!VT.isVector()) { 3785 VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, 3786 DAG.getUNDEF(VecVT), In1); 3787 VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, 3788 DAG.getUNDEF(VecVT), In2); 3789 } else { 3790 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 3791 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 3792 } 3793 } else { 3794 llvm_unreachable("Invalid type for copysign!"); 3795 } 3796 3797 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); 3798 3799 // If we couldn't materialize the mask above, then the mask vector will be 3800 // the zero vector, and we need to negate it here. 
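  // FNEG of +0.0 simply sets the sign bit, so negating the all-zeroes vector
  // produces 0x8000000000000000 in each 64-bit lane, i.e. exactly the sign
  // mask that could not be built with an AdvSIMD immediate move.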
3801 if (VT == MVT::f64 || VT == MVT::v2f64) { 3802 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); 3803 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); 3804 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); 3805 } 3806 3807 SDValue Sel = 3808 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); 3809 3810 if (VT == MVT::f32) 3811 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); 3812 else if (VT == MVT::f64) 3813 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); 3814 else 3815 return DAG.getNode(ISD::BITCAST, DL, VT, Sel); 3816 } 3817 3818 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { 3819 if (DAG.getMachineFunction().getFunction()->hasFnAttribute( 3820 Attribute::NoImplicitFloat)) 3821 return SDValue(); 3822 3823 if (!Subtarget->hasNEON()) 3824 return SDValue(); 3825 3826 // While there is no integer popcount instruction, it can 3827 // be more efficiently lowered to the following sequence that uses 3828 // AdvSIMD registers/instructions as long as the copies to/from 3829 // the AdvSIMD registers are cheap. 3830 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 3831 // CNT V0.8B, V0.8B // 8xbyte pop-counts 3832 // ADDV B0, V0.8B // sum 8xbyte pop-counts 3833 // UMOV X0, V0.B[0] // copy byte result back to integer reg 3834 SDValue Val = Op.getOperand(0); 3835 SDLoc DL(Op); 3836 EVT VT = Op.getValueType(); 3837 3838 if (VT == MVT::i32) 3839 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 3840 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 3841 3842 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 3843 SDValue UaddLV = DAG.getNode( 3844 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 3845 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 3846 3847 if (VT == MVT::i64) 3848 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 3849 return UaddLV; 3850 } 3851 3852 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 3853 3854 if (Op.getValueType().isVector()) 3855 return LowerVSETCC(Op, DAG); 3856 3857 SDValue LHS = Op.getOperand(0); 3858 SDValue RHS = Op.getOperand(1); 3859 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 3860 SDLoc dl(Op); 3861 3862 // We chose ZeroOrOneBooleanContents, so use zero and one. 3863 EVT VT = Op.getValueType(); 3864 SDValue TVal = DAG.getConstant(1, dl, VT); 3865 SDValue FVal = DAG.getConstant(0, dl, VT); 3866 3867 // Handle f128 first, since one possible outcome is a normal integer 3868 // comparison which gets picked up by the next if statement. 3869 if (LHS.getValueType() == MVT::f128) { 3870 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3871 3872 // If softenSetCCOperands returned a scalar, use it. 3873 if (!RHS.getNode()) { 3874 assert(LHS.getValueType() == Op.getValueType() && 3875 "Unexpected setcc expansion!"); 3876 return LHS; 3877 } 3878 } 3879 3880 if (LHS.getValueType().isInteger()) { 3881 SDValue CCVal; 3882 SDValue Cmp = 3883 getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); 3884 3885 // Note that we inverted the condition above, so we reverse the order of 3886 // the true and false operands here. This will allow the setcc to be 3887 // matched to a single CSINC instruction. 3888 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); 3889 } 3890 3891 // Now we know we're dealing with FP values. 
3892 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3893
3894 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
3895 // and do the comparison.
3896 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3897
3898 AArch64CC::CondCode CC1, CC2;
3899 changeFPCCToAArch64CC(CC, CC1, CC2);
3900 if (CC2 == AArch64CC::AL) {
3901 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
3902 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
3903
3904 // Note that we inverted the condition above, so we reverse the order of
3905 // the true and false operands here. This will allow the setcc to be
3906 // matched to a single CSINC instruction.
3907 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
3908 } else {
3909 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
3910 // totally clean. Some of them require two CSELs to implement. As is in
3911 // this case, we emit the first CSEL and then emit a second using the output
3912 // of the first as the RHS. We're effectively OR'ing the two CC's together.
3913
3914 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
3915 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
3916 SDValue CS1 =
3917 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
3918
3919 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
3920 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
3921 }
3922 }
3923
3924 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
3925 SDValue RHS, SDValue TVal,
3926 SDValue FVal, const SDLoc &dl,
3927 SelectionDAG &DAG) const {
3928 // Handle f128 first, because it will result in a comparison of some RTLIB
3929 // call result against zero.
3930 if (LHS.getValueType() == MVT::f128) {
3931 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
3932
3933 // If softenSetCCOperands returned a scalar, we need to compare the result
3934 // against zero to select between true and false values.
3935 if (!RHS.getNode()) {
3936 RHS = DAG.getConstant(0, dl, LHS.getValueType());
3937 CC = ISD::SETNE;
3938 }
3939 }
3940
3941 // Also handle f16, for which we need to do an f32 comparison.
3942 if (LHS.getValueType() == MVT::f16) {
3943 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3944 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3945 }
3946
3947 // Next, handle integers.
3948 if (LHS.getValueType().isInteger()) {
3949 assert((LHS.getValueType() == RHS.getValueType()) &&
3950 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
3951
3952 unsigned Opcode = AArch64ISD::CSEL;
3953
3954 // If both the TVal and the FVal are constants, see if we can swap them in
3955 // order to form a CSINV or CSINC out of them.
3956 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3957 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3958
3959 if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
3960 std::swap(TVal, FVal);
3961 std::swap(CTVal, CFVal);
3962 CC = ISD::getSetCCInverse(CC, true);
3963 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
3964 std::swap(TVal, FVal);
3965 std::swap(CTVal, CFVal);
3966 CC = ISD::getSetCCInverse(CC, true);
3967 } else if (TVal.getOpcode() == ISD::XOR) {
3968 // If TVal is a NOT we want to swap TVal and FVal so that we can match
3969 // with a CSINV rather than a CSEL.
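      // For example, select(c, xor(x, -1), y) is rewritten as
      // select(!c, y, xor(x, -1)); with the NOT now in the false position it
      // maps onto CSINV, which returns the bitwise NOT of its second source
      // register when the condition is false, so no separate MVN is needed.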
3970 if (isAllOnesConstant(TVal.getOperand(1))) { 3971 std::swap(TVal, FVal); 3972 std::swap(CTVal, CFVal); 3973 CC = ISD::getSetCCInverse(CC, true); 3974 } 3975 } else if (TVal.getOpcode() == ISD::SUB) { 3976 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so 3977 // that we can match with a CSNEG rather than a CSEL. 3978 if (isNullConstant(TVal.getOperand(0))) { 3979 std::swap(TVal, FVal); 3980 std::swap(CTVal, CFVal); 3981 CC = ISD::getSetCCInverse(CC, true); 3982 } 3983 } else if (CTVal && CFVal) { 3984 const int64_t TrueVal = CTVal->getSExtValue(); 3985 const int64_t FalseVal = CFVal->getSExtValue(); 3986 bool Swap = false; 3987 3988 // If both TVal and FVal are constants, see if FVal is the 3989 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC 3990 // instead of a CSEL in that case. 3991 if (TrueVal == ~FalseVal) { 3992 Opcode = AArch64ISD::CSINV; 3993 } else if (TrueVal == -FalseVal) { 3994 Opcode = AArch64ISD::CSNEG; 3995 } else if (TVal.getValueType() == MVT::i32) { 3996 // If our operands are only 32-bit wide, make sure we use 32-bit 3997 // arithmetic for the check whether we can use CSINC. This ensures that 3998 // the addition in the check will wrap around properly in case there is 3999 // an overflow (which would not be the case if we do the check with 4000 // 64-bit arithmetic). 4001 const uint32_t TrueVal32 = CTVal->getZExtValue(); 4002 const uint32_t FalseVal32 = CFVal->getZExtValue(); 4003 4004 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { 4005 Opcode = AArch64ISD::CSINC; 4006 4007 if (TrueVal32 > FalseVal32) { 4008 Swap = true; 4009 } 4010 } 4011 // 64-bit check whether we can use CSINC. 4012 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { 4013 Opcode = AArch64ISD::CSINC; 4014 4015 if (TrueVal > FalseVal) { 4016 Swap = true; 4017 } 4018 } 4019 4020 // Swap TVal and FVal if necessary. 4021 if (Swap) { 4022 std::swap(TVal, FVal); 4023 std::swap(CTVal, CFVal); 4024 CC = ISD::getSetCCInverse(CC, true); 4025 } 4026 4027 if (Opcode != AArch64ISD::CSEL) { 4028 // Drop FVal since we can get its value by simply inverting/negating 4029 // TVal. 4030 FVal = TVal; 4031 } 4032 } 4033 4034 SDValue CCVal; 4035 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 4036 4037 EVT VT = TVal.getValueType(); 4038 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); 4039 } 4040 4041 // Now we know we're dealing with FP values. 4042 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 4043 assert(LHS.getValueType() == RHS.getValueType()); 4044 EVT VT = TVal.getValueType(); 4045 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 4046 4047 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 4048 // clean. Some of them require two CSELs to implement. 4049 AArch64CC::CondCode CC1, CC2; 4050 changeFPCCToAArch64CC(CC, CC1, CC2); 4051 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 4052 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 4053 4054 // If we need a second CSEL, emit it, using the output of the first as the 4055 // RHS. We're effectively OR'ing the two CC's together. 4056 if (CC2 != AArch64CC::AL) { 4057 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 4058 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 4059 } 4060 4061 // Otherwise, return the output of the first CSEL. 
4062 return CS1; 4063 } 4064 4065 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, 4066 SelectionDAG &DAG) const { 4067 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4068 SDValue LHS = Op.getOperand(0); 4069 SDValue RHS = Op.getOperand(1); 4070 SDValue TVal = Op.getOperand(2); 4071 SDValue FVal = Op.getOperand(3); 4072 SDLoc DL(Op); 4073 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4074 } 4075 4076 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, 4077 SelectionDAG &DAG) const { 4078 SDValue CCVal = Op->getOperand(0); 4079 SDValue TVal = Op->getOperand(1); 4080 SDValue FVal = Op->getOperand(2); 4081 SDLoc DL(Op); 4082 4083 unsigned Opc = CCVal.getOpcode(); 4084 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select 4085 // instruction. 4086 if (CCVal.getResNo() == 1 && 4087 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4088 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 4089 // Only lower legal XALUO ops. 4090 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) 4091 return SDValue(); 4092 4093 AArch64CC::CondCode OFCC; 4094 SDValue Value, Overflow; 4095 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); 4096 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); 4097 4098 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, 4099 CCVal, Overflow); 4100 } 4101 4102 // Lower it the same way as we would lower a SELECT_CC node. 4103 ISD::CondCode CC; 4104 SDValue LHS, RHS; 4105 if (CCVal.getOpcode() == ISD::SETCC) { 4106 LHS = CCVal.getOperand(0); 4107 RHS = CCVal.getOperand(1); 4108 CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get(); 4109 } else { 4110 LHS = CCVal; 4111 RHS = DAG.getConstant(0, DL, CCVal.getValueType()); 4112 CC = ISD::SETNE; 4113 } 4114 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4115 } 4116 4117 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, 4118 SelectionDAG &DAG) const { 4119 // Jump table entries as PC relative offsets. No additional tweaking 4120 // is necessary here. Just get the address of the jump table. 4121 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4122 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4123 SDLoc DL(Op); 4124 4125 if (getTargetMachine().getCodeModel() == CodeModel::Large && 4126 !Subtarget->isTargetMachO()) { 4127 const unsigned char MO_NC = AArch64II::MO_NC; 4128 return DAG.getNode( 4129 AArch64ISD::WrapperLarge, DL, PtrVT, 4130 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), 4131 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), 4132 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), 4133 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 4134 AArch64II::MO_G0 | MO_NC)); 4135 } 4136 4137 SDValue Hi = 4138 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); 4139 SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 4140 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4141 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4142 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4143 } 4144 4145 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, 4146 SelectionDAG &DAG) const { 4147 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4148 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4149 SDLoc DL(Op); 4150 4151 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 4152 // Use the GOT for the large code model on iOS. 
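    // On MachO the constant-pool address is loaded through the GOT (a
    // LOADgot node, as for globals above); the non-MachO path below builds
    // the full 64-bit address from the four MO_G3..MO_G0 16-bit fragments
    // via a WrapperLarge node instead.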
4153 if (Subtarget->isTargetMachO()) { 4154 SDValue GotAddr = DAG.getTargetConstantPool( 4155 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 4156 AArch64II::MO_GOT); 4157 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); 4158 } 4159 4160 const unsigned char MO_NC = AArch64II::MO_NC; 4161 return DAG.getNode( 4162 AArch64ISD::WrapperLarge, DL, PtrVT, 4163 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4164 CP->getOffset(), AArch64II::MO_G3), 4165 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4166 CP->getOffset(), AArch64II::MO_G2 | MO_NC), 4167 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4168 CP->getOffset(), AArch64II::MO_G1 | MO_NC), 4169 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4170 CP->getOffset(), AArch64II::MO_G0 | MO_NC)); 4171 } else { 4172 // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on 4173 // ELF, the only valid one on Darwin. 4174 SDValue Hi = 4175 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4176 CP->getOffset(), AArch64II::MO_PAGE); 4177 SDValue Lo = DAG.getTargetConstantPool( 4178 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 4179 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4180 4181 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4182 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4183 } 4184 } 4185 4186 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 4187 SelectionDAG &DAG) const { 4188 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 4189 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4190 SDLoc DL(Op); 4191 if (getTargetMachine().getCodeModel() == CodeModel::Large && 4192 !Subtarget->isTargetMachO()) { 4193 const unsigned char MO_NC = AArch64II::MO_NC; 4194 return DAG.getNode( 4195 AArch64ISD::WrapperLarge, DL, PtrVT, 4196 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), 4197 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), 4198 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), 4199 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); 4200 } else { 4201 SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); 4202 SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | 4203 AArch64II::MO_NC); 4204 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4205 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4206 } 4207 } 4208 4209 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 4210 SelectionDAG &DAG) const { 4211 AArch64FunctionInfo *FuncInfo = 4212 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 4213 4214 SDLoc DL(Op); 4215 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), 4216 getPointerTy(DAG.getDataLayout())); 4217 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4218 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 4219 MachinePointerInfo(SV), false, false, 0); 4220 } 4221 4222 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 4223 SelectionDAG &DAG) const { 4224 // The layout of the va_list struct is specified in the AArch64 Procedure Call 4225 // Standard, section B.3. 
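  // In C terms the layout stored below is:
  //   struct {
  //     void *__stack;    // offset 0:  next stacked argument
  //     void *__gr_top;   // offset 8:  end of the GP register save area
  //     void *__vr_top;   // offset 16: end of the FP/SIMD register save area
  //     int   __gr_offs;  // offset 24: negative offset to the next GP arg
  //     int   __vr_offs;  // offset 28: negative offset to the next FP/SIMD arg
  //   };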
4226 MachineFunction &MF = DAG.getMachineFunction(); 4227 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 4228 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4229 SDLoc DL(Op); 4230 4231 SDValue Chain = Op.getOperand(0); 4232 SDValue VAList = Op.getOperand(1); 4233 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4234 SmallVector<SDValue, 4> MemOps; 4235 4236 // void *__stack at offset 0 4237 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 4238 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 4239 MachinePointerInfo(SV), false, false, 8)); 4240 4241 // void *__gr_top at offset 8 4242 int GPRSize = FuncInfo->getVarArgsGPRSize(); 4243 if (GPRSize > 0) { 4244 SDValue GRTop, GRTopAddr; 4245 4246 GRTopAddr = 4247 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); 4248 4249 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 4250 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 4251 DAG.getConstant(GPRSize, DL, PtrVT)); 4252 4253 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 4254 MachinePointerInfo(SV, 8), false, false, 8)); 4255 } 4256 4257 // void *__vr_top at offset 16 4258 int FPRSize = FuncInfo->getVarArgsFPRSize(); 4259 if (FPRSize > 0) { 4260 SDValue VRTop, VRTopAddr; 4261 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4262 DAG.getConstant(16, DL, PtrVT)); 4263 4264 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 4265 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 4266 DAG.getConstant(FPRSize, DL, PtrVT)); 4267 4268 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 4269 MachinePointerInfo(SV, 16), false, false, 8)); 4270 } 4271 4272 // int __gr_offs at offset 24 4273 SDValue GROffsAddr = 4274 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); 4275 MemOps.push_back(DAG.getStore(Chain, DL, 4276 DAG.getConstant(-GPRSize, DL, MVT::i32), 4277 GROffsAddr, MachinePointerInfo(SV, 24), false, 4278 false, 4)); 4279 4280 // int __vr_offs at offset 28 4281 SDValue VROffsAddr = 4282 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); 4283 MemOps.push_back(DAG.getStore(Chain, DL, 4284 DAG.getConstant(-FPRSize, DL, MVT::i32), 4285 VROffsAddr, MachinePointerInfo(SV, 28), false, 4286 false, 4)); 4287 4288 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 4289 } 4290 4291 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 4292 SelectionDAG &DAG) const { 4293 return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) 4294 : LowerAAPCS_VASTART(Op, DAG); 4295 } 4296 4297 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 4298 SelectionDAG &DAG) const { 4299 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 4300 // pointer. 4301 SDLoc DL(Op); 4302 unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; 4303 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 4304 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 4305 4306 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), 4307 Op.getOperand(2), 4308 DAG.getConstant(VaListSize, DL, MVT::i32), 4309 8, false, false, false, MachinePointerInfo(DestSV), 4310 MachinePointerInfo(SrcSV)); 4311 } 4312 4313 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 4314 assert(Subtarget->isTargetDarwin() && 4315 "automatic va_arg instruction only works on Darwin"); 4316 4317 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4318 EVT VT = Op.getValueType(); 4319 SDLoc DL(Op); 4320 SDValue Chain = Op.getOperand(0); 4321 SDValue Addr = Op.getOperand(1); 4322 unsigned Align = Op.getConstantOperandVal(3); 4323 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4324 4325 SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), 4326 false, false, false, 0); 4327 Chain = VAList.getValue(1); 4328 4329 if (Align > 8) { 4330 assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); 4331 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4332 DAG.getConstant(Align - 1, DL, PtrVT)); 4333 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 4334 DAG.getConstant(-(int64_t)Align, DL, PtrVT)); 4335 } 4336 4337 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 4338 uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 4339 4340 // Scalar integer and FP values smaller than 64 bits are implicitly extended 4341 // up to 64 bits. At the very least, we have to increase the striding of the 4342 // vaargs list to match this, and for FP values we need to introduce 4343 // FP_ROUND nodes as well. 4344 if (VT.isInteger() && !VT.isVector()) 4345 ArgSize = 8; 4346 bool NeedFPTrunc = false; 4347 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 4348 ArgSize = 8; 4349 NeedFPTrunc = true; 4350 } 4351 4352 // Increment the pointer, VAList, to the next vaarg 4353 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4354 DAG.getConstant(ArgSize, DL, PtrVT)); 4355 // Store the incremented VAList to the legalized pointer 4356 SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), 4357 false, false, 0); 4358 4359 // Load the actual argument out of the pointer VAList 4360 if (NeedFPTrunc) { 4361 // Load the value as an f64. 4362 SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, 4363 MachinePointerInfo(), false, false, false, 0); 4364 // Round the value down to an f32. 4365 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 4366 DAG.getIntPtrConstant(1, DL)); 4367 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 4368 // Merge the rounded value with the chain output of the load. 
4369 return DAG.getMergeValues(Ops, DL);
4370 }
4371
4372 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
4373 false, false, 0);
4374 }
4375
4376 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
4377 SelectionDAG &DAG) const {
4378 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4379 MFI->setFrameAddressIsTaken(true);
4380
4381 EVT VT = Op.getValueType();
4382 SDLoc DL(Op);
4383 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4384 SDValue FrameAddr =
4385 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
4386 while (Depth--)
4387 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
4388 MachinePointerInfo(), false, false, false, 0);
4389 return FrameAddr;
4390 }
4391
4392 // FIXME? Maybe this could be a TableGen attribute on some registers and
4393 // this table could be generated automatically from RegInfo.
4394 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
4395 SelectionDAG &DAG) const {
4396 unsigned Reg = StringSwitch<unsigned>(RegName)
4397 .Case("sp", AArch64::SP)
4398 .Default(0);
4399 if (Reg)
4400 return Reg;
4401 report_fatal_error(Twine("Invalid register name \""
4402 + StringRef(RegName) + "\"."));
4403 }
4404
4405 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
4406 SelectionDAG &DAG) const {
4407 MachineFunction &MF = DAG.getMachineFunction();
4408 MachineFrameInfo *MFI = MF.getFrameInfo();
4409 MFI->setReturnAddressIsTaken(true);
4410
4411 EVT VT = Op.getValueType();
4412 SDLoc DL(Op);
4413 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4414 if (Depth) {
4415 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
4416 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
4417 return DAG.getLoad(VT, DL, DAG.getEntryNode(),
4418 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
4419 MachinePointerInfo(), false, false, false, 0);
4420 }
4421
4422 // Return LR, which contains the return address. Mark it an implicit live-in.
4423 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
4424 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
4425 }
4426
4427 /// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
4428 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
4429 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
4430 SelectionDAG &DAG) const {
4431 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4432 EVT VT = Op.getValueType();
4433 unsigned VTBits = VT.getSizeInBits();
4434 SDLoc dl(Op);
4435 SDValue ShOpLo = Op.getOperand(0);
4436 SDValue ShOpHi = Op.getOperand(1);
4437 SDValue ShAmt = Op.getOperand(2);
4438 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
4439
4440 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
4441
4442 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
4443 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
4444 SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
4445
4446 // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
4447 // is "undef". We wanted 0, so CSEL it directly.
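// (Illustrative note: when ShAmt == 0 the low result must simply be ShOpLo, so
// the high-bits contribution that gets ORed in below has to be forced to zero
// rather than left as the undefined result of a shift by the full width.)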
4448 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
4449 ISD::SETEQ, dl, DAG);
4450 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
4451 HiBitsForLo =
4452 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
4453 HiBitsForLo, CCVal, Cmp);
4454
4455 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
4456 DAG.getConstant(VTBits, dl, MVT::i64));
4457
4458 SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
4459 SDValue LoForNormalShift =
4460 DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
4461
4462 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
4463 dl, DAG);
4464 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
4465 SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
4466 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
4467 LoForNormalShift, CCVal, Cmp);
4468
4469 // AArch64 shifts larger than the register width are wrapped rather than
4470 // clamped, so we can't just emit "hi >> x".
4471 SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
4472 SDValue HiForBigShift =
4473 Opc == ISD::SRA
4474 ? DAG.getNode(Opc, dl, VT, ShOpHi,
4475 DAG.getConstant(VTBits - 1, dl, MVT::i64))
4476 : DAG.getConstant(0, dl, VT);
4477 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
4478 HiForNormalShift, CCVal, Cmp);
4479
4480 SDValue Ops[2] = { Lo, Hi };
4481 return DAG.getMergeValues(Ops, dl);
4482 }
4483
4484
4485 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
4486 /// i64 values and takes a 2 x i64 value to shift plus a shift amount.
4487 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
4488 SelectionDAG &DAG) const {
4489 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4490 EVT VT = Op.getValueType();
4491 unsigned VTBits = VT.getSizeInBits();
4492 SDLoc dl(Op);
4493 SDValue ShOpLo = Op.getOperand(0);
4494 SDValue ShOpHi = Op.getOperand(1);
4495 SDValue ShAmt = Op.getOperand(2);
4496
4497 assert(Op.getOpcode() == ISD::SHL_PARTS);
4498 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
4499 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
4500 SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
4501
4502 // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
4503 // is "undef". We wanted 0, so CSEL it directly.
4504 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
4505 ISD::SETEQ, dl, DAG);
4506 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
4507 LoBitsForHi =
4508 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
4509 LoBitsForHi, CCVal, Cmp);
4510
4511 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
4512 DAG.getConstant(VTBits, dl, MVT::i64));
4513 SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
4514 SDValue HiForNormalShift =
4515 DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
4516
4517 SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
4518
4519 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
4520 dl, DAG);
4521 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
4522 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
4523 HiForNormalShift, CCVal, Cmp);
4524
4525 // AArch64 shifts larger than the register width are wrapped rather than
4526 // clamped, so we can't just emit "lo << a" if a is too big.
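// (For example, AArch64 variable shifts use the amount modulo the register
// width, so shifting "lo" left by exactly 64 would leave it unchanged instead
// of producing the zero that SHL_PARTS requires; hence the explicit constant
// below.)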
4527 SDValue LoForBigShift = DAG.getConstant(0, dl, VT); 4528 SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4529 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 4530 LoForNormalShift, CCVal, Cmp); 4531 4532 SDValue Ops[2] = { Lo, Hi }; 4533 return DAG.getMergeValues(Ops, dl); 4534 } 4535 4536 bool AArch64TargetLowering::isOffsetFoldingLegal( 4537 const GlobalAddressSDNode *GA) const { 4538 // The AArch64 target doesn't support folding offsets into global addresses. 4539 return false; 4540 } 4541 4542 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 4543 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. 4544 // FIXME: We should be able to handle f128 as well with a clever lowering. 4545 if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) 4546 return true; 4547 4548 if (VT == MVT::f64) 4549 return AArch64_AM::getFP64Imm(Imm) != -1; 4550 else if (VT == MVT::f32) 4551 return AArch64_AM::getFP32Imm(Imm) != -1; 4552 return false; 4553 } 4554 4555 //===----------------------------------------------------------------------===// 4556 // AArch64 Optimization Hooks 4557 //===----------------------------------------------------------------------===// 4558 4559 /// getEstimate - Return the appropriate estimate DAG for either the reciprocal 4560 /// or the reciprocal square root. 4561 static SDValue getEstimate(const AArch64Subtarget &ST, 4562 const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, 4563 const SDValue &Operand, unsigned &ExtraSteps) { 4564 if (!ST.hasNEON()) 4565 return SDValue(); 4566 4567 EVT VT = Operand.getValueType(); 4568 4569 std::string RecipOp; 4570 RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt"; 4571 RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp; 4572 RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f"; 4573 4574 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; 4575 if (!Recips.isEnabled(RecipOp)) 4576 return SDValue(); 4577 4578 ExtraSteps = Recips.getRefinementSteps(RecipOp); 4579 return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); 4580 } 4581 4582 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, 4583 DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { 4584 return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); 4585 } 4586 4587 SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, 4588 DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { 4589 UseOneConst = true; 4590 return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); 4591 } 4592 4593 //===----------------------------------------------------------------------===// 4594 // AArch64 Inline Assembly Support 4595 //===----------------------------------------------------------------------===// 4596 4597 // Table of Constraints 4598 // TODO: This is the current set of constraints supported by ARM for the 4599 // compiler, not all of them may make sense, e.g. S may be difficult to support. 
4600 // 4601 // r - A general register 4602 // w - An FP/SIMD register of some size in the range v0-v31 4603 // x - An FP/SIMD register of some size in the range v0-v15 4604 // I - Constant that can be used with an ADD instruction 4605 // J - Constant that can be used with a SUB instruction 4606 // K - Constant that can be used with a 32-bit logical instruction 4607 // L - Constant that can be used with a 64-bit logical instruction 4608 // M - Constant that can be used as a 32-bit MOV immediate 4609 // N - Constant that can be used as a 64-bit MOV immediate 4610 // Q - A memory reference with base register and no offset 4611 // S - A symbolic address 4612 // Y - Floating point constant zero 4613 // Z - Integer constant zero 4614 // 4615 // Note that general register operands will be output using their 64-bit x 4616 // register name, whatever the size of the variable, unless the asm operand 4617 // is prefixed by the %w modifier. Floating-point and SIMD register operands 4618 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 4619 // %q modifier. 4620 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { 4621 // At this point, we have to lower this constraint to something else, so we 4622 // lower it to an "r" or "w". However, by doing this we will force the result 4623 // to be in register, while the X constraint is much more permissive. 4624 // 4625 // Although we are correct (we are free to emit anything, without 4626 // constraints), we might break use cases that would expect us to be more 4627 // efficient and emit something else. 4628 if (!Subtarget->hasFPARMv8()) 4629 return "r"; 4630 4631 if (ConstraintVT.isFloatingPoint()) 4632 return "w"; 4633 4634 if (ConstraintVT.isVector() && 4635 (ConstraintVT.getSizeInBits() == 64 || 4636 ConstraintVT.getSizeInBits() == 128)) 4637 return "w"; 4638 4639 return "r"; 4640 } 4641 4642 /// getConstraintType - Given a constraint letter, return the type of 4643 /// constraint it is for this target. 4644 AArch64TargetLowering::ConstraintType 4645 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 4646 if (Constraint.size() == 1) { 4647 switch (Constraint[0]) { 4648 default: 4649 break; 4650 case 'z': 4651 return C_Other; 4652 case 'x': 4653 case 'w': 4654 return C_RegisterClass; 4655 // An address with a single base register. Due to the way we 4656 // currently handle addresses it is the same as 'r'. 4657 case 'Q': 4658 return C_Memory; 4659 } 4660 } 4661 return TargetLowering::getConstraintType(Constraint); 4662 } 4663 4664 /// Examine constraint type and operand type and determine a weight value. 4665 /// This object must already have been set up with the operand type 4666 /// and the current alternative constraint selected. 4667 TargetLowering::ConstraintWeight 4668 AArch64TargetLowering::getSingleConstraintMatchWeight( 4669 AsmOperandInfo &info, const char *constraint) const { 4670 ConstraintWeight weight = CW_Invalid; 4671 Value *CallOperandVal = info.CallOperandVal; 4672 // If we don't have a value, we can't do a match, 4673 // but allow it at the lowest weight. 4674 if (!CallOperandVal) 4675 return CW_Default; 4676 Type *type = CallOperandVal->getType(); 4677 // Look at the constraint type. 
4678 switch (*constraint) { 4679 default: 4680 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 4681 break; 4682 case 'x': 4683 case 'w': 4684 if (type->isFloatingPointTy() || type->isVectorTy()) 4685 weight = CW_Register; 4686 break; 4687 case 'z': 4688 weight = CW_Constant; 4689 break; 4690 } 4691 return weight; 4692 } 4693 4694 std::pair<unsigned, const TargetRegisterClass *> 4695 AArch64TargetLowering::getRegForInlineAsmConstraint( 4696 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 4697 if (Constraint.size() == 1) { 4698 switch (Constraint[0]) { 4699 case 'r': 4700 if (VT.getSizeInBits() == 64) 4701 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 4702 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 4703 case 'w': 4704 if (VT.getSizeInBits() == 32) 4705 return std::make_pair(0U, &AArch64::FPR32RegClass); 4706 if (VT.getSizeInBits() == 64) 4707 return std::make_pair(0U, &AArch64::FPR64RegClass); 4708 if (VT.getSizeInBits() == 128) 4709 return std::make_pair(0U, &AArch64::FPR128RegClass); 4710 break; 4711 // The instructions that this constraint is designed for can 4712 // only take 128-bit registers so just use that regclass. 4713 case 'x': 4714 if (VT.getSizeInBits() == 128) 4715 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 4716 break; 4717 } 4718 } 4719 if (StringRef("{cc}").equals_lower(Constraint)) 4720 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 4721 4722 // Use the default implementation in TargetLowering to convert the register 4723 // constraint into a member of a register class. 4724 std::pair<unsigned, const TargetRegisterClass *> Res; 4725 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4726 4727 // Not found as a standard register? 4728 if (!Res.second) { 4729 unsigned Size = Constraint.size(); 4730 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 4731 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 4732 int RegNo; 4733 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); 4734 if (!Failed && RegNo >= 0 && RegNo <= 31) { 4735 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. 4736 // By default we'll emit v0-v31 for this unless there's a modifier where 4737 // we'll emit the correct register as well. 4738 if (VT != MVT::Other && VT.getSizeInBits() == 64) { 4739 Res.first = AArch64::FPR64RegClass.getRegister(RegNo); 4740 Res.second = &AArch64::FPR64RegClass; 4741 } else { 4742 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 4743 Res.second = &AArch64::FPR128RegClass; 4744 } 4745 } 4746 } 4747 } 4748 4749 return Res; 4750 } 4751 4752 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 4753 /// vector. If it is invalid, don't add anything to Ops. 4754 void AArch64TargetLowering::LowerAsmOperandForConstraint( 4755 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 4756 SelectionDAG &DAG) const { 4757 SDValue Result; 4758 4759 // Currently only support length 1 constraints. 4760 if (Constraint.length() != 1) 4761 return; 4762 4763 char ConstraintLetter = Constraint[0]; 4764 switch (ConstraintLetter) { 4765 default: 4766 break; 4767 4768 // This set of constraints deal with valid constants for various instructions. 4769 // Validate and return a target constant for them if we can. 4770 case 'z': { 4771 // 'z' maps to xzr or wzr so it needs an input of 0. 
4772 if (!isNullConstant(Op)) 4773 return; 4774 4775 if (Op.getValueType() == MVT::i64) 4776 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 4777 else 4778 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 4779 break; 4780 } 4781 4782 case 'I': 4783 case 'J': 4784 case 'K': 4785 case 'L': 4786 case 'M': 4787 case 'N': 4788 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4789 if (!C) 4790 return; 4791 4792 // Grab the value and do some validation. 4793 uint64_t CVal = C->getZExtValue(); 4794 switch (ConstraintLetter) { 4795 // The I constraint applies only to simple ADD or SUB immediate operands: 4796 // i.e. 0 to 4095 with optional shift by 12 4797 // The J constraint applies only to ADD or SUB immediates that would be 4798 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 4799 // instruction [or vice versa], in other words -1 to -4095 with optional 4800 // left shift by 12. 4801 case 'I': 4802 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 4803 break; 4804 return; 4805 case 'J': { 4806 uint64_t NVal = -C->getSExtValue(); 4807 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 4808 CVal = C->getSExtValue(); 4809 break; 4810 } 4811 return; 4812 } 4813 // The K and L constraints apply *only* to logical immediates, including 4814 // what used to be the MOVI alias for ORR (though the MOVI alias has now 4815 // been removed and MOV should be used). So these constraints have to 4816 // distinguish between bit patterns that are valid 32-bit or 64-bit 4817 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 4818 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 4819 // versa. 4820 case 'K': 4821 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4822 break; 4823 return; 4824 case 'L': 4825 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4826 break; 4827 return; 4828 // The M and N constraints are a superset of K and L respectively, for use 4829 // with the MOV (immediate) alias. As well as the logical immediates they 4830 // also match 32 or 64-bit immediates that can be loaded either using a 4831 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 4832 // (M) or 64-bit 0x1234000000000000 (N) etc. 4833 // As a note some of this code is liberally stolen from the asm parser. 4834 case 'M': { 4835 if (!isUInt<32>(CVal)) 4836 return; 4837 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4838 break; 4839 if ((CVal & 0xFFFF) == CVal) 4840 break; 4841 if ((CVal & 0xFFFF0000ULL) == CVal) 4842 break; 4843 uint64_t NCVal = ~(uint32_t)CVal; 4844 if ((NCVal & 0xFFFFULL) == NCVal) 4845 break; 4846 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4847 break; 4848 return; 4849 } 4850 case 'N': { 4851 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4852 break; 4853 if ((CVal & 0xFFFFULL) == CVal) 4854 break; 4855 if ((CVal & 0xFFFF0000ULL) == CVal) 4856 break; 4857 if ((CVal & 0xFFFF00000000ULL) == CVal) 4858 break; 4859 if ((CVal & 0xFFFF000000000000ULL) == CVal) 4860 break; 4861 uint64_t NCVal = ~CVal; 4862 if ((NCVal & 0xFFFFULL) == NCVal) 4863 break; 4864 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4865 break; 4866 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 4867 break; 4868 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 4869 break; 4870 return; 4871 } 4872 default: 4873 return; 4874 } 4875 4876 // All assembler immediates are 64-bit integers. 
4877 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); 4878 break; 4879 } 4880 4881 if (Result.getNode()) { 4882 Ops.push_back(Result); 4883 return; 4884 } 4885 4886 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 4887 } 4888 4889 //===----------------------------------------------------------------------===// 4890 // AArch64 Advanced SIMD Support 4891 //===----------------------------------------------------------------------===// 4892 4893 /// WidenVector - Given a value in the V64 register class, produce the 4894 /// equivalent value in the V128 register class. 4895 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 4896 EVT VT = V64Reg.getValueType(); 4897 unsigned NarrowSize = VT.getVectorNumElements(); 4898 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4899 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 4900 SDLoc DL(V64Reg); 4901 4902 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 4903 V64Reg, DAG.getConstant(0, DL, MVT::i32)); 4904 } 4905 4906 /// getExtFactor - Determine the adjustment factor for the position when 4907 /// generating an "extract from vector registers" instruction. 4908 static unsigned getExtFactor(SDValue &V) { 4909 EVT EltType = V.getValueType().getVectorElementType(); 4910 return EltType.getSizeInBits() / 8; 4911 } 4912 4913 /// NarrowVector - Given a value in the V128 register class, produce the 4914 /// equivalent value in the V64 register class. 4915 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 4916 EVT VT = V128Reg.getValueType(); 4917 unsigned WideSize = VT.getVectorNumElements(); 4918 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4919 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 4920 SDLoc DL(V128Reg); 4921 4922 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 4923 } 4924 4925 // Gather data to see if the operation can be modelled as a 4926 // shuffle in combination with VEXTs. 4927 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 4928 SelectionDAG &DAG) const { 4929 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 4930 SDLoc dl(Op); 4931 EVT VT = Op.getValueType(); 4932 unsigned NumElts = VT.getVectorNumElements(); 4933 4934 struct ShuffleSourceInfo { 4935 SDValue Vec; 4936 unsigned MinElt; 4937 unsigned MaxElt; 4938 4939 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 4940 // be compatible with the shuffle we intend to construct. As a result 4941 // ShuffleVec will be some sliding window into the original Vec. 4942 SDValue ShuffleVec; 4943 4944 // Code should guarantee that element i in Vec starts at element "WindowBase 4945 // + i * WindowScale in ShuffleVec". 4946 int WindowBase; 4947 int WindowScale; 4948 4949 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 4950 ShuffleSourceInfo(SDValue Vec) 4951 : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), 4952 WindowScale(1) {} 4953 }; 4954 4955 // First gather all vectors used as an immediate source for this BUILD_VECTOR 4956 // node. 4957 SmallVector<ShuffleSourceInfo, 2> Sources; 4958 for (unsigned i = 0; i < NumElts; ++i) { 4959 SDValue V = Op.getOperand(i); 4960 if (V.isUndef()) 4961 continue; 4962 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 4963 !isa<ConstantSDNode>(V.getOperand(1))) { 4964 // A shuffle can only come from building a vector from various 4965 // elements of other vectors, provided their indices are constant. 
4966 return SDValue(); 4967 } 4968 4969 // Add this element source to the list if it's not already there. 4970 SDValue SourceVec = V.getOperand(0); 4971 auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); 4972 if (Source == Sources.end()) 4973 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 4974 4975 // Update the minimum and maximum lane number seen. 4976 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 4977 Source->MinElt = std::min(Source->MinElt, EltNo); 4978 Source->MaxElt = std::max(Source->MaxElt, EltNo); 4979 } 4980 4981 // Currently only do something sane when at most two source vectors 4982 // are involved. 4983 if (Sources.size() > 2) 4984 return SDValue(); 4985 4986 // Find out the smallest element size among result and two sources, and use 4987 // it as element size to build the shuffle_vector. 4988 EVT SmallestEltTy = VT.getVectorElementType(); 4989 for (auto &Source : Sources) { 4990 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 4991 if (SrcEltTy.bitsLT(SmallestEltTy)) { 4992 SmallestEltTy = SrcEltTy; 4993 } 4994 } 4995 unsigned ResMultiplier = 4996 VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); 4997 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 4998 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 4999 5000 // If the source vector is too wide or too narrow, we may nevertheless be able 5001 // to construct a compatible shuffle either by concatenating it with UNDEF or 5002 // extracting a suitable range of elements. 5003 for (auto &Src : Sources) { 5004 EVT SrcVT = Src.ShuffleVec.getValueType(); 5005 5006 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 5007 continue; 5008 5009 // This stage of the search produces a source with the same element type as 5010 // the original, but with a total width matching the BUILD_VECTOR output. 5011 EVT EltVT = SrcVT.getVectorElementType(); 5012 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 5013 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 5014 5015 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 5016 assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); 5017 // We can pad out the smaller vector for free, so if it's part of a 5018 // shuffle... 
5019 Src.ShuffleVec = 5020 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 5021 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 5022 continue; 5023 } 5024 5025 assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); 5026 5027 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 5028 // Span too large for a VEXT to cope 5029 return SDValue(); 5030 } 5031 5032 if (Src.MinElt >= NumSrcElts) { 5033 // The extraction can just take the second half 5034 Src.ShuffleVec = 5035 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5036 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 5037 Src.WindowBase = -NumSrcElts; 5038 } else if (Src.MaxElt < NumSrcElts) { 5039 // The extraction can just take the first half 5040 Src.ShuffleVec = 5041 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5042 DAG.getConstant(0, dl, MVT::i64)); 5043 } else { 5044 // An actual VEXT is needed 5045 SDValue VEXTSrc1 = 5046 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5047 DAG.getConstant(0, dl, MVT::i64)); 5048 SDValue VEXTSrc2 = 5049 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5050 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 5051 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 5052 5053 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 5054 VEXTSrc2, 5055 DAG.getConstant(Imm, dl, MVT::i32)); 5056 Src.WindowBase = -Src.MinElt; 5057 } 5058 } 5059 5060 // Another possible incompatibility occurs from the vector element types. We 5061 // can fix this by bitcasting the source vectors to the same type we intend 5062 // for the shuffle. 5063 for (auto &Src : Sources) { 5064 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 5065 if (SrcEltTy == SmallestEltTy) 5066 continue; 5067 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 5068 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 5069 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 5070 Src.WindowBase *= Src.WindowScale; 5071 } 5072 5073 // Final sanity check before we try to actually produce a shuffle. 5074 DEBUG( 5075 for (auto Src : Sources) 5076 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 5077 ); 5078 5079 // The stars all align, our next step is to produce the mask for the shuffle. 5080 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 5081 int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); 5082 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 5083 SDValue Entry = Op.getOperand(i); 5084 if (Entry.isUndef()) 5085 continue; 5086 5087 auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); 5088 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 5089 5090 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 5091 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 5092 // segment. 5093 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 5094 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 5095 VT.getVectorElementType().getSizeInBits()); 5096 int LanesDefined = BitsDefined / BitsPerShuffleLane; 5097 5098 // This source is expected to fill ResMultiplier lanes of the final shuffle, 5099 // starting at the appropriate offset. 
5100 int *LaneMask = &Mask[i * ResMultiplier];
5101
5102 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
5103 ExtractBase += NumElts * (Src - Sources.begin());
5104 for (int j = 0; j < LanesDefined; ++j)
5105 LaneMask[j] = ExtractBase + j;
5106 }
5107
5108 // Final check before we try to produce nonsense...
5109 if (!isShuffleMaskLegal(Mask, ShuffleVT))
5110 return SDValue();
5111
5112 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
5113 for (unsigned i = 0; i < Sources.size(); ++i)
5114 ShuffleOps[i] = Sources[i].ShuffleVec;
5115
5116 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
5117 ShuffleOps[1], Mask);
5118 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
5119 }
5120
5121 // Check if an EXT instruction can handle the shuffle mask when the
5122 // vector sources of the shuffle are the same.
5123 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5124 unsigned NumElts = VT.getVectorNumElements();
5125
5126 // Assume that the first shuffle index is not UNDEF. Fail if it is.
5127 if (M[0] < 0)
5128 return false;
5129
5130 Imm = M[0];
5131
5132 // If this is a VEXT shuffle, the immediate value is the index of the first
5133 // element. The other shuffle indices must be the successive elements after
5134 // the first one.
5135 unsigned ExpectedElt = Imm;
5136 for (unsigned i = 1; i < NumElts; ++i) {
5137 // Increment the expected index. If it wraps around, just follow it
5138 // back to index zero and keep going.
5139 ++ExpectedElt;
5140 if (ExpectedElt == NumElts)
5141 ExpectedElt = 0;
5142
5143 if (M[i] < 0)
5144 continue; // ignore UNDEF indices
5145 if (ExpectedElt != static_cast<unsigned>(M[i]))
5146 return false;
5147 }
5148
5149 return true;
5150 }
5151
5152 // Check if an EXT instruction can handle the shuffle mask when the
5153 // vector sources of the shuffle are different.
5154 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
5155 unsigned &Imm) {
5156 // Look for the first non-undef element.
5157 const int *FirstRealElt = std::find_if(M.begin(), M.end(),
5158 [](int Elt) {return Elt >= 0;});
5159
5160 // Benefit from APInt to handle overflow when calculating the expected element.
5161 unsigned NumElts = VT.getVectorNumElements();
5162 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
5163 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
5164 // The following shuffle indices must be the successive elements after the
5165 // first real element.
5166 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
5167 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
5168 if (FirstWrongElt != M.end())
5169 return false;
5170
5171 // The index of an EXT is the first element if it is not UNDEF.
5172 // Watch out for the beginning UNDEFs. The EXT index should be the expected
5173 // value of the first element. E.g.
5174 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
5175 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
5176 // ExpectedElt is the last mask index plus 1.
5177 Imm = ExpectedElt.getZExtValue();
5178
5179 // There are two different cases that require reversing the input vectors.
5180 // For example, for vector <4 x i32> we have the following cases,
5181 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
5182 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
5183 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
5184 // the two input vectors to be reversed.
5185 if (Imm < NumElts) 5186 ReverseEXT = true; 5187 else 5188 Imm -= NumElts; 5189 5190 return true; 5191 } 5192 5193 /// isREVMask - Check if a vector shuffle corresponds to a REV 5194 /// instruction with the specified blocksize. (The order of the elements 5195 /// within each block of the vector is reversed.) 5196 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 5197 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && 5198 "Only possible block sizes for REV are: 16, 32, 64"); 5199 5200 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5201 if (EltSz == 64) 5202 return false; 5203 5204 unsigned NumElts = VT.getVectorNumElements(); 5205 unsigned BlockElts = M[0] + 1; 5206 // If the first shuffle index is UNDEF, be optimistic. 5207 if (M[0] < 0) 5208 BlockElts = BlockSize / EltSz; 5209 5210 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 5211 return false; 5212 5213 for (unsigned i = 0; i < NumElts; ++i) { 5214 if (M[i] < 0) 5215 continue; // ignore UNDEF indices 5216 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) 5217 return false; 5218 } 5219 5220 return true; 5221 } 5222 5223 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5224 unsigned NumElts = VT.getVectorNumElements(); 5225 WhichResult = (M[0] == 0 ? 0 : 1); 5226 unsigned Idx = WhichResult * NumElts / 2; 5227 for (unsigned i = 0; i != NumElts; i += 2) { 5228 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 5229 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) 5230 return false; 5231 Idx += 1; 5232 } 5233 5234 return true; 5235 } 5236 5237 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5238 unsigned NumElts = VT.getVectorNumElements(); 5239 WhichResult = (M[0] == 0 ? 0 : 1); 5240 for (unsigned i = 0; i != NumElts; ++i) { 5241 if (M[i] < 0) 5242 continue; // ignore UNDEF indices 5243 if ((unsigned)M[i] != 2 * i + WhichResult) 5244 return false; 5245 } 5246 5247 return true; 5248 } 5249 5250 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5251 unsigned NumElts = VT.getVectorNumElements(); 5252 WhichResult = (M[0] == 0 ? 0 : 1); 5253 for (unsigned i = 0; i < NumElts; i += 2) { 5254 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 5255 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) 5256 return false; 5257 } 5258 return true; 5259 } 5260 5261 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of 5262 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5263 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 5264 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5265 unsigned NumElts = VT.getVectorNumElements(); 5266 WhichResult = (M[0] == 0 ? 0 : 1); 5267 unsigned Idx = WhichResult * NumElts / 2; 5268 for (unsigned i = 0; i != NumElts; i += 2) { 5269 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 5270 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 5271 return false; 5272 Idx += 1; 5273 } 5274 5275 return true; 5276 } 5277 5278 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 5279 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5280 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 5281 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5282 unsigned Half = VT.getVectorNumElements() / 2; 5283 WhichResult = (M[0] == 0 ? 
0 : 1); 5284 for (unsigned j = 0; j != 2; ++j) { 5285 unsigned Idx = WhichResult; 5286 for (unsigned i = 0; i != Half; ++i) { 5287 int MIdx = M[i + j * Half]; 5288 if (MIdx >= 0 && (unsigned)MIdx != Idx) 5289 return false; 5290 Idx += 2; 5291 } 5292 } 5293 5294 return true; 5295 } 5296 5297 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 5298 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5299 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 5300 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5301 unsigned NumElts = VT.getVectorNumElements(); 5302 WhichResult = (M[0] == 0 ? 0 : 1); 5303 for (unsigned i = 0; i < NumElts; i += 2) { 5304 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 5305 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 5306 return false; 5307 } 5308 return true; 5309 } 5310 5311 static bool isINSMask(ArrayRef<int> M, int NumInputElements, 5312 bool &DstIsLeft, int &Anomaly) { 5313 if (M.size() != static_cast<size_t>(NumInputElements)) 5314 return false; 5315 5316 int NumLHSMatch = 0, NumRHSMatch = 0; 5317 int LastLHSMismatch = -1, LastRHSMismatch = -1; 5318 5319 for (int i = 0; i < NumInputElements; ++i) { 5320 if (M[i] == -1) { 5321 ++NumLHSMatch; 5322 ++NumRHSMatch; 5323 continue; 5324 } 5325 5326 if (M[i] == i) 5327 ++NumLHSMatch; 5328 else 5329 LastLHSMismatch = i; 5330 5331 if (M[i] == i + NumInputElements) 5332 ++NumRHSMatch; 5333 else 5334 LastRHSMismatch = i; 5335 } 5336 5337 if (NumLHSMatch == NumInputElements - 1) { 5338 DstIsLeft = true; 5339 Anomaly = LastLHSMismatch; 5340 return true; 5341 } else if (NumRHSMatch == NumInputElements - 1) { 5342 DstIsLeft = false; 5343 Anomaly = LastRHSMismatch; 5344 return true; 5345 } 5346 5347 return false; 5348 } 5349 5350 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 5351 if (VT.getSizeInBits() != 128) 5352 return false; 5353 5354 unsigned NumElts = VT.getVectorNumElements(); 5355 5356 for (int I = 0, E = NumElts / 2; I != E; I++) { 5357 if (Mask[I] != I) 5358 return false; 5359 } 5360 5361 int Offset = NumElts / 2; 5362 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 5363 if (Mask[I] != I + SplitLHS * Offset) 5364 return false; 5365 } 5366 5367 return true; 5368 } 5369 5370 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 5371 SDLoc DL(Op); 5372 EVT VT = Op.getValueType(); 5373 SDValue V0 = Op.getOperand(0); 5374 SDValue V1 = Op.getOperand(1); 5375 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 5376 5377 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 5378 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 5379 return SDValue(); 5380 5381 bool SplitV0 = V0.getValueType().getSizeInBits() == 128; 5382 5383 if (!isConcatMask(Mask, VT, SplitV0)) 5384 return SDValue(); 5385 5386 EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 5387 VT.getVectorNumElements() / 2); 5388 if (SplitV0) { 5389 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, 5390 DAG.getConstant(0, DL, MVT::i64)); 5391 } 5392 if (V1.getValueType().getSizeInBits() == 128) { 5393 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, 5394 DAG.getConstant(0, DL, MVT::i64)); 5395 } 5396 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); 5397 } 5398 5399 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 5400 /// the specified operations to build the shuffle. 
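/// A note on the encoding, inferred from the decoding logic below: each 32-bit
/// table entry packs a cost in bits [31:30], the operation number in bits
/// [29:26], and the IDs of the left- and right-hand sub-shuffles in bits
/// [25:13] and [12:0] respectively.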
5401 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 5402 SDValue RHS, SelectionDAG &DAG, 5403 const SDLoc &dl) { 5404 unsigned OpNum = (PFEntry >> 26) & 0x0F; 5405 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); 5406 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); 5407 5408 enum { 5409 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 5410 OP_VREV, 5411 OP_VDUP0, 5412 OP_VDUP1, 5413 OP_VDUP2, 5414 OP_VDUP3, 5415 OP_VEXT1, 5416 OP_VEXT2, 5417 OP_VEXT3, 5418 OP_VUZPL, // VUZP, left result 5419 OP_VUZPR, // VUZP, right result 5420 OP_VZIPL, // VZIP, left result 5421 OP_VZIPR, // VZIP, right result 5422 OP_VTRNL, // VTRN, left result 5423 OP_VTRNR // VTRN, right result 5424 }; 5425 5426 if (OpNum == OP_COPY) { 5427 if (LHSID == (1 * 9 + 2) * 9 + 3) 5428 return LHS; 5429 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); 5430 return RHS; 5431 } 5432 5433 SDValue OpLHS, OpRHS; 5434 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 5435 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 5436 EVT VT = OpLHS.getValueType(); 5437 5438 switch (OpNum) { 5439 default: 5440 llvm_unreachable("Unknown shuffle opcode!"); 5441 case OP_VREV: 5442 // VREV divides the vector in half and swaps within the half. 5443 if (VT.getVectorElementType() == MVT::i32 || 5444 VT.getVectorElementType() == MVT::f32) 5445 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); 5446 // vrev <4 x i16> -> REV32 5447 if (VT.getVectorElementType() == MVT::i16 || 5448 VT.getVectorElementType() == MVT::f16) 5449 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); 5450 // vrev <4 x i8> -> REV16 5451 assert(VT.getVectorElementType() == MVT::i8); 5452 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); 5453 case OP_VDUP0: 5454 case OP_VDUP1: 5455 case OP_VDUP2: 5456 case OP_VDUP3: { 5457 EVT EltTy = VT.getVectorElementType(); 5458 unsigned Opcode; 5459 if (EltTy == MVT::i8) 5460 Opcode = AArch64ISD::DUPLANE8; 5461 else if (EltTy == MVT::i16 || EltTy == MVT::f16) 5462 Opcode = AArch64ISD::DUPLANE16; 5463 else if (EltTy == MVT::i32 || EltTy == MVT::f32) 5464 Opcode = AArch64ISD::DUPLANE32; 5465 else if (EltTy == MVT::i64 || EltTy == MVT::f64) 5466 Opcode = AArch64ISD::DUPLANE64; 5467 else 5468 llvm_unreachable("Invalid vector element type?"); 5469 5470 if (VT.getSizeInBits() == 64) 5471 OpLHS = WidenVector(OpLHS, DAG); 5472 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); 5473 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); 5474 } 5475 case OP_VEXT1: 5476 case OP_VEXT2: 5477 case OP_VEXT3: { 5478 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); 5479 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, 5480 DAG.getConstant(Imm, dl, MVT::i32)); 5481 } 5482 case OP_VUZPL: 5483 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, 5484 OpRHS); 5485 case OP_VUZPR: 5486 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, 5487 OpRHS); 5488 case OP_VZIPL: 5489 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, 5490 OpRHS); 5491 case OP_VZIPR: 5492 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 5493 OpRHS); 5494 case OP_VTRNL: 5495 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 5496 OpRHS); 5497 case OP_VTRNR: 5498 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 5499 OpRHS); 5500 } 5501 } 5502 5503 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> 
ShuffleMask, 5504 SelectionDAG &DAG) { 5505 // Check to see if we can use the TBL instruction. 5506 SDValue V1 = Op.getOperand(0); 5507 SDValue V2 = Op.getOperand(1); 5508 SDLoc DL(Op); 5509 5510 EVT EltVT = Op.getValueType().getVectorElementType(); 5511 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 5512 5513 SmallVector<SDValue, 8> TBLMask; 5514 for (int Val : ShuffleMask) { 5515 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5516 unsigned Offset = Byte + Val * BytesPerElt; 5517 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); 5518 } 5519 } 5520 5521 MVT IndexVT = MVT::v8i8; 5522 unsigned IndexLen = 8; 5523 if (Op.getValueType().getSizeInBits() == 128) { 5524 IndexVT = MVT::v16i8; 5525 IndexLen = 16; 5526 } 5527 5528 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 5529 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 5530 5531 SDValue Shuffle; 5532 if (V2.getNode()->isUndef()) { 5533 if (IndexLen == 8) 5534 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 5535 Shuffle = DAG.getNode( 5536 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5537 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 5538 DAG.getBuildVector(IndexVT, DL, 5539 makeArrayRef(TBLMask.data(), IndexLen))); 5540 } else { 5541 if (IndexLen == 8) { 5542 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 5543 Shuffle = DAG.getNode( 5544 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5545 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 5546 DAG.getBuildVector(IndexVT, DL, 5547 makeArrayRef(TBLMask.data(), IndexLen))); 5548 } else { 5549 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 5550 // cannot currently represent the register constraints on the input 5551 // table registers. 5552 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 5553 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], 5554 // IndexLen)); 5555 Shuffle = DAG.getNode( 5556 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5557 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, 5558 V2Cst, DAG.getBuildVector(IndexVT, DL, 5559 makeArrayRef(TBLMask.data(), IndexLen))); 5560 } 5561 } 5562 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 5563 } 5564 5565 static unsigned getDUPLANEOp(EVT EltType) { 5566 if (EltType == MVT::i8) 5567 return AArch64ISD::DUPLANE8; 5568 if (EltType == MVT::i16 || EltType == MVT::f16) 5569 return AArch64ISD::DUPLANE16; 5570 if (EltType == MVT::i32 || EltType == MVT::f32) 5571 return AArch64ISD::DUPLANE32; 5572 if (EltType == MVT::i64 || EltType == MVT::f64) 5573 return AArch64ISD::DUPLANE64; 5574 5575 llvm_unreachable("Invalid vector element type?"); 5576 } 5577 5578 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 5579 SelectionDAG &DAG) const { 5580 SDLoc dl(Op); 5581 EVT VT = Op.getValueType(); 5582 5583 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 5584 5585 // Convert shuffles that are directly supported on NEON to target-specific 5586 // DAG nodes, instead of keeping them as shuffles and matching them again 5587 // during code selection. This is more efficient and avoids the possibility 5588 // of inconsistencies between legalization and selection. 5589 ArrayRef<int> ShuffleMask = SVN->getMask(); 5590 5591 SDValue V1 = Op.getOperand(0); 5592 SDValue V2 = Op.getOperand(1); 5593 5594 if (SVN->isSplat()) { 5595 int Lane = SVN->getSplatIndex(); 5596 // If this is undef splat, generate it via "just" vdup, if possible. 
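// (A splat index of -1 corresponds to the all-undef case mentioned above;
// any lane would do, so lane 0 is used.)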
5597 if (Lane == -1)
5598 Lane = 0;
5599
5600 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
5601 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
5602 V1.getOperand(0));
5603 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
5604 // constant. If so, we can just reference the lane's definition directly.
5605 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
5606 !isa<ConstantSDNode>(V1.getOperand(Lane)))
5607 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
5608
5609 // Otherwise, duplicate from the lane of the input vector.
5610 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
5611
5612 // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
5613 // to make a vector of the same size as this SHUFFLE. We can ignore the
5614 // extract entirely, and canonicalise the concat using WidenVector.
5615 if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5616 Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
5617 V1 = V1.getOperand(0);
5618 } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
5619 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
5620 Lane -= Idx * VT.getVectorNumElements() / 2;
5621 V1 = WidenVector(V1.getOperand(Idx), DAG);
5622 } else if (VT.getSizeInBits() == 64)
5623 V1 = WidenVector(V1, DAG);
5624
5625 return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
5626 }
5627
5628 if (isREVMask(ShuffleMask, VT, 64))
5629 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
5630 if (isREVMask(ShuffleMask, VT, 32))
5631 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
5632 if (isREVMask(ShuffleMask, VT, 16))
5633 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
5634
5635 bool ReverseEXT = false;
5636 unsigned Imm;
5637 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
5638 if (ReverseEXT)
5639 std::swap(V1, V2);
5640 Imm *= getExtFactor(V1);
5641 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
5642 DAG.getConstant(Imm, dl, MVT::i32));
5643 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
5644 Imm *= getExtFactor(V1);
5645 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
5646 DAG.getConstant(Imm, dl, MVT::i32));
5647 }
5648
5649 unsigned WhichResult;
5650 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
5651 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
5652 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
5653 }
5654 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
5655 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
5656 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
5657 }
5658 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
5659 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
5660 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
5661 }
5662
5663 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
5664 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
5665 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
5666 }
5667 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
5668 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
5669 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
5670 }
5671 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
5672 unsigned Opc = (WhichResult == 0) ?
AArch64ISD::TRN1 : AArch64ISD::TRN2; 5673 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5674 } 5675 5676 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) 5677 return Concat; 5678 5679 bool DstIsLeft; 5680 int Anomaly; 5681 int NumInputElements = V1.getValueType().getVectorNumElements(); 5682 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { 5683 SDValue DstVec = DstIsLeft ? V1 : V2; 5684 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); 5685 5686 SDValue SrcVec = V1; 5687 int SrcLane = ShuffleMask[Anomaly]; 5688 if (SrcLane >= NumInputElements) { 5689 SrcVec = V2; 5690 SrcLane -= VT.getVectorNumElements(); 5691 } 5692 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); 5693 5694 EVT ScalarVT = VT.getVectorElementType(); 5695 5696 if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) 5697 ScalarVT = MVT::i32; 5698 5699 return DAG.getNode( 5700 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 5701 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), 5702 DstLaneV); 5703 } 5704 5705 // If the shuffle is not directly supported and it has 4 elements, use 5706 // the PerfectShuffle-generated table to synthesize it from other shuffles. 5707 unsigned NumElts = VT.getVectorNumElements(); 5708 if (NumElts == 4) { 5709 unsigned PFIndexes[4]; 5710 for (unsigned i = 0; i != 4; ++i) { 5711 if (ShuffleMask[i] < 0) 5712 PFIndexes[i] = 8; 5713 else 5714 PFIndexes[i] = ShuffleMask[i]; 5715 } 5716 5717 // Compute the index in the perfect shuffle table. 5718 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 5719 PFIndexes[2] * 9 + PFIndexes[3]; 5720 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 5721 unsigned Cost = (PFEntry >> 30); 5722 5723 if (Cost <= 4) 5724 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 5725 } 5726 5727 return GenerateTBL(Op, ShuffleMask, DAG); 5728 } 5729 5730 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, 5731 APInt &UndefBits) { 5732 EVT VT = BVN->getValueType(0); 5733 APInt SplatBits, SplatUndef; 5734 unsigned SplatBitSize; 5735 bool HasAnyUndefs; 5736 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 5737 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; 5738 5739 for (unsigned i = 0; i < NumSplats; ++i) { 5740 CnstBits <<= SplatBitSize; 5741 UndefBits <<= SplatBitSize; 5742 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); 5743 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); 5744 } 5745 5746 return true; 5747 } 5748 5749 return false; 5750 } 5751 5752 SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, 5753 SelectionDAG &DAG) const { 5754 BuildVectorSDNode *BVN = 5755 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 5756 SDValue LHS = Op.getOperand(0); 5757 SDLoc dl(Op); 5758 EVT VT = Op.getValueType(); 5759 5760 if (!BVN) 5761 return Op; 5762 5763 APInt CnstBits(VT.getSizeInBits(), 0); 5764 APInt UndefBits(VT.getSizeInBits(), 0); 5765 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 5766 // We only have BIC vector immediate instruction, which is and-not. 5767 CnstBits = ~CnstBits; 5768 5769 // We make use of a little bit of goto ickiness in order to avoid having to 5770 // duplicate the immediate matching logic for the undef toggled case. 
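// (Sketch of the control flow here: the first pass tries CnstBits as computed
// above; if no modified-immediate form matches, SecondTry is set, CnstBits is
// recomputed from the undef bits, and the block is re-entered once before
// falling through to FailedModImm.)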
5771 bool SecondTry = false; 5772 AttemptModImm: 5773 5774 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 5775 CnstBits = CnstBits.zextOrTrunc(64); 5776 uint64_t CnstVal = CnstBits.getZExtValue(); 5777 5778 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 5779 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 5780 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5781 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5782 DAG.getConstant(CnstVal, dl, MVT::i32), 5783 DAG.getConstant(0, dl, MVT::i32)); 5784 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5785 } 5786 5787 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 5788 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 5789 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5790 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5791 DAG.getConstant(CnstVal, dl, MVT::i32), 5792 DAG.getConstant(8, dl, MVT::i32)); 5793 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5794 } 5795 5796 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 5797 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 5798 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5799 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5800 DAG.getConstant(CnstVal, dl, MVT::i32), 5801 DAG.getConstant(16, dl, MVT::i32)); 5802 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5803 } 5804 5805 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 5806 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 5807 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5808 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5809 DAG.getConstant(CnstVal, dl, MVT::i32), 5810 DAG.getConstant(24, dl, MVT::i32)); 5811 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5812 } 5813 5814 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 5815 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 5816 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 5817 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5818 DAG.getConstant(CnstVal, dl, MVT::i32), 5819 DAG.getConstant(0, dl, MVT::i32)); 5820 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5821 } 5822 5823 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 5824 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 5825 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 5826 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5827 DAG.getConstant(CnstVal, dl, MVT::i32), 5828 DAG.getConstant(8, dl, MVT::i32)); 5829 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5830 } 5831 } 5832 5833 if (SecondTry) 5834 goto FailedModImm; 5835 SecondTry = true; 5836 CnstBits = ~UndefBits; 5837 goto AttemptModImm; 5838 } 5839 5840 // We can always fall back to a non-immediate AND. 
5841 FailedModImm:
5842   return Op;
5843 }
5844
5845 // Specialized code to quickly find if PotentialBVec is a BuildVector that
5846 // consists solely of the same constant integer value, which is returned in
5847 // the reference argument ConstVal.
5848 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
5849                                      uint64_t &ConstVal) {
5850   BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
5851   if (!Bvec)
5852     return false;
5853   ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
5854   if (!FirstElt)
5855     return false;
5856   EVT VT = Bvec->getValueType(0);
5857   unsigned NumElts = VT.getVectorNumElements();
5858   for (unsigned i = 1; i < NumElts; ++i)
5859     if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
5860       return false;
5861   ConstVal = FirstElt->getZExtValue();
5862   return true;
5863 }
5864
5865 static unsigned getIntrinsicID(const SDNode *N) {
5866   unsigned Opcode = N->getOpcode();
5867   switch (Opcode) {
5868   default:
5869     return Intrinsic::not_intrinsic;
5870   case ISD::INTRINSIC_WO_CHAIN: {
5871     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
5872     if (IID < Intrinsic::num_intrinsics)
5873       return IID;
5874     return Intrinsic::not_intrinsic;
5875   }
5876   }
5877 }
5878
5879 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
5880 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
5881 // BUILD_VECTOR with constant element C1, C2 is a constant, and C1 == ~C2.
5882 // Also, logical shift right -> sri, with the same structure.
5883 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
5884   EVT VT = N->getValueType(0);
5885
5886   if (!VT.isVector())
5887     return SDValue();
5888
5889   SDLoc DL(N);
5890
5891   // Is the first op an AND?
5892   const SDValue And = N->getOperand(0);
5893   if (And.getOpcode() != ISD::AND)
5894     return SDValue();
5895
5896   // Is the second op a shl or lshr?
5897   SDValue Shift = N->getOperand(1);
5898   // This will have been turned into: AArch64ISD::VSHL vector, #shift
5899   // or AArch64ISD::VLSHR vector, #shift
5900   unsigned ShiftOpc = Shift.getOpcode();
5901   if (ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)
5902     return SDValue();
5903   bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
5904
5905   // Is the shift amount constant?
5906   ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
5907   if (!C2node)
5908     return SDValue();
5909
5910   // Is the AND mask a constant splat vector?
5911   uint64_t C1;
5912   if (!isAllConstantBuildVector(And.getOperand(1), C1))
5913     return SDValue();
5914
5915   // Is C1 == ~C2, taking into account how much one can shift elements of a
5916   // particular size?
5917   uint64_t C2 = C2node->getZExtValue();
5918   unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
5919   if (C2 > ElemSizeInBits)
5920     return SDValue();
5921   uint64_t ElemMask = ElemSizeInBits == 64 ? ~0ULL : (1ULL << ElemSizeInBits) - 1; // avoid an undefined shift by the full width
5922   if ((C1 & ElemMask) != (~C2 & ElemMask))
5923     return SDValue();
5924
5925   SDValue X = And.getOperand(0);
5926   SDValue Y = Shift.getOperand(0);
5927
5928   unsigned Intrin =
5929       IsShiftRight ?
Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; 5930 SDValue ResultSLI = 5931 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 5932 DAG.getConstant(Intrin, DL, MVT::i32), X, Y, 5933 Shift.getOperand(1)); 5934 5935 DEBUG(dbgs() << "aarch64-lower: transformed: \n"); 5936 DEBUG(N->dump(&DAG)); 5937 DEBUG(dbgs() << "into: \n"); 5938 DEBUG(ResultSLI->dump(&DAG)); 5939 5940 ++NumShiftInserts; 5941 return ResultSLI; 5942 } 5943 5944 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, 5945 SelectionDAG &DAG) const { 5946 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) 5947 if (EnableAArch64SlrGeneration) { 5948 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) 5949 return Res; 5950 } 5951 5952 BuildVectorSDNode *BVN = 5953 dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); 5954 SDValue LHS = Op.getOperand(1); 5955 SDLoc dl(Op); 5956 EVT VT = Op.getValueType(); 5957 5958 // OR commutes, so try swapping the operands. 5959 if (!BVN) { 5960 LHS = Op.getOperand(0); 5961 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 5962 } 5963 if (!BVN) 5964 return Op; 5965 5966 APInt CnstBits(VT.getSizeInBits(), 0); 5967 APInt UndefBits(VT.getSizeInBits(), 0); 5968 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 5969 // We make use of a little bit of goto ickiness in order to avoid having to 5970 // duplicate the immediate matching logic for the undef toggled case. 5971 bool SecondTry = false; 5972 AttemptModImm: 5973 5974 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 5975 CnstBits = CnstBits.zextOrTrunc(64); 5976 uint64_t CnstVal = CnstBits.getZExtValue(); 5977 5978 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 5979 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 5980 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5981 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5982 DAG.getConstant(CnstVal, dl, MVT::i32), 5983 DAG.getConstant(0, dl, MVT::i32)); 5984 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5985 } 5986 5987 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 5988 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 5989 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5990 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5991 DAG.getConstant(CnstVal, dl, MVT::i32), 5992 DAG.getConstant(8, dl, MVT::i32)); 5993 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5994 } 5995 5996 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 5997 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 5998 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5999 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 6000 DAG.getConstant(CnstVal, dl, MVT::i32), 6001 DAG.getConstant(16, dl, MVT::i32)); 6002 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6003 } 6004 6005 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 6006 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 6007 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6008 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 6009 DAG.getConstant(CnstVal, dl, MVT::i32), 6010 DAG.getConstant(24, dl, MVT::i32)); 6011 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6012 } 6013 6014 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 6015 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 6016 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; 6017 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 6018 DAG.getConstant(CnstVal, dl, MVT::i32), 6019 DAG.getConstant(0, dl, MVT::i32)); 6020 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6021 } 6022 6023 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 6024 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 6025 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6026 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 6027 DAG.getConstant(CnstVal, dl, MVT::i32), 6028 DAG.getConstant(8, dl, MVT::i32)); 6029 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6030 } 6031 } 6032 6033 if (SecondTry) 6034 goto FailedModImm; 6035 SecondTry = true; 6036 CnstBits = UndefBits; 6037 goto AttemptModImm; 6038 } 6039 6040 // We can always fall back to a non-immediate OR. 6041 FailedModImm: 6042 return Op; 6043 } 6044 6045 // Normalize the operands of BUILD_VECTOR. The value of constant operands will 6046 // be truncated to fit element width. 6047 static SDValue NormalizeBuildVector(SDValue Op, 6048 SelectionDAG &DAG) { 6049 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 6050 SDLoc dl(Op); 6051 EVT VT = Op.getValueType(); 6052 EVT EltTy= VT.getVectorElementType(); 6053 6054 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) 6055 return Op; 6056 6057 SmallVector<SDValue, 16> Ops; 6058 for (SDValue Lane : Op->ops()) { 6059 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { 6060 APInt LowBits(EltTy.getSizeInBits(), 6061 CstLane->getZExtValue()); 6062 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); 6063 } 6064 Ops.push_back(Lane); 6065 } 6066 return DAG.getBuildVector(VT, dl, Ops); 6067 } 6068 6069 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, 6070 SelectionDAG &DAG) const { 6071 SDLoc dl(Op); 6072 EVT VT = Op.getValueType(); 6073 Op = NormalizeBuildVector(Op, DAG); 6074 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 6075 6076 APInt CnstBits(VT.getSizeInBits(), 0); 6077 APInt UndefBits(VT.getSizeInBits(), 0); 6078 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 6079 // We make use of a little bit of goto ickiness in order to avoid having to 6080 // duplicate the immediate matching logic for the undef toggled case. 6081 bool SecondTry = false; 6082 AttemptModImm: 6083 6084 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 6085 CnstBits = CnstBits.zextOrTrunc(64); 6086 uint64_t CnstVal = CnstBits.getZExtValue(); 6087 6088 // Certain magic vector constants (used to express things like NOT 6089 // and NEG) are passed through unmodified. This allows codegen patterns 6090 // for these operations to match. Special-purpose patterns will lower 6091 // these immediates to MOVIs if it proves necessary. 6092 if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) 6093 return Op; 6094 6095 // The many faces of MOVI... 6096 if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { 6097 CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); 6098 if (VT.getSizeInBits() == 128) { 6099 SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, 6100 DAG.getConstant(CnstVal, dl, MVT::i32)); 6101 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6102 } 6103 6104 // Support the V64 version via subregister insertion. 
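      // MOVIedit materializes the 64-bit immediate into a D register (typed
      // here as f64); the NVCAST below reinterprets it as the requested
      // 64-bit vector type.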
6105 SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, 6106 DAG.getConstant(CnstVal, dl, MVT::i32)); 6107 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6108 } 6109 6110 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 6111 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 6112 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6113 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6114 DAG.getConstant(CnstVal, dl, MVT::i32), 6115 DAG.getConstant(0, dl, MVT::i32)); 6116 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6117 } 6118 6119 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 6120 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 6121 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6122 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6123 DAG.getConstant(CnstVal, dl, MVT::i32), 6124 DAG.getConstant(8, dl, MVT::i32)); 6125 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6126 } 6127 6128 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 6129 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 6130 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6131 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6132 DAG.getConstant(CnstVal, dl, MVT::i32), 6133 DAG.getConstant(16, dl, MVT::i32)); 6134 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6135 } 6136 6137 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 6138 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 6139 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6140 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6141 DAG.getConstant(CnstVal, dl, MVT::i32), 6142 DAG.getConstant(24, dl, MVT::i32)); 6143 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6144 } 6145 6146 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 6147 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 6148 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6149 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6150 DAG.getConstant(CnstVal, dl, MVT::i32), 6151 DAG.getConstant(0, dl, MVT::i32)); 6152 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6153 } 6154 6155 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 6156 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 6157 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6158 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6159 DAG.getConstant(CnstVal, dl, MVT::i32), 6160 DAG.getConstant(8, dl, MVT::i32)); 6161 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6162 } 6163 6164 if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { 6165 CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); 6166 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6167 SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, 6168 DAG.getConstant(CnstVal, dl, MVT::i32), 6169 DAG.getConstant(264, dl, MVT::i32)); 6170 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6171 } 6172 6173 if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { 6174 CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); 6175 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 6176 SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, 6177 DAG.getConstant(CnstVal, dl, MVT::i32), 6178 DAG.getConstant(272, dl, MVT::i32)); 6179 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6180 } 6181 6182 if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { 6183 CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); 6184 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; 6185 SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, 6186 DAG.getConstant(CnstVal, dl, MVT::i32)); 6187 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6188 } 6189 6190 // The few faces of FMOV... 6191 if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { 6192 CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); 6193 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; 6194 SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, 6195 DAG.getConstant(CnstVal, dl, MVT::i32)); 6196 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6197 } 6198 6199 if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && 6200 VT.getSizeInBits() == 128) { 6201 CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); 6202 SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, 6203 DAG.getConstant(CnstVal, dl, MVT::i32)); 6204 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6205 } 6206 6207 // The many faces of MVNI... 6208 CnstVal = ~CnstVal; 6209 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 6210 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 6211 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6212 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6213 DAG.getConstant(CnstVal, dl, MVT::i32), 6214 DAG.getConstant(0, dl, MVT::i32)); 6215 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6216 } 6217 6218 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 6219 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 6220 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6221 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6222 DAG.getConstant(CnstVal, dl, MVT::i32), 6223 DAG.getConstant(8, dl, MVT::i32)); 6224 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6225 } 6226 6227 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 6228 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 6229 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6230 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6231 DAG.getConstant(CnstVal, dl, MVT::i32), 6232 DAG.getConstant(16, dl, MVT::i32)); 6233 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6234 } 6235 6236 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 6237 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 6238 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6239 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6240 DAG.getConstant(CnstVal, dl, MVT::i32), 6241 DAG.getConstant(24, dl, MVT::i32)); 6242 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6243 } 6244 6245 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 6246 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 6247 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; 6248 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6249 DAG.getConstant(CnstVal, dl, MVT::i32), 6250 DAG.getConstant(0, dl, MVT::i32)); 6251 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6252 } 6253 6254 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 6255 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 6256 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6257 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6258 DAG.getConstant(CnstVal, dl, MVT::i32), 6259 DAG.getConstant(8, dl, MVT::i32)); 6260 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6261 } 6262 6263 if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { 6264 CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); 6265 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6266 SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, 6267 DAG.getConstant(CnstVal, dl, MVT::i32), 6268 DAG.getConstant(264, dl, MVT::i32)); 6269 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6270 } 6271 6272 if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { 6273 CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); 6274 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6275 SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, 6276 DAG.getConstant(CnstVal, dl, MVT::i32), 6277 DAG.getConstant(272, dl, MVT::i32)); 6278 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6279 } 6280 } 6281 6282 if (SecondTry) 6283 goto FailedModImm; 6284 SecondTry = true; 6285 CnstBits = UndefBits; 6286 goto AttemptModImm; 6287 } 6288 FailedModImm: 6289 6290 // Scan through the operands to find some interesting properties we can 6291 // exploit: 6292 // 1) If only one value is used, we can use a DUP, or 6293 // 2) if only the low element is not undef, we can just insert that, or 6294 // 3) if only one constant value is used (w/ some non-constant lanes), 6295 // we can splat the constant value into the whole vector then fill 6296 // in the non-constant lanes. 6297 // 4) FIXME: If different constant values are used, but we can intelligently 6298 // select the values we'll be overwriting for the non-constant 6299 // lanes such that we can directly materialize the vector 6300 // some other way (MOVI, e.g.), we can be sneaky. 6301 unsigned NumElts = VT.getVectorNumElements(); 6302 bool isOnlyLowElement = true; 6303 bool usesOnlyOneValue = true; 6304 bool usesOnlyOneConstantValue = true; 6305 bool isConstant = true; 6306 unsigned NumConstantLanes = 0; 6307 SDValue Value; 6308 SDValue ConstantValue; 6309 for (unsigned i = 0; i < NumElts; ++i) { 6310 SDValue V = Op.getOperand(i); 6311 if (V.isUndef()) 6312 continue; 6313 if (i > 0) 6314 isOnlyLowElement = false; 6315 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 6316 isConstant = false; 6317 6318 if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) { 6319 ++NumConstantLanes; 6320 if (!ConstantValue.getNode()) 6321 ConstantValue = V; 6322 else if (ConstantValue != V) 6323 usesOnlyOneConstantValue = false; 6324 } 6325 6326 if (!Value.getNode()) 6327 Value = V; 6328 else if (V != Value) 6329 usesOnlyOneValue = false; 6330 } 6331 6332 if (!Value.getNode()) 6333 return DAG.getUNDEF(VT); 6334 6335 if (isOnlyLowElement) 6336 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 6337 6338 // Use DUP for non-constant splats. For f32 constant splats, reduce to 6339 // i32 and try again. 
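  // (The bitcast-to-integer retry below also covers f16 and f64 splat
  // constants, as the assert in the floating-point branch indicates.)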
6340 if (usesOnlyOneValue) { 6341 if (!isConstant) { 6342 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 6343 Value.getValueType() != VT) 6344 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); 6345 6346 // This is actually a DUPLANExx operation, which keeps everything vectory. 6347 6348 // DUPLANE works on 128-bit vectors, widen it if necessary. 6349 SDValue Lane = Value.getOperand(1); 6350 Value = Value.getOperand(0); 6351 if (Value.getValueType().getSizeInBits() == 64) 6352 Value = WidenVector(Value, DAG); 6353 6354 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 6355 return DAG.getNode(Opcode, dl, VT, Value, Lane); 6356 } 6357 6358 if (VT.getVectorElementType().isFloatingPoint()) { 6359 SmallVector<SDValue, 8> Ops; 6360 EVT EltTy = VT.getVectorElementType(); 6361 assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && 6362 "Unsupported floating-point vector type"); 6363 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); 6364 for (unsigned i = 0; i < NumElts; ++i) 6365 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); 6366 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); 6367 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 6368 Val = LowerBUILD_VECTOR(Val, DAG); 6369 if (Val.getNode()) 6370 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6371 } 6372 } 6373 6374 // If there was only one constant value used and for more than one lane, 6375 // start by splatting that value, then replace the non-constant lanes. This 6376 // is better than the default, which will perform a separate initialization 6377 // for each lane. 6378 if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { 6379 SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); 6380 // Now insert the non-constant lanes. 6381 for (unsigned i = 0; i < NumElts; ++i) { 6382 SDValue V = Op.getOperand(i); 6383 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 6384 if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) { 6385 // Note that type legalization likely mucked about with the VT of the 6386 // source operand, so we may have to convert it here before inserting. 6387 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); 6388 } 6389 } 6390 return Val; 6391 } 6392 6393 // If all elements are constants and the case above didn't get hit, fall back 6394 // to the default expansion, which will generate a load from the constant 6395 // pool. 6396 if (isConstant) 6397 return SDValue(); 6398 6399 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 6400 if (NumElts >= 4) { 6401 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 6402 return shuffle; 6403 } 6404 6405 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 6406 // know the default expansion would otherwise fall back on something even 6407 // worse. For a vector with one or two non-undef values, that's 6408 // scalar_to_vector for the elements followed by a shuffle (provided the 6409 // shuffle is valid for the target) and materialization element by element 6410 // on the stack followed by a load for everything else. 
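  // Seed lane 0 via INSERT_SUBREG when profitable, then fill the remaining
  // defined lanes with INSERT_VECTOR_ELT.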
6411 if (!isConstant && !usesOnlyOneValue) { 6412 SDValue Vec = DAG.getUNDEF(VT); 6413 SDValue Op0 = Op.getOperand(0); 6414 unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); 6415 unsigned i = 0; 6416 // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to 6417 // a) Avoid a RMW dependency on the full vector register, and 6418 // b) Allow the register coalescer to fold away the copy if the 6419 // value is already in an S or D register. 6420 // Do not do this for UNDEF/LOAD nodes because we have better patterns 6421 // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. 6422 if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && 6423 (ElemSize == 32 || ElemSize == 64)) { 6424 unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; 6425 MachineSDNode *N = 6426 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, 6427 DAG.getTargetConstant(SubIdx, dl, MVT::i32)); 6428 Vec = SDValue(N, 0); 6429 ++i; 6430 } 6431 for (; i < NumElts; ++i) { 6432 SDValue V = Op.getOperand(i); 6433 if (V.isUndef()) 6434 continue; 6435 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 6436 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 6437 } 6438 return Vec; 6439 } 6440 6441 // Just use the default expansion. We failed to find a better alternative. 6442 return SDValue(); 6443 } 6444 6445 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 6446 SelectionDAG &DAG) const { 6447 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); 6448 6449 // Check for non-constant or out of range lane. 6450 EVT VT = Op.getOperand(0).getValueType(); 6451 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 6452 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 6453 return SDValue(); 6454 6455 6456 // Insertion/extraction are legal for V128 types. 6457 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 6458 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 6459 VT == MVT::v8f16) 6460 return Op; 6461 6462 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 6463 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) 6464 return SDValue(); 6465 6466 // For V64 types, we perform insertion by expanding the value 6467 // to a V128 type and perform the insertion on that. 6468 SDLoc DL(Op); 6469 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 6470 EVT WideTy = WideVec.getValueType(); 6471 6472 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, 6473 Op.getOperand(1), Op.getOperand(2)); 6474 // Re-narrow the resultant vector. 6475 return NarrowVector(Node, DAG); 6476 } 6477 6478 SDValue 6479 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6480 SelectionDAG &DAG) const { 6481 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); 6482 6483 // Check for non-constant or out of range lane. 6484 EVT VT = Op.getOperand(0).getValueType(); 6485 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6486 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 6487 return SDValue(); 6488 6489 6490 // Insertion/extraction are legal for V128 types. 
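  // (For these types the insert/extract patterns match directly; V64 types
  // are widened to their V128 counterparts below.)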
6491 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 6492 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 6493 VT == MVT::v8f16) 6494 return Op; 6495 6496 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 6497 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) 6498 return SDValue(); 6499 6500 // For V64 types, we perform extraction by expanding the value 6501 // to a V128 type and perform the extraction on that. 6502 SDLoc DL(Op); 6503 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 6504 EVT WideTy = WideVec.getValueType(); 6505 6506 EVT ExtrTy = WideTy.getVectorElementType(); 6507 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) 6508 ExtrTy = MVT::i32; 6509 6510 // For extractions, we just return the result directly. 6511 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, 6512 Op.getOperand(1)); 6513 } 6514 6515 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 6516 SelectionDAG &DAG) const { 6517 EVT VT = Op.getOperand(0).getValueType(); 6518 SDLoc dl(Op); 6519 // Just in case... 6520 if (!VT.isVector()) 6521 return SDValue(); 6522 6523 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6524 if (!Cst) 6525 return SDValue(); 6526 unsigned Val = Cst->getZExtValue(); 6527 6528 unsigned Size = Op.getValueType().getSizeInBits(); 6529 6530 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. 6531 if (Val == 0) 6532 return Op; 6533 6534 // If this is extracting the upper 64-bits of a 128-bit vector, we match 6535 // that directly. 6536 if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) 6537 return Op; 6538 6539 return SDValue(); 6540 } 6541 6542 bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 6543 EVT VT) const { 6544 if (VT.getVectorNumElements() == 4 && 6545 (VT.is128BitVector() || VT.is64BitVector())) { 6546 unsigned PFIndexes[4]; 6547 for (unsigned i = 0; i != 4; ++i) { 6548 if (M[i] < 0) 6549 PFIndexes[i] = 8; 6550 else 6551 PFIndexes[i] = M[i]; 6552 } 6553 6554 // Compute the index in the perfect shuffle table. 6555 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 6556 PFIndexes[2] * 9 + PFIndexes[3]; 6557 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6558 unsigned Cost = (PFEntry >> 30); 6559 6560 if (Cost <= 4) 6561 return true; 6562 } 6563 6564 bool DummyBool; 6565 int DummyInt; 6566 unsigned DummyUnsigned; 6567 6568 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || 6569 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || 6570 isEXTMask(M, VT, DummyBool, DummyUnsigned) || 6571 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. 6572 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || 6573 isZIPMask(M, VT, DummyUnsigned) || 6574 isTRN_v_undef_Mask(M, VT, DummyUnsigned) || 6575 isUZP_v_undef_Mask(M, VT, DummyUnsigned) || 6576 isZIP_v_undef_Mask(M, VT, DummyUnsigned) || 6577 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || 6578 isConcatMask(M, VT, VT.getSizeInBits() == 128)); 6579 } 6580 6581 /// getVShiftImm - Check if this is a valid build_vector for the immediate 6582 /// operand of a vector shift operation, where all the elements of the 6583 /// build_vector must have the same constant integer value. 6584 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6585 // Ignore bit_converts. 
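  // Peeling off the BITCASTs lets a constant splat that was built at a
  // different element width still be recognized as a shift immediate.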
6586   while (Op.getOpcode() == ISD::BITCAST)
6587     Op = Op.getOperand(0);
6588   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6589   APInt SplatBits, SplatUndef;
6590   unsigned SplatBitSize;
6591   bool HasAnyUndefs;
6592   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
6593                                     HasAnyUndefs, ElementBits) ||
6594       SplatBitSize > ElementBits)
6595     return false;
6596   Cnt = SplatBits.getSExtValue();
6597   return true;
6598 }
6599
6600 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
6601 /// operand of a vector shift left operation. The value must be in the range:
6602 ///   0 <= Value < ElementBits for a left shift; or
6603 ///   0 <= Value <= ElementBits for a long left shift.
6604 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6605   assert(VT.isVector() && "vector shift count is not a vector type");
6606   int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
6607   if (!getVShiftImm(Op, ElementBits, Cnt))
6608     return false;
6609   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6610 }
6611
6612 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
6613 /// operand of a vector shift right operation. The value must be in the range:
6614 ///   1 <= Value <= ElementBits for a right shift, or ElementBits/2 for a narrow right shift.
6615 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
6616   assert(VT.isVector() && "vector shift count is not a vector type");
6617   int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
6618   if (!getVShiftImm(Op, ElementBits, Cnt))
6619     return false;
6620   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6621 }
6622
6623 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
6624                                                       SelectionDAG &DAG) const {
6625   EVT VT = Op.getValueType();
6626   SDLoc DL(Op);
6627   int64_t Cnt;
6628
6629   if (!Op.getOperand(1).getValueType().isVector())
6630     return Op;
6631   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
6632
6633   switch (Op.getOpcode()) {
6634   default:
6635     llvm_unreachable("unexpected shift opcode");
6636
6637   case ISD::SHL:
6638     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
6639       return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
6640                          DAG.getConstant(Cnt, DL, MVT::i32));
6641     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
6642                        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
6643                                        MVT::i32),
6644                        Op.getOperand(0), Op.