1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the AArch64TargetLowering class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AArch64ISelLowering.h" 15 #include "AArch64CallingConvention.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PerfectShuffle.h" 18 #include "AArch64Subtarget.h" 19 #include "AArch64TargetMachine.h" 20 #include "AArch64TargetObjectFile.h" 21 #include "MCTargetDesc/AArch64AddressingModes.h" 22 #include "llvm/ADT/Statistic.h" 23 #include "llvm/CodeGen/CallingConvLower.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineRegisterInfo.h" 27 #include "llvm/IR/Function.h" 28 #include "llvm/IR/GetElementPtrTypeIterator.h" 29 #include "llvm/IR/Intrinsics.h" 30 #include "llvm/IR/Type.h" 31 #include "llvm/Support/CommandLine.h" 32 #include "llvm/Support/Debug.h" 33 #include "llvm/Support/ErrorHandling.h" 34 #include "llvm/Support/raw_ostream.h" 35 #include "llvm/Target/TargetOptions.h" 36 using namespace llvm; 37 38 #define DEBUG_TYPE "aarch64-lower" 39 40 STATISTIC(NumTailCalls, "Number of tail calls"); 41 STATISTIC(NumShiftInserts, "Number of vector shift inserts"); 42 43 static cl::opt<bool> 44 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, 45 cl::desc("Allow AArch64 SLI/SRI formation"), 46 cl::init(false)); 47 48 // FIXME: The necessary dtprel relocations don't seem to be supported 49 // well in the GNU bfd and gold linkers at the moment. Therefore, by 50 // default, for now, fall back to GeneralDynamic code generation. 51 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( 52 "aarch64-elf-ldtls-generation", cl::Hidden, 53 cl::desc("Allow AArch64 Local Dynamic TLS code generation"), 54 cl::init(false)); 55 56 /// Value type used for condition codes. 57 static const MVT MVT_CC = MVT::i32; 58 59 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 60 const AArch64Subtarget &STI) 61 : TargetLowering(TM), Subtarget(&STI) { 62 63 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so 64 // we have to make something up. Arbitrarily, choose ZeroOrOne. 65 setBooleanContents(ZeroOrOneBooleanContent); 66 // When comparing vectors the result sets the different elements in the 67 // vector to all-one or all-zero. 68 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 69 70 // Set up the register classes. 71 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); 72 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); 73 74 if (Subtarget->hasFPARMv8()) { 75 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); 76 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); 77 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); 78 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); 79 } 80 81 if (Subtarget->hasNEON()) { 82 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); 83 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); 84 // Someone set us up the NEON. 
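// The addDRTypeForNEON/addQRTypeForNEON helpers below (defined later in this
// file) register a vector type in the 64-bit D-register (FPR64) or 128-bit
// Q-register (FPR128) class and set up the operation actions common to all
// NEON vector types.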
85 addDRTypeForNEON(MVT::v2f32); 86 addDRTypeForNEON(MVT::v8i8); 87 addDRTypeForNEON(MVT::v4i16); 88 addDRTypeForNEON(MVT::v2i32); 89 addDRTypeForNEON(MVT::v1i64); 90 addDRTypeForNEON(MVT::v1f64); 91 addDRTypeForNEON(MVT::v4f16); 92 93 addQRTypeForNEON(MVT::v4f32); 94 addQRTypeForNEON(MVT::v2f64); 95 addQRTypeForNEON(MVT::v16i8); 96 addQRTypeForNEON(MVT::v8i16); 97 addQRTypeForNEON(MVT::v4i32); 98 addQRTypeForNEON(MVT::v2i64); 99 addQRTypeForNEON(MVT::v8f16); 100 } 101 102 // Compute derived properties from the register classes 103 computeRegisterProperties(Subtarget->getRegisterInfo()); 104 105 // Provide all sorts of operation actions 106 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 107 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 108 setOperationAction(ISD::SETCC, MVT::i32, Custom); 109 setOperationAction(ISD::SETCC, MVT::i64, Custom); 110 setOperationAction(ISD::SETCC, MVT::f32, Custom); 111 setOperationAction(ISD::SETCC, MVT::f64, Custom); 112 setOperationAction(ISD::BRCOND, MVT::Other, Expand); 113 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 114 setOperationAction(ISD::BR_CC, MVT::i64, Custom); 115 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 116 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 117 setOperationAction(ISD::SELECT, MVT::i32, Custom); 118 setOperationAction(ISD::SELECT, MVT::i64, Custom); 119 setOperationAction(ISD::SELECT, MVT::f32, Custom); 120 setOperationAction(ISD::SELECT, MVT::f64, Custom); 121 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 122 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); 123 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 124 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 125 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 126 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 127 128 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); 129 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); 130 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); 131 132 setOperationAction(ISD::FREM, MVT::f32, Expand); 133 setOperationAction(ISD::FREM, MVT::f64, Expand); 134 setOperationAction(ISD::FREM, MVT::f80, Expand); 135 136 // Custom lowering hooks are needed for XOR 137 // to fold it into CSINC/CSINV. 138 setOperationAction(ISD::XOR, MVT::i32, Custom); 139 setOperationAction(ISD::XOR, MVT::i64, Custom); 140 141 // Virtually no operation on f128 is legal, but LLVM can't expand them when 142 // there's a valid register class, so we need custom operations in most cases. 
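// The Custom-marked f128 arithmetic below is lowered to the usual soft-float
// runtime calls (e.g. an f128 FADD becomes a call to __addtf3); AArch64 only
// has loads, stores and register moves that operate on f128 directly.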
143 setOperationAction(ISD::FABS, MVT::f128, Expand); 144 setOperationAction(ISD::FADD, MVT::f128, Custom); 145 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); 146 setOperationAction(ISD::FCOS, MVT::f128, Expand); 147 setOperationAction(ISD::FDIV, MVT::f128, Custom); 148 setOperationAction(ISD::FMA, MVT::f128, Expand); 149 setOperationAction(ISD::FMUL, MVT::f128, Custom); 150 setOperationAction(ISD::FNEG, MVT::f128, Expand); 151 setOperationAction(ISD::FPOW, MVT::f128, Expand); 152 setOperationAction(ISD::FREM, MVT::f128, Expand); 153 setOperationAction(ISD::FRINT, MVT::f128, Expand); 154 setOperationAction(ISD::FSIN, MVT::f128, Expand); 155 setOperationAction(ISD::FSINCOS, MVT::f128, Expand); 156 setOperationAction(ISD::FSQRT, MVT::f128, Expand); 157 setOperationAction(ISD::FSUB, MVT::f128, Custom); 158 setOperationAction(ISD::FTRUNC, MVT::f128, Expand); 159 setOperationAction(ISD::SETCC, MVT::f128, Custom); 160 setOperationAction(ISD::BR_CC, MVT::f128, Custom); 161 setOperationAction(ISD::SELECT, MVT::f128, Custom); 162 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); 163 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); 164 165 // Lowering for many of the conversions is actually specified by the non-f128 166 // type. The LowerXXX function will be trivial when f128 isn't involved. 167 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 168 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 169 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); 170 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 171 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 172 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); 173 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 174 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 175 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); 176 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 177 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 178 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); 179 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 180 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); 181 182 // Variable arguments. 183 setOperationAction(ISD::VASTART, MVT::Other, Custom); 184 setOperationAction(ISD::VAARG, MVT::Other, Custom); 185 setOperationAction(ISD::VACOPY, MVT::Other, Custom); 186 setOperationAction(ISD::VAEND, MVT::Other, Expand); 187 188 // Variable-sized objects. 189 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 190 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 191 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 192 193 // Constant pool entries 194 setOperationAction(ISD::ConstantPool, MVT::i64, Custom); 195 196 // BlockAddress 197 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 198 199 // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 200 setOperationAction(ISD::ADDC, MVT::i32, Custom); 201 setOperationAction(ISD::ADDE, MVT::i32, Custom); 202 setOperationAction(ISD::SUBC, MVT::i32, Custom); 203 setOperationAction(ISD::SUBE, MVT::i32, Custom); 204 setOperationAction(ISD::ADDC, MVT::i64, Custom); 205 setOperationAction(ISD::ADDE, MVT::i64, Custom); 206 setOperationAction(ISD::SUBC, MVT::i64, Custom); 207 setOperationAction(ISD::SUBE, MVT::i64, Custom); 208 209 // AArch64 lacks both left-rotate and popcount instructions. 
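// ROTL is expanded in terms of the right-rotate the ISA does provide (a left
// rotate by N is a right rotate by BitWidth-N).  CTPOP is Custom because,
// while there is no scalar popcount instruction, it can be done through NEON;
// an i64 popcount becomes roughly:
//   fmov   d0, x0
//   cnt    v0.8b, v0.8b
//   uaddlv h0, v0.8b
//   fmov   w0, s0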
210 setOperationAction(ISD::ROTL, MVT::i32, Expand); 211 setOperationAction(ISD::ROTL, MVT::i64, Expand); 212 for (MVT VT : MVT::vector_valuetypes()) { 213 setOperationAction(ISD::ROTL, VT, Expand); 214 setOperationAction(ISD::ROTR, VT, Expand); 215 } 216 217 // AArch64 doesn't have {U|S}MUL_LOHI. 218 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 219 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 220 221 222 setOperationAction(ISD::CTPOP, MVT::i32, Custom); 223 setOperationAction(ISD::CTPOP, MVT::i64, Custom); 224 225 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 226 setOperationAction(ISD::SDIVREM, MVT::i64, Expand); 227 for (MVT VT : MVT::vector_valuetypes()) { 228 setOperationAction(ISD::SDIVREM, VT, Expand); 229 setOperationAction(ISD::UDIVREM, VT, Expand); 230 } 231 setOperationAction(ISD::SREM, MVT::i32, Expand); 232 setOperationAction(ISD::SREM, MVT::i64, Expand); 233 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 234 setOperationAction(ISD::UDIVREM, MVT::i64, Expand); 235 setOperationAction(ISD::UREM, MVT::i32, Expand); 236 setOperationAction(ISD::UREM, MVT::i64, Expand); 237 238 // Custom lower Add/Sub/Mul with overflow. 239 setOperationAction(ISD::SADDO, MVT::i32, Custom); 240 setOperationAction(ISD::SADDO, MVT::i64, Custom); 241 setOperationAction(ISD::UADDO, MVT::i32, Custom); 242 setOperationAction(ISD::UADDO, MVT::i64, Custom); 243 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 244 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 245 setOperationAction(ISD::USUBO, MVT::i32, Custom); 246 setOperationAction(ISD::USUBO, MVT::i64, Custom); 247 setOperationAction(ISD::SMULO, MVT::i32, Custom); 248 setOperationAction(ISD::SMULO, MVT::i64, Custom); 249 setOperationAction(ISD::UMULO, MVT::i32, Custom); 250 setOperationAction(ISD::UMULO, MVT::i64, Custom); 251 252 setOperationAction(ISD::FSIN, MVT::f32, Expand); 253 setOperationAction(ISD::FSIN, MVT::f64, Expand); 254 setOperationAction(ISD::FCOS, MVT::f32, Expand); 255 setOperationAction(ISD::FCOS, MVT::f64, Expand); 256 setOperationAction(ISD::FPOW, MVT::f32, Expand); 257 setOperationAction(ISD::FPOW, MVT::f64, Expand); 258 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 259 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 260 261 // f16 is a storage-only type, always promote it to f32. 
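// "Promote" means the f16 operation is performed in f32, with FCVT conversions
// on the inputs and result; an f16 FADD, for instance, becomes roughly:
//   fcvt s0, h0
//   fcvt s1, h1
//   fadd s0, s0, s1
//   fcvt h0, s0
// (Scalar f16 arithmetic instructions only exist from ARMv8.2-A onwards.)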
262 setOperationAction(ISD::SETCC, MVT::f16, Promote); 263 setOperationAction(ISD::BR_CC, MVT::f16, Promote); 264 setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); 265 setOperationAction(ISD::SELECT, MVT::f16, Promote); 266 setOperationAction(ISD::FADD, MVT::f16, Promote); 267 setOperationAction(ISD::FSUB, MVT::f16, Promote); 268 setOperationAction(ISD::FMUL, MVT::f16, Promote); 269 setOperationAction(ISD::FDIV, MVT::f16, Promote); 270 setOperationAction(ISD::FREM, MVT::f16, Promote); 271 setOperationAction(ISD::FMA, MVT::f16, Promote); 272 setOperationAction(ISD::FNEG, MVT::f16, Promote); 273 setOperationAction(ISD::FABS, MVT::f16, Promote); 274 setOperationAction(ISD::FCEIL, MVT::f16, Promote); 275 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); 276 setOperationAction(ISD::FCOS, MVT::f16, Promote); 277 setOperationAction(ISD::FFLOOR, MVT::f16, Promote); 278 setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); 279 setOperationAction(ISD::FPOW, MVT::f16, Promote); 280 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 281 setOperationAction(ISD::FRINT, MVT::f16, Promote); 282 setOperationAction(ISD::FSIN, MVT::f16, Promote); 283 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 284 setOperationAction(ISD::FSQRT, MVT::f16, Promote); 285 setOperationAction(ISD::FEXP, MVT::f16, Promote); 286 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 287 setOperationAction(ISD::FLOG, MVT::f16, Promote); 288 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 289 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 290 setOperationAction(ISD::FROUND, MVT::f16, Promote); 291 setOperationAction(ISD::FTRUNC, MVT::f16, Promote); 292 setOperationAction(ISD::FMINNUM, MVT::f16, Promote); 293 setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); 294 setOperationAction(ISD::FMINNAN, MVT::f16, Promote); 295 setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); 296 297 // v4f16 is also a storage-only type, so promote it to v4f32 when that is 298 // known to be safe. 299 setOperationAction(ISD::FADD, MVT::v4f16, Promote); 300 setOperationAction(ISD::FSUB, MVT::v4f16, Promote); 301 setOperationAction(ISD::FMUL, MVT::v4f16, Promote); 302 setOperationAction(ISD::FDIV, MVT::v4f16, Promote); 303 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); 304 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); 305 AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); 306 AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); 307 AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); 308 AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); 309 AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); 310 AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); 311 312 // Expand all other v4f16 operations. 
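// Expand here falls back on generic legalization: most of these end up
// scalarized, with each f16 lane promoted to f32 individually.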
313 // FIXME: We could generate better code by promoting some operations to 314 // a pair of v4f32s 315 setOperationAction(ISD::FABS, MVT::v4f16, Expand); 316 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); 317 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); 318 setOperationAction(ISD::FCOS, MVT::v4f16, Expand); 319 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); 320 setOperationAction(ISD::FMA, MVT::v4f16, Expand); 321 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); 322 setOperationAction(ISD::FNEG, MVT::v4f16, Expand); 323 setOperationAction(ISD::FPOW, MVT::v4f16, Expand); 324 setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); 325 setOperationAction(ISD::FREM, MVT::v4f16, Expand); 326 setOperationAction(ISD::FROUND, MVT::v4f16, Expand); 327 setOperationAction(ISD::FRINT, MVT::v4f16, Expand); 328 setOperationAction(ISD::FSIN, MVT::v4f16, Expand); 329 setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); 330 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); 331 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); 332 setOperationAction(ISD::SETCC, MVT::v4f16, Expand); 333 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); 334 setOperationAction(ISD::SELECT, MVT::v4f16, Expand); 335 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); 336 setOperationAction(ISD::FEXP, MVT::v4f16, Expand); 337 setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); 338 setOperationAction(ISD::FLOG, MVT::v4f16, Expand); 339 setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); 340 setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); 341 342 343 // v8f16 is also a storage-only type, so expand it. 344 setOperationAction(ISD::FABS, MVT::v8f16, Expand); 345 setOperationAction(ISD::FADD, MVT::v8f16, Expand); 346 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); 347 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); 348 setOperationAction(ISD::FCOS, MVT::v8f16, Expand); 349 setOperationAction(ISD::FDIV, MVT::v8f16, Expand); 350 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); 351 setOperationAction(ISD::FMA, MVT::v8f16, Expand); 352 setOperationAction(ISD::FMUL, MVT::v8f16, Expand); 353 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); 354 setOperationAction(ISD::FNEG, MVT::v8f16, Expand); 355 setOperationAction(ISD::FPOW, MVT::v8f16, Expand); 356 setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); 357 setOperationAction(ISD::FREM, MVT::v8f16, Expand); 358 setOperationAction(ISD::FROUND, MVT::v8f16, Expand); 359 setOperationAction(ISD::FRINT, MVT::v8f16, Expand); 360 setOperationAction(ISD::FSIN, MVT::v8f16, Expand); 361 setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); 362 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); 363 setOperationAction(ISD::FSUB, MVT::v8f16, Expand); 364 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); 365 setOperationAction(ISD::SETCC, MVT::v8f16, Expand); 366 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); 367 setOperationAction(ISD::SELECT, MVT::v8f16, Expand); 368 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); 369 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); 370 setOperationAction(ISD::FEXP, MVT::v8f16, Expand); 371 setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); 372 setOperationAction(ISD::FLOG, MVT::v8f16, Expand); 373 setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); 374 setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); 375 376 // AArch64 has implementations of a lot of rounding-like FP operations. 
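// These map directly onto single instructions: FFLOOR->FRINTM, FCEIL->FRINTP,
// FTRUNC->FRINTZ, FROUND->FRINTA, FNEARBYINT->FRINTI, FRINT->FRINTX, and
// FMINNUM/FMAXNUM/FMINNAN/FMAXNAN->FMINNM/FMAXNM/FMIN/FMAX.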
377 for (MVT Ty : {MVT::f32, MVT::f64}) { 378 setOperationAction(ISD::FFLOOR, Ty, Legal); 379 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 380 setOperationAction(ISD::FCEIL, Ty, Legal); 381 setOperationAction(ISD::FRINT, Ty, Legal); 382 setOperationAction(ISD::FTRUNC, Ty, Legal); 383 setOperationAction(ISD::FROUND, Ty, Legal); 384 setOperationAction(ISD::FMINNUM, Ty, Legal); 385 setOperationAction(ISD::FMAXNUM, Ty, Legal); 386 setOperationAction(ISD::FMINNAN, Ty, Legal); 387 setOperationAction(ISD::FMAXNAN, Ty, Legal); 388 } 389 390 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 391 392 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); 393 394 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. 395 // This requires the Performance Monitors extension. 396 if (Subtarget->hasPerfMon()) 397 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 398 399 if (Subtarget->isTargetMachO()) { 400 // For iOS, we don't want to the normal expansion of a libcall to 401 // sincos. We want to issue a libcall to __sincos_stret to avoid memory 402 // traffic. 403 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 404 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 405 } else { 406 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 407 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 408 } 409 410 // Make floating-point constants legal for the large code model, so they don't 411 // become loads from the constant pool. 412 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { 413 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 414 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 415 } 416 417 // AArch64 does not have floating-point extending loads, i1 sign-extending 418 // load, floating-point truncating stores, or v2i32->v2i16 truncating store. 419 for (MVT VT : MVT::fp_valuetypes()) { 420 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 421 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 422 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); 423 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); 424 } 425 for (MVT VT : MVT::integer_valuetypes()) 426 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); 427 428 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 429 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 430 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 431 setTruncStoreAction(MVT::f128, MVT::f80, Expand); 432 setTruncStoreAction(MVT::f128, MVT::f64, Expand); 433 setTruncStoreAction(MVT::f128, MVT::f32, Expand); 434 setTruncStoreAction(MVT::f128, MVT::f16, Expand); 435 436 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 437 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 438 439 // Indexed loads and stores are supported. 440 for (unsigned im = (unsigned)ISD::PRE_INC; 441 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 442 setIndexedLoadAction(im, MVT::i8, Legal); 443 setIndexedLoadAction(im, MVT::i16, Legal); 444 setIndexedLoadAction(im, MVT::i32, Legal); 445 setIndexedLoadAction(im, MVT::i64, Legal); 446 setIndexedLoadAction(im, MVT::f64, Legal); 447 setIndexedLoadAction(im, MVT::f32, Legal); 448 setIndexedLoadAction(im, MVT::f16, Legal); 449 setIndexedStoreAction(im, MVT::i8, Legal); 450 setIndexedStoreAction(im, MVT::i16, Legal); 451 setIndexedStoreAction(im, MVT::i32, Legal); 452 setIndexedStoreAction(im, MVT::i64, Legal); 453 setIndexedStoreAction(im, MVT::f64, Legal); 454 setIndexedStoreAction(im, MVT::f32, Legal); 455 setIndexedStoreAction(im, MVT::f16, Legal); 456 } 457 458 // Trap. 
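// ISD::TRAP is Legal; it is selected to a BRK #1 instruction.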
459 setOperationAction(ISD::TRAP, MVT::Other, Legal); 460 461 // We combine OR nodes for bitfield operations. 462 setTargetDAGCombine(ISD::OR); 463 464 // Vector add and sub nodes may conceal a high-half opportunity. 465 // Also, try to fold ADD into CSINC/CSINV.. 466 setTargetDAGCombine(ISD::ADD); 467 setTargetDAGCombine(ISD::SUB); 468 setTargetDAGCombine(ISD::SRL); 469 setTargetDAGCombine(ISD::XOR); 470 setTargetDAGCombine(ISD::SINT_TO_FP); 471 setTargetDAGCombine(ISD::UINT_TO_FP); 472 473 setTargetDAGCombine(ISD::FP_TO_SINT); 474 setTargetDAGCombine(ISD::FP_TO_UINT); 475 setTargetDAGCombine(ISD::FDIV); 476 477 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 478 479 setTargetDAGCombine(ISD::ANY_EXTEND); 480 setTargetDAGCombine(ISD::ZERO_EXTEND); 481 setTargetDAGCombine(ISD::SIGN_EXTEND); 482 setTargetDAGCombine(ISD::BITCAST); 483 setTargetDAGCombine(ISD::CONCAT_VECTORS); 484 setTargetDAGCombine(ISD::STORE); 485 if (Subtarget->supportsAddressTopByteIgnored()) 486 setTargetDAGCombine(ISD::LOAD); 487 488 setTargetDAGCombine(ISD::MUL); 489 490 setTargetDAGCombine(ISD::SELECT); 491 setTargetDAGCombine(ISD::VSELECT); 492 493 setTargetDAGCombine(ISD::INTRINSIC_VOID); 494 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 495 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 496 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 497 498 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; 499 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; 500 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; 501 502 setStackPointerRegisterToSaveRestore(AArch64::SP); 503 504 setSchedulingPreference(Sched::Hybrid); 505 506 // Enable TBZ/TBNZ 507 MaskAndBranchFoldingIsLegal = true; 508 EnableExtLdPromotion = true; 509 510 // Set required alignment. 511 setMinFunctionAlignment(2); 512 // Set preferred alignments. 
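// Note that alignments here are log2 values: the minimum function alignment of
// 2 above means 4 bytes, matching the fixed AArch64 instruction size.  The
// preferred function and loop alignments are taken from the subtarget so they
// can be tuned per CPU.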
513 setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); 514 setPrefLoopAlignment(STI.getPrefLoopAlignment()); 515 516 setHasExtractBitsInsn(true); 517 518 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 519 520 if (Subtarget->hasNEON()) { 521 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to 522 // silliness like this: 523 setOperationAction(ISD::FABS, MVT::v1f64, Expand); 524 setOperationAction(ISD::FADD, MVT::v1f64, Expand); 525 setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); 526 setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); 527 setOperationAction(ISD::FCOS, MVT::v1f64, Expand); 528 setOperationAction(ISD::FDIV, MVT::v1f64, Expand); 529 setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); 530 setOperationAction(ISD::FMA, MVT::v1f64, Expand); 531 setOperationAction(ISD::FMUL, MVT::v1f64, Expand); 532 setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); 533 setOperationAction(ISD::FNEG, MVT::v1f64, Expand); 534 setOperationAction(ISD::FPOW, MVT::v1f64, Expand); 535 setOperationAction(ISD::FREM, MVT::v1f64, Expand); 536 setOperationAction(ISD::FROUND, MVT::v1f64, Expand); 537 setOperationAction(ISD::FRINT, MVT::v1f64, Expand); 538 setOperationAction(ISD::FSIN, MVT::v1f64, Expand); 539 setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); 540 setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); 541 setOperationAction(ISD::FSUB, MVT::v1f64, Expand); 542 setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); 543 setOperationAction(ISD::SETCC, MVT::v1f64, Expand); 544 setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); 545 setOperationAction(ISD::SELECT, MVT::v1f64, Expand); 546 setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); 547 setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); 548 549 setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); 550 setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); 551 setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); 552 setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); 553 setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); 554 555 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 556 557 // AArch64 doesn't have a direct vector ->f32 conversion instructions for 558 // elements smaller than i32, so promote the input to i32 first. 559 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); 560 setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); 561 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); 562 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); 563 // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16 564 // -> v8f16 conversions. 565 setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote); 566 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote); 567 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); 568 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote); 569 // Similarly, there is no direct i32 -> f64 vector conversion instruction. 570 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 571 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); 572 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); 573 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); 574 // Or, direct i32 -> f16 vector conversion. 
Set it so custom, so the 575 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 576 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); 577 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); 578 579 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 580 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 581 582 setOperationAction(ISD::CTTZ, MVT::v2i8, Expand); 583 setOperationAction(ISD::CTTZ, MVT::v4i16, Expand); 584 setOperationAction(ISD::CTTZ, MVT::v2i32, Expand); 585 setOperationAction(ISD::CTTZ, MVT::v1i64, Expand); 586 setOperationAction(ISD::CTTZ, MVT::v16i8, Expand); 587 setOperationAction(ISD::CTTZ, MVT::v8i16, Expand); 588 setOperationAction(ISD::CTTZ, MVT::v4i32, Expand); 589 setOperationAction(ISD::CTTZ, MVT::v2i64, Expand); 590 591 // AArch64 doesn't have MUL.2d: 592 setOperationAction(ISD::MUL, MVT::v2i64, Expand); 593 // Custom handling for some quad-vector types to detect MULL. 594 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 595 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 596 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 597 598 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); 599 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); 600 // Likewise, narrowing and extending vector loads/stores aren't handled 601 // directly. 602 for (MVT VT : MVT::vector_valuetypes()) { 603 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 604 605 setOperationAction(ISD::MULHS, VT, Expand); 606 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 607 setOperationAction(ISD::MULHU, VT, Expand); 608 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 609 610 setOperationAction(ISD::BSWAP, VT, Expand); 611 612 for (MVT InnerVT : MVT::vector_valuetypes()) { 613 setTruncStoreAction(VT, InnerVT, Expand); 614 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 615 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 616 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 617 } 618 } 619 620 // AArch64 has implementations of a lot of rounding-like FP operations. 621 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { 622 setOperationAction(ISD::FFLOOR, Ty, Legal); 623 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 624 setOperationAction(ISD::FCEIL, Ty, Legal); 625 setOperationAction(ISD::FRINT, Ty, Legal); 626 setOperationAction(ISD::FTRUNC, Ty, Legal); 627 setOperationAction(ISD::FROUND, Ty, Legal); 628 } 629 } 630 631 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); 632 } 633 634 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { 635 if (VT == MVT::v2f32 || VT == MVT::v4f16) { 636 setOperationAction(ISD::LOAD, VT, Promote); 637 AddPromotedToType(ISD::LOAD, VT, MVT::v2i32); 638 639 setOperationAction(ISD::STORE, VT, Promote); 640 AddPromotedToType(ISD::STORE, VT, MVT::v2i32); 641 } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { 642 setOperationAction(ISD::LOAD, VT, Promote); 643 AddPromotedToType(ISD::LOAD, VT, MVT::v2i64); 644 645 setOperationAction(ISD::STORE, VT, Promote); 646 AddPromotedToType(ISD::STORE, VT, MVT::v2i64); 647 } 648 649 // Mark vector float intrinsics as expand. 
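// There are no NEON instructions for these; they get scalarized and end up as
// calls to the corresponding libm routines (e.g. one sinf call per f32 lane).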
650 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { 651 setOperationAction(ISD::FSIN, VT, Expand); 652 setOperationAction(ISD::FCOS, VT, Expand); 653 setOperationAction(ISD::FPOWI, VT, Expand); 654 setOperationAction(ISD::FPOW, VT, Expand); 655 setOperationAction(ISD::FLOG, VT, Expand); 656 setOperationAction(ISD::FLOG2, VT, Expand); 657 setOperationAction(ISD::FLOG10, VT, Expand); 658 setOperationAction(ISD::FEXP, VT, Expand); 659 setOperationAction(ISD::FEXP2, VT, Expand); 660 661 // But we do support custom-lowering for FCOPYSIGN. 662 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 663 } 664 665 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 666 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 667 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 668 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 669 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 670 setOperationAction(ISD::SRA, VT, Custom); 671 setOperationAction(ISD::SRL, VT, Custom); 672 setOperationAction(ISD::SHL, VT, Custom); 673 setOperationAction(ISD::AND, VT, Custom); 674 setOperationAction(ISD::OR, VT, Custom); 675 setOperationAction(ISD::SETCC, VT, Custom); 676 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 677 678 setOperationAction(ISD::SELECT, VT, Expand); 679 setOperationAction(ISD::SELECT_CC, VT, Expand); 680 setOperationAction(ISD::VSELECT, VT, Expand); 681 for (MVT InnerVT : MVT::all_valuetypes()) 682 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 683 684 // CNT supports only B element sizes. 685 if (VT != MVT::v8i8 && VT != MVT::v16i8) 686 setOperationAction(ISD::CTPOP, VT, Expand); 687 688 setOperationAction(ISD::UDIV, VT, Expand); 689 setOperationAction(ISD::SDIV, VT, Expand); 690 setOperationAction(ISD::UREM, VT, Expand); 691 setOperationAction(ISD::SREM, VT, Expand); 692 setOperationAction(ISD::FREM, VT, Expand); 693 694 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 695 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 696 697 // [SU][MIN|MAX] are available for all NEON types apart from i64. 698 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) 699 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 700 setOperationAction(Opcode, VT, Legal); 701 702 // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). 703 if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) 704 for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, 705 ISD::FMINNUM, ISD::FMAXNUM}) 706 setOperationAction(Opcode, VT, Legal); 707 708 if (Subtarget->isLittleEndian()) { 709 for (unsigned im = (unsigned)ISD::PRE_INC; 710 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 711 setIndexedLoadAction(im, VT, Legal); 712 setIndexedStoreAction(im, VT, Legal); 713 } 714 } 715 } 716 717 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { 718 addRegisterClass(VT, &AArch64::FPR64RegClass); 719 addTypeForNEON(VT, MVT::v2i32); 720 } 721 722 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { 723 addRegisterClass(VT, &AArch64::FPR128RegClass); 724 addTypeForNEON(VT, MVT::v4i32); 725 } 726 727 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, 728 EVT VT) const { 729 if (!VT.isVector()) 730 return MVT::i32; 731 return VT.changeVectorElementTypeToInteger(); 732 } 733 734 /// computeKnownBitsForTargetNode - Determine which of the bits specified in 735 /// Mask are known to be either zero or one and return them in the 736 /// KnownZero/KnownOne bitsets. 
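/// For example, for AArch64ISD::CSEL the known bits of the result are the
/// intersection of the known bits of the two selected values, and for an
/// aarch64_ldxr/ldaxr intrinsic that loads a narrow memory type all bits above
/// the memory width are known to be zero.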
737 void AArch64TargetLowering::computeKnownBitsForTargetNode( 738 const SDValue Op, APInt &KnownZero, APInt &KnownOne, 739 const SelectionDAG &DAG, unsigned Depth) const { 740 switch (Op.getOpcode()) { 741 default: 742 break; 743 case AArch64ISD::CSEL: { 744 APInt KnownZero2, KnownOne2; 745 DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1); 746 DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1); 747 KnownZero &= KnownZero2; 748 KnownOne &= KnownOne2; 749 break; 750 } 751 case ISD::INTRINSIC_W_CHAIN: { 752 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 753 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 754 switch (IntID) { 755 default: return; 756 case Intrinsic::aarch64_ldaxr: 757 case Intrinsic::aarch64_ldxr: { 758 unsigned BitWidth = KnownOne.getBitWidth(); 759 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 760 unsigned MemBits = VT.getScalarType().getSizeInBits(); 761 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 762 return; 763 } 764 } 765 break; 766 } 767 case ISD::INTRINSIC_WO_CHAIN: 768 case ISD::INTRINSIC_VOID: { 769 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 770 switch (IntNo) { 771 default: 772 break; 773 case Intrinsic::aarch64_neon_umaxv: 774 case Intrinsic::aarch64_neon_uminv: { 775 // Figure out the datatype of the vector operand. The UMINV instruction 776 // will zero extend the result, so we can mark as known zero all the 777 // bits larger than the element datatype. 32-bit or larget doesn't need 778 // this as those are legal types and will be handled by isel directly. 779 MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); 780 unsigned BitWidth = KnownZero.getBitWidth(); 781 if (VT == MVT::v8i8 || VT == MVT::v16i8) { 782 assert(BitWidth >= 8 && "Unexpected width!"); 783 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); 784 KnownZero |= Mask; 785 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { 786 assert(BitWidth >= 16 && "Unexpected width!"); 787 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 788 KnownZero |= Mask; 789 } 790 break; 791 } break; 792 } 793 } 794 } 795 } 796 797 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, 798 EVT) const { 799 return MVT::i64; 800 } 801 802 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 803 unsigned AddrSpace, 804 unsigned Align, 805 bool *Fast) const { 806 if (Subtarget->requiresStrictAlign()) 807 return false; 808 809 if (Fast) { 810 // Some CPUs are fine with unaligned stores except for 128-bit ones. 811 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || 812 // See comments in performSTORECombine() for more details about 813 // these conditions. 814 815 // Code that uses clang vector extensions can mark that it 816 // wants unaligned accesses to be treated as fast by 817 // underspecifying alignment to be 1 or 2. 818 Align <= 2 || 819 820 // Disregard v2i64. Memcpy lowering produces those and splitting 821 // them regresses performance on micro-benchmarks and olden/bh. 
822 VT == MVT::v2i64; 823 } 824 return true; 825 } 826 827 FastISel * 828 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 829 const TargetLibraryInfo *libInfo) const { 830 return AArch64::createFastISel(funcInfo, libInfo); 831 } 832 833 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 834 switch ((AArch64ISD::NodeType)Opcode) { 835 case AArch64ISD::FIRST_NUMBER: break; 836 case AArch64ISD::CALL: return "AArch64ISD::CALL"; 837 case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; 838 case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; 839 case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; 840 case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; 841 case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; 842 case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; 843 case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; 844 case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; 845 case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; 846 case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; 847 case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; 848 case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; 849 case AArch64ISD::ADC: return "AArch64ISD::ADC"; 850 case AArch64ISD::SBC: return "AArch64ISD::SBC"; 851 case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; 852 case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; 853 case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; 854 case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; 855 case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; 856 case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; 857 case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; 858 case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; 859 case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; 860 case AArch64ISD::DUP: return "AArch64ISD::DUP"; 861 case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; 862 case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; 863 case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; 864 case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; 865 case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; 866 case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; 867 case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; 868 case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; 869 case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; 870 case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; 871 case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; 872 case AArch64ISD::BICi: return "AArch64ISD::BICi"; 873 case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; 874 case AArch64ISD::BSL: return "AArch64ISD::BSL"; 875 case AArch64ISD::NEG: return "AArch64ISD::NEG"; 876 case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; 877 case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; 878 case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; 879 case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; 880 case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; 881 case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; 882 case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; 883 case AArch64ISD::REV16: return "AArch64ISD::REV16"; 884 case AArch64ISD::REV32: return "AArch64ISD::REV32"; 885 case AArch64ISD::REV64: return "AArch64ISD::REV64"; 886 case AArch64ISD::EXT: return "AArch64ISD::EXT"; 887 case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; 888 case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; 889 case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; 890 case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; 891 case 
AArch64ISD::CMGE: return "AArch64ISD::CMGE"; 892 case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; 893 case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; 894 case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; 895 case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; 896 case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; 897 case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; 898 case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; 899 case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; 900 case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; 901 case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; 902 case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; 903 case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; 904 case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; 905 case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; 906 case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; 907 case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; 908 case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; 909 case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; 910 case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; 911 case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; 912 case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; 913 case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; 914 case AArch64ISD::NOT: return "AArch64ISD::NOT"; 915 case AArch64ISD::BIT: return "AArch64ISD::BIT"; 916 case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; 917 case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; 918 case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; 919 case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; 920 case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; 921 case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; 922 case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; 923 case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; 924 case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; 925 case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; 926 case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; 927 case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; 928 case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; 929 case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; 930 case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; 931 case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; 932 case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; 933 case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; 934 case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; 935 case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; 936 case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; 937 case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; 938 case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; 939 case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; 940 case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; 941 case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; 942 case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; 943 case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; 944 case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; 945 case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; 946 case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; 947 case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; 948 case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; 949 case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; 950 case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; 951 case 
AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; 952 case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; 953 case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; 954 case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; 955 case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; 956 case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; 957 case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; 958 } 959 return nullptr; 960 } 961 962 MachineBasicBlock * 963 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, 964 MachineBasicBlock *MBB) const { 965 // We materialise the F128CSEL pseudo-instruction as some control flow and a 966 // phi node: 967 968 // OrigBB: 969 // [... previous instrs leading to comparison ...] 970 // b.ne TrueBB 971 // b EndBB 972 // TrueBB: 973 // ; Fallthrough 974 // EndBB: 975 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] 976 977 MachineFunction *MF = MBB->getParent(); 978 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 979 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 980 DebugLoc DL = MI.getDebugLoc(); 981 MachineFunction::iterator It = ++MBB->getIterator(); 982 983 unsigned DestReg = MI.getOperand(0).getReg(); 984 unsigned IfTrueReg = MI.getOperand(1).getReg(); 985 unsigned IfFalseReg = MI.getOperand(2).getReg(); 986 unsigned CondCode = MI.getOperand(3).getImm(); 987 bool NZCVKilled = MI.getOperand(4).isKill(); 988 989 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 990 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 991 MF->insert(It, TrueBB); 992 MF->insert(It, EndBB); 993 994 // Transfer rest of current basic-block to EndBB 995 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), 996 MBB->end()); 997 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 998 999 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); 1000 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); 1001 MBB->addSuccessor(TrueBB); 1002 MBB->addSuccessor(EndBB); 1003 1004 // TrueBB falls through to the end. 1005 TrueBB->addSuccessor(EndBB); 1006 1007 if (!NZCVKilled) { 1008 TrueBB->addLiveIn(AArch64::NZCV); 1009 EndBB->addLiveIn(AArch64::NZCV); 1010 } 1011 1012 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) 1013 .addReg(IfTrueReg) 1014 .addMBB(TrueBB) 1015 .addReg(IfFalseReg) 1016 .addMBB(MBB); 1017 1018 MI.eraseFromParent(); 1019 return EndBB; 1020 } 1021 1022 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( 1023 MachineInstr &MI, MachineBasicBlock *BB) const { 1024 switch (MI.getOpcode()) { 1025 default: 1026 #ifndef NDEBUG 1027 MI.dump(); 1028 #endif 1029 llvm_unreachable("Unexpected instruction for custom inserter!"); 1030 1031 case AArch64::F128CSEL: 1032 return EmitF128CSEL(MI, BB); 1033 1034 case TargetOpcode::STACKMAP: 1035 case TargetOpcode::PATCHPOINT: 1036 return emitPatchPoint(MI, BB); 1037 } 1038 } 1039 1040 //===----------------------------------------------------------------------===// 1041 // AArch64 Lowering private implementation. 
1042 //===----------------------------------------------------------------------===// 1043 1044 //===----------------------------------------------------------------------===// 1045 // Lowering Code 1046 //===----------------------------------------------------------------------===// 1047 1048 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 1049 /// CC 1050 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { 1051 switch (CC) { 1052 default: 1053 llvm_unreachable("Unknown condition code!"); 1054 case ISD::SETNE: 1055 return AArch64CC::NE; 1056 case ISD::SETEQ: 1057 return AArch64CC::EQ; 1058 case ISD::SETGT: 1059 return AArch64CC::GT; 1060 case ISD::SETGE: 1061 return AArch64CC::GE; 1062 case ISD::SETLT: 1063 return AArch64CC::LT; 1064 case ISD::SETLE: 1065 return AArch64CC::LE; 1066 case ISD::SETUGT: 1067 return AArch64CC::HI; 1068 case ISD::SETUGE: 1069 return AArch64CC::HS; 1070 case ISD::SETULT: 1071 return AArch64CC::LO; 1072 case ISD::SETULE: 1073 return AArch64CC::LS; 1074 } 1075 } 1076 1077 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 1078 static void changeFPCCToAArch64CC(ISD::CondCode CC, 1079 AArch64CC::CondCode &CondCode, 1080 AArch64CC::CondCode &CondCode2) { 1081 CondCode2 = AArch64CC::AL; 1082 switch (CC) { 1083 default: 1084 llvm_unreachable("Unknown FP condition!"); 1085 case ISD::SETEQ: 1086 case ISD::SETOEQ: 1087 CondCode = AArch64CC::EQ; 1088 break; 1089 case ISD::SETGT: 1090 case ISD::SETOGT: 1091 CondCode = AArch64CC::GT; 1092 break; 1093 case ISD::SETGE: 1094 case ISD::SETOGE: 1095 CondCode = AArch64CC::GE; 1096 break; 1097 case ISD::SETOLT: 1098 CondCode = AArch64CC::MI; 1099 break; 1100 case ISD::SETOLE: 1101 CondCode = AArch64CC::LS; 1102 break; 1103 case ISD::SETONE: 1104 CondCode = AArch64CC::MI; 1105 CondCode2 = AArch64CC::GT; 1106 break; 1107 case ISD::SETO: 1108 CondCode = AArch64CC::VC; 1109 break; 1110 case ISD::SETUO: 1111 CondCode = AArch64CC::VS; 1112 break; 1113 case ISD::SETUEQ: 1114 CondCode = AArch64CC::EQ; 1115 CondCode2 = AArch64CC::VS; 1116 break; 1117 case ISD::SETUGT: 1118 CondCode = AArch64CC::HI; 1119 break; 1120 case ISD::SETUGE: 1121 CondCode = AArch64CC::PL; 1122 break; 1123 case ISD::SETLT: 1124 case ISD::SETULT: 1125 CondCode = AArch64CC::LT; 1126 break; 1127 case ISD::SETLE: 1128 case ISD::SETULE: 1129 CondCode = AArch64CC::LE; 1130 break; 1131 case ISD::SETNE: 1132 case ISD::SETUNE: 1133 CondCode = AArch64CC::NE; 1134 break; 1135 } 1136 } 1137 1138 /// Convert a DAG fp condition code to an AArch64 CC. 1139 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that 1140 /// should be AND'ed instead of OR'ed. 1141 static void changeFPCCToANDAArch64CC(ISD::CondCode CC, 1142 AArch64CC::CondCode &CondCode, 1143 AArch64CC::CondCode &CondCode2) { 1144 CondCode2 = AArch64CC::AL; 1145 switch (CC) { 1146 default: 1147 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1148 assert(CondCode2 == AArch64CC::AL); 1149 break; 1150 case ISD::SETONE: 1151 // (a one b) 1152 // == ((a olt b) || (a ogt b)) 1153 // == ((a ord b) && (a une b)) 1154 CondCode = AArch64CC::VC; 1155 CondCode2 = AArch64CC::NE; 1156 break; 1157 case ISD::SETUEQ: 1158 // (a ueq b) 1159 // == ((a uno b) || (a oeq b)) 1160 // == ((a ule b) && (a uge b)) 1161 CondCode = AArch64CC::PL; 1162 CondCode2 = AArch64CC::LE; 1163 break; 1164 } 1165 } 1166 1167 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 1168 /// CC usable with the vector instructions. 
Fewer operations are available 1169 /// without a real NZCV register, so we have to use less efficient combinations 1170 /// to get the same effect. 1171 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, 1172 AArch64CC::CondCode &CondCode, 1173 AArch64CC::CondCode &CondCode2, 1174 bool &Invert) { 1175 Invert = false; 1176 switch (CC) { 1177 default: 1178 // Mostly the scalar mappings work fine. 1179 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1180 break; 1181 case ISD::SETUO: 1182 Invert = true; // Fallthrough 1183 case ISD::SETO: 1184 CondCode = AArch64CC::MI; 1185 CondCode2 = AArch64CC::GE; 1186 break; 1187 case ISD::SETUEQ: 1188 case ISD::SETULT: 1189 case ISD::SETULE: 1190 case ISD::SETUGT: 1191 case ISD::SETUGE: 1192 // All of the compare-mask comparisons are ordered, but we can switch 1193 // between the two by a double inversion. E.g. ULE == !OGT. 1194 Invert = true; 1195 changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); 1196 break; 1197 } 1198 } 1199 1200 static bool isLegalArithImmed(uint64_t C) { 1201 // Matches AArch64DAGToDAGISel::SelectArithImmed(). 1202 return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); 1203 } 1204 1205 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1206 const SDLoc &dl, SelectionDAG &DAG) { 1207 EVT VT = LHS.getValueType(); 1208 1209 if (VT.isFloatingPoint()) { 1210 assert(VT != MVT::f128); 1211 if (VT == MVT::f16) { 1212 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); 1213 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); 1214 VT = MVT::f32; 1215 } 1216 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); 1217 } 1218 1219 // The CMP instruction is just an alias for SUBS, and representing it as 1220 // SUBS means that it's possible to get CSE with subtract operations. 1221 // A later phase can perform the optimization of setting the destination 1222 // register to WZR/XZR if it ends up being unused. 1223 unsigned Opcode = AArch64ISD::SUBS; 1224 1225 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && 1226 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 1227 // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on 1228 // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags 1229 // can be set differently by this operation. It comes down to whether 1230 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then 1231 // everything is fine. If not then the optimization is wrong. Thus general 1232 // comparisons are only valid if op2 != 0. 1233 1234 // So, finally, the only LLVM-native comparisons that don't mention C and V 1235 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in 1236 // the absence of information about op2. 1237 Opcode = AArch64ISD::ADDS; 1238 RHS = RHS.getOperand(1); 1239 } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && 1240 !isUnsignedIntSetCC(CC)) { 1241 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST 1242 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one 1243 // of the signed comparisons. 1244 Opcode = AArch64ISD::ANDS; 1245 RHS = LHS.getOperand(1); 1246 LHS = LHS.getOperand(0); 1247 } 1248 1249 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) 1250 .getValue(1); 1251 } 1252 1253 /// \defgroup AArch64CCMP CMP;CCMP matching 1254 /// 1255 /// These functions deal with the formation of CMP;CCMP;... sequences. 
1256 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1257 /// a comparison. They set the NZCV flags to a predefined value if their
1258 /// predicate is false. This allows expressing arbitrary conjunctions, for
1259 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1260 /// expressed as:
1261 /// cmp A
1262 /// ccmp B, inv(CB), CA
1263 /// check for CB flags
1264 ///
1265 /// In general we can create code for arbitrary "... (and (and A B) C)"
1266 /// sequences. We can also implement some "or" expressions, because "(or A B)"
1267 /// is equivalent to "not (and (not A) (not B))" and we can implement some
1268 /// negation operations:
1269 /// We can negate the results of a single comparison by inverting the flags
1270 /// used when the predicate fails and inverting the flags tested in the next
1271 /// instruction; we can also negate the results of the whole previous
1272 /// conditional compare sequence by inverting the flags tested in the next
1273 /// instruction. However, there is no way to negate the result of a partial
1274 /// sequence.
1275 ///
1276 /// Therefore, on encountering an "or" expression we can negate the subtree on
1277 /// one side and have to be able to push the negate to the leaves of the subtree
1278 /// on the other side (see also the comments in code). As a complete example:
1279 /// "or (or (setCA (cmp A)) (setCB (cmp B)))
1280 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1281 /// is transformed to
1282 /// "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1283 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))"
1284 /// and implemented as:
1285 /// cmp C
1286 /// ccmp D, inv(CD), CC
1287 /// ccmp A, CA, inv(CD)
1288 /// ccmp B, CB, inv(CA)
1289 /// check for CB flags
1290 /// A counterexample is "or (and A B) (and C D)" which cannot be implemented
1291 /// by conditional compare sequences.
1292 /// @{
1293
1294 /// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
1295 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1296 ISD::CondCode CC, SDValue CCOp,
1297 AArch64CC::CondCode Predicate,
1298 AArch64CC::CondCode OutCC,
1299 const SDLoc &DL, SelectionDAG &DAG) {
1300 unsigned Opcode = 0;
1301 if (LHS.getValueType().isFloatingPoint()) {
1302 assert(LHS.getValueType() != MVT::f128);
1303 if (LHS.getValueType() == MVT::f16) {
1304 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1305 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1306 }
1307 Opcode = AArch64ISD::FCCMP;
1308 } else if (RHS.getOpcode() == ISD::SUB) {
1309 SDValue SubOp0 = RHS.getOperand(0);
1310 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1311 // See emitComparison() on why we can only do this for SETEQ and SETNE.
1312 Opcode = AArch64ISD::CCMN;
1313 RHS = RHS.getOperand(1);
1314 }
1315 }
1316 if (Opcode == 0)
1317 Opcode = AArch64ISD::CCMP;
1318
1319 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1320 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1321 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1322 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1323 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1324 }
1325
1326 /// Returns true if @p Val is a tree of AND/OR/SETCC operations.
1327 /// CanNegate is set to true if we can push a negate operation through
1328 /// the tree in a way that we are left with AND operations and negate operations
1329 /// at the leaves only, i.e. "not (or (or x y) z)" can be changed to
1330 /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
1331 /// brought into such a form.
1332 static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
1333 unsigned Depth = 0) {
1334 if (!Val.hasOneUse())
1335 return false;
1336 unsigned Opcode = Val->getOpcode();
1337 if (Opcode == ISD::SETCC) {
1338 if (Val->getOperand(0).getValueType() == MVT::f128)
1339 return false;
1340 CanNegate = true;
1341 return true;
1342 }
1343 // Protect against exponential runtime and stack overflow.
1344 if (Depth > 6)
1345 return false;
1346 if (Opcode == ISD::AND || Opcode == ISD::OR) {
1347 SDValue O0 = Val->getOperand(0);
1348 SDValue O1 = Val->getOperand(1);
1349 bool CanNegateL;
1350 if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
1351 return false;
1352 bool CanNegateR;
1353 if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
1354 return false;
1355
1356 if (Opcode == ISD::OR) {
1357 // For an OR expression we need to be able to negate at least one side or
1358 // we cannot do the transformation at all.
1359 if (!CanNegateL && !CanNegateR)
1360 return false;
1361 // We can however change a (not (or x y)) to (and (not x) (not y)) if we
1362 // can negate the x and y subtrees.
1363 CanNegate = CanNegateL && CanNegateR;
1364 } else {
1365 // If the operands are OR expressions then we finally need to negate their
1366 // outputs; we can only do that for the operand emitted last, by
1367 // negating OutCC, not for both operands.
1368 bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
1369 bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
1370 if (NeedsNegOutL && NeedsNegOutR)
1371 return false;
1372 // We cannot negate an AND operation (it would become an OR).
1373 CanNegate = false;
1374 }
1375 return true;
1376 }
1377 return false;
1378 }
1379
1380 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1381 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
1382 /// Tries to transform the given i1 producing node @p Val to a series of compare
1383 /// and conditional compare operations. @returns an NZCV flags producing node
1384 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1385 /// the transformation was not possible.
1386 /// On recursive invocations @p Negate may be set to true to have negation
1387 /// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
1388 /// for the comparisons in the current subtree; @p Depth limits the search
1389 /// depth to avoid stack overflow.
1390 static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
1391 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1392 AArch64CC::CondCode Predicate) {
1393 // We're at a tree leaf, produce a conditional comparison operation.
1394 unsigned Opcode = Val->getOpcode();
1395 if (Opcode == ISD::SETCC) {
1396 SDValue LHS = Val->getOperand(0);
1397 SDValue RHS = Val->getOperand(1);
1398 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1399 bool isInteger = LHS.getValueType().isInteger();
1400 if (Negate)
1401 CC = getSetCCInverse(CC, isInteger);
1402 SDLoc DL(Val);
1403 // Determine OutCC and handle FP special case.
1404 if (isInteger) {
1405 OutCC = changeIntCCToAArch64CC(CC);
1406 } else {
1407 assert(LHS.getValueType().isFloatingPoint());
1408 AArch64CC::CondCode ExtraCC;
1409 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1410 // Some floating point conditions can't be tested with a single condition
1411 // code.
Construct an additional comparison in this case. 1412 if (ExtraCC != AArch64CC::AL) { 1413 SDValue ExtraCmp; 1414 if (!CCOp.getNode()) 1415 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); 1416 else 1417 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, 1418 ExtraCC, DL, DAG); 1419 CCOp = ExtraCmp; 1420 Predicate = ExtraCC; 1421 } 1422 } 1423 1424 // Produce a normal comparison if we are first in the chain 1425 if (!CCOp) 1426 return emitComparison(LHS, RHS, CC, DL, DAG); 1427 // Otherwise produce a ccmp. 1428 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, 1429 DAG); 1430 } 1431 assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) && 1432 "Valid conjunction/disjunction tree"); 1433 1434 // Check if both sides can be transformed. 1435 SDValue LHS = Val->getOperand(0); 1436 SDValue RHS = Val->getOperand(1); 1437 1438 // In case of an OR we need to negate our operands and the result. 1439 // (A v B) <=> not(not(A) ^ not(B)) 1440 bool NegateOpsAndResult = Opcode == ISD::OR; 1441 // We can negate the results of all previous operations by inverting the 1442 // predicate flags giving us a free negation for one side. The other side 1443 // must be negatable by itself. 1444 if (NegateOpsAndResult) { 1445 // See which side we can negate. 1446 bool CanNegateL; 1447 bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL); 1448 assert(isValidL && "Valid conjunction/disjunction tree"); 1449 (void)isValidL; 1450 1451 #ifndef NDEBUG 1452 bool CanNegateR; 1453 bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR); 1454 assert(isValidR && "Valid conjunction/disjunction tree"); 1455 assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); 1456 #endif 1457 1458 // Order the side which we cannot negate to RHS so we can emit it first. 1459 if (!CanNegateL) 1460 std::swap(LHS, RHS); 1461 } else { 1462 bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; 1463 assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) && 1464 "Valid conjunction/disjunction tree"); 1465 // Order the side where we need to negate the output flags to RHS so it 1466 // gets emitted first. 1467 if (NeedsNegOutL) 1468 std::swap(LHS, RHS); 1469 } 1470 1471 // Emit RHS. If we want to negate the tree we only need to push a negate 1472 // through if we are already in a PushNegate case, otherwise we can negate 1473 // the "flags to test" afterwards. 1474 AArch64CC::CondCode RHSCC; 1475 SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate, 1476 CCOp, Predicate); 1477 if (NegateOpsAndResult && !Negate) 1478 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 1479 // Emit LHS. We may need to negate it. 1480 SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC, 1481 NegateOpsAndResult, CmpR, 1482 RHSCC); 1483 // If we transformed an OR to and AND then we have to negate the result 1484 // (or absorb the Negate parameter). 1485 if (NegateOpsAndResult && !Negate) 1486 OutCC = AArch64CC::getInvertedCondCode(OutCC); 1487 return CmpL; 1488 } 1489 1490 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain 1491 /// of CCMP/CFCMP ops. See @ref AArch64CCMP. 1492 /// \see emitConjunctionDisjunctionTreeRec(). 
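///
/// As an illustrative sketch (not from the original comments; registers and
/// immediates are only indicative), a conjunction such as
///   (and (setcc eq %a, 0) (setcc gt %b, 5))
/// is expected to become roughly:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq
/// where the NZCV immediate #4 forces Z=1 (so a later "gt" test fails)
/// whenever the "eq" predicate does not hold, and OutCC is reported as "gt".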
1493 static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, 1494 AArch64CC::CondCode &OutCC) { 1495 bool CanNegate; 1496 if (!isConjunctionDisjunctionTree(Val, CanNegate)) 1497 return SDValue(); 1498 1499 return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(), 1500 AArch64CC::AL); 1501 } 1502 1503 /// @} 1504 1505 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1506 SDValue &AArch64cc, SelectionDAG &DAG, 1507 const SDLoc &dl) { 1508 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1509 EVT VT = RHS.getValueType(); 1510 uint64_t C = RHSC->getZExtValue(); 1511 if (!isLegalArithImmed(C)) { 1512 // Constant does not fit, try adjusting it by one? 1513 switch (CC) { 1514 default: 1515 break; 1516 case ISD::SETLT: 1517 case ISD::SETGE: 1518 if ((VT == MVT::i32 && C != 0x80000000 && 1519 isLegalArithImmed((uint32_t)(C - 1))) || 1520 (VT == MVT::i64 && C != 0x80000000ULL && 1521 isLegalArithImmed(C - 1ULL))) { 1522 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1523 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1524 RHS = DAG.getConstant(C, dl, VT); 1525 } 1526 break; 1527 case ISD::SETULT: 1528 case ISD::SETUGE: 1529 if ((VT == MVT::i32 && C != 0 && 1530 isLegalArithImmed((uint32_t)(C - 1))) || 1531 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 1532 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1533 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1534 RHS = DAG.getConstant(C, dl, VT); 1535 } 1536 break; 1537 case ISD::SETLE: 1538 case ISD::SETGT: 1539 if ((VT == MVT::i32 && C != INT32_MAX && 1540 isLegalArithImmed((uint32_t)(C + 1))) || 1541 (VT == MVT::i64 && C != INT64_MAX && 1542 isLegalArithImmed(C + 1ULL))) { 1543 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1544 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1545 RHS = DAG.getConstant(C, dl, VT); 1546 } 1547 break; 1548 case ISD::SETULE: 1549 case ISD::SETUGT: 1550 if ((VT == MVT::i32 && C != UINT32_MAX && 1551 isLegalArithImmed((uint32_t)(C + 1))) || 1552 (VT == MVT::i64 && C != UINT64_MAX && 1553 isLegalArithImmed(C + 1ULL))) { 1554 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1555 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1556 RHS = DAG.getConstant(C, dl, VT); 1557 } 1558 break; 1559 } 1560 } 1561 } 1562 SDValue Cmp; 1563 AArch64CC::CondCode AArch64CC; 1564 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 1565 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); 1566 1567 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 1568 // For the i8 operand, the largest immediate is 255, so this can be easily 1569 // encoded in the compare instruction. For the i16 operand, however, the 1570 // largest immediate cannot be encoded in the compare. 1571 // Therefore, use a sign extending load and cmn to avoid materializing the 1572 // -1 constant. For example, 1573 // movz w1, #65535 1574 // ldrh w0, [x0, #0] 1575 // cmp w0, w1 1576 // > 1577 // ldrsh w0, [x0, #0] 1578 // cmn w0, #1 1579 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 1580 // if and only if (sext LHS) == (sext RHS). The checks are in place to 1581 // ensure both the LHS and RHS are truly zero extended and to make sure the 1582 // transformation is profitable. 
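// A concrete instance of that property (illustrative, not from the original
// comment): for a 16-bit value loaded as 0xFFFF and RHS == 0xFFFF, the
// zero-extended operands compare as 65535 == 65535 and the sign-extended
// operands compare as -1 == -1; both agree, but the latter form only needs
// "cmn w0, #1" instead of materializing 65535 first.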
1583 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 1584 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 1585 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 1586 LHS.getNode()->hasNUsesOfValue(1, 0)) { 1587 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 1588 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 1589 SDValue SExt = 1590 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 1591 DAG.getValueType(MVT::i16)); 1592 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 1593 RHS.getValueType()), 1594 CC, dl, DAG); 1595 AArch64CC = changeIntCCToAArch64CC(CC); 1596 } 1597 } 1598 1599 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { 1600 if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { 1601 if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) 1602 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 1603 } 1604 } 1605 } 1606 1607 if (!Cmp) { 1608 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 1609 AArch64CC = changeIntCCToAArch64CC(CC); 1610 } 1611 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 1612 return Cmp; 1613 } 1614 1615 static std::pair<SDValue, SDValue> 1616 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 1617 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 1618 "Unsupported value type"); 1619 SDValue Value, Overflow; 1620 SDLoc DL(Op); 1621 SDValue LHS = Op.getOperand(0); 1622 SDValue RHS = Op.getOperand(1); 1623 unsigned Opc = 0; 1624 switch (Op.getOpcode()) { 1625 default: 1626 llvm_unreachable("Unknown overflow instruction!"); 1627 case ISD::SADDO: 1628 Opc = AArch64ISD::ADDS; 1629 CC = AArch64CC::VS; 1630 break; 1631 case ISD::UADDO: 1632 Opc = AArch64ISD::ADDS; 1633 CC = AArch64CC::HS; 1634 break; 1635 case ISD::SSUBO: 1636 Opc = AArch64ISD::SUBS; 1637 CC = AArch64CC::VS; 1638 break; 1639 case ISD::USUBO: 1640 Opc = AArch64ISD::SUBS; 1641 CC = AArch64CC::LO; 1642 break; 1643 // Multiply needs a little bit extra work. 1644 case ISD::SMULO: 1645 case ISD::UMULO: { 1646 CC = AArch64CC::NE; 1647 bool IsSigned = Op.getOpcode() == ISD::SMULO; 1648 if (Op.getValueType() == MVT::i32) { 1649 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1650 // For a 32 bit multiply with overflow check we want the instruction 1651 // selector to generate a widening multiply (SMADDL/UMADDL). For that we 1652 // need to generate the following pattern: 1653 // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) 1654 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 1655 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 1656 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1657 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, 1658 DAG.getConstant(0, DL, MVT::i64)); 1659 // On AArch64 the upper 32 bits are always zero extended for a 32 bit 1660 // operation. We need to clear out the upper 32 bits, because we used a 1661 // widening multiply that wrote all 64 bits. In the end this should be a 1662 // noop. 1663 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); 1664 if (IsSigned) { 1665 // The signed overflow check requires more than just a simple check for 1666 // any bit set in the upper 32 bits of the result. These bits could be 1667 // just the sign bits of a negative number. To perform the overflow 1668 // check we have to arithmetic shift right the 32nd bit of the result by 1669 // 31 bits. Then we compare the result to the upper 32 bits. 
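// Worked example (illustrative): for %a = 0x40000000 and %b = 4 the widened
// product is 0x100000000; its upper 32 bits are 1, while the low 32 bits (0)
// shifted right arithmetically by 31 give 0, so the SUBS below sets NE and
// overflow is reported. For %a = -2, %b = 3 the product is
// 0xFFFFFFFFFFFFFFFA; the upper word (all ones) matches the sign-replicated
// low word, so no overflow is flagged.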
1670 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, 1671 DAG.getConstant(32, DL, MVT::i64)); 1672 UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); 1673 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, 1674 DAG.getConstant(31, DL, MVT::i64)); 1675 // It is important that LowerBits is last, otherwise the arithmetic 1676 // shift will not be folded into the compare (SUBS). 1677 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); 1678 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1679 .getValue(1); 1680 } else { 1681 // The overflow check for unsigned multiply is easy. We only need to 1682 // check if any of the upper 32 bits are set. This can be done with a 1683 // CMP (shifted register). For that we need to generate the following 1684 // pattern: 1685 // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) 1686 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, 1687 DAG.getConstant(32, DL, MVT::i64)); 1688 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1689 Overflow = 1690 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 1691 DAG.getConstant(0, DL, MVT::i64), 1692 UpperBits).getValue(1); 1693 } 1694 break; 1695 } 1696 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 1697 // For the 64 bit multiply 1698 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1699 if (IsSigned) { 1700 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 1701 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 1702 DAG.getConstant(63, DL, MVT::i64)); 1703 // It is important that LowerBits is last, otherwise the arithmetic 1704 // shift will not be folded into the compare (SUBS). 1705 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1706 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1707 .getValue(1); 1708 } else { 1709 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 1710 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1711 Overflow = 1712 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 1713 DAG.getConstant(0, DL, MVT::i64), 1714 UpperBits).getValue(1); 1715 } 1716 break; 1717 } 1718 } // switch (...) 1719 1720 if (Opc) { 1721 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 1722 1723 // Emit the AArch64 operation with overflow check. 1724 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 1725 Overflow = Value.getValue(1); 1726 } 1727 return std::make_pair(Value, Overflow); 1728 } 1729 1730 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, 1731 RTLIB::Libcall Call) const { 1732 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 1733 return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; 1734 } 1735 1736 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { 1737 SDValue Sel = Op.getOperand(0); 1738 SDValue Other = Op.getOperand(1); 1739 1740 // If neither operand is a SELECT_CC, give up. 1741 if (Sel.getOpcode() != ISD::SELECT_CC) 1742 std::swap(Sel, Other); 1743 if (Sel.getOpcode() != ISD::SELECT_CC) 1744 return Op; 1745 1746 // The folding we want to perform is: 1747 // (xor x, (select_cc a, b, cc, 0, -1) ) 1748 // --> 1749 // (csel x, (xor x, -1), cc ...) 1750 // 1751 // The latter will get matched to a CSINV instruction. 
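// Illustrative example (assumed codegen; registers are only indicative):
//   %m = select_cc %a, %b, 0, -1, eq
//   %r = xor %x, %m
// is expected to become roughly:
//   cmp   w0, w1
//   csinv w8, w2, w2, eq
// i.e. %r is %x when %a == %b and ~%x otherwise.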
1752 1753 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 1754 SDValue LHS = Sel.getOperand(0); 1755 SDValue RHS = Sel.getOperand(1); 1756 SDValue TVal = Sel.getOperand(2); 1757 SDValue FVal = Sel.getOperand(3); 1758 SDLoc dl(Sel); 1759 1760 // FIXME: This could be generalized to non-integer comparisons. 1761 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 1762 return Op; 1763 1764 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 1765 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 1766 1767 // The values aren't constants, this isn't the pattern we're looking for. 1768 if (!CFVal || !CTVal) 1769 return Op; 1770 1771 // We can commute the SELECT_CC by inverting the condition. This 1772 // might be needed to make this fit into a CSINV pattern. 1773 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { 1774 std::swap(TVal, FVal); 1775 std::swap(CTVal, CFVal); 1776 CC = ISD::getSetCCInverse(CC, true); 1777 } 1778 1779 // If the constants line up, perform the transform! 1780 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { 1781 SDValue CCVal; 1782 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 1783 1784 FVal = Other; 1785 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 1786 DAG.getConstant(-1ULL, dl, Other.getValueType())); 1787 1788 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 1789 CCVal, Cmp); 1790 } 1791 1792 return Op; 1793 } 1794 1795 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 1796 EVT VT = Op.getValueType(); 1797 1798 // Let legalize expand this if it isn't a legal type yet. 1799 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 1800 return SDValue(); 1801 1802 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 1803 1804 unsigned Opc; 1805 bool ExtraOp = false; 1806 switch (Op.getOpcode()) { 1807 default: 1808 llvm_unreachable("Invalid code"); 1809 case ISD::ADDC: 1810 Opc = AArch64ISD::ADDS; 1811 break; 1812 case ISD::SUBC: 1813 Opc = AArch64ISD::SUBS; 1814 break; 1815 case ISD::ADDE: 1816 Opc = AArch64ISD::ADCS; 1817 ExtraOp = true; 1818 break; 1819 case ISD::SUBE: 1820 Opc = AArch64ISD::SBCS; 1821 ExtraOp = true; 1822 break; 1823 } 1824 1825 if (!ExtraOp) 1826 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); 1827 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), 1828 Op.getOperand(2)); 1829 } 1830 1831 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 1832 // Let legalize expand this if it isn't a legal type yet. 1833 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 1834 return SDValue(); 1835 1836 SDLoc dl(Op); 1837 AArch64CC::CondCode CC; 1838 // The actual operation that sets the overflow or carry flag. 1839 SDValue Value, Overflow; 1840 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 1841 1842 // We use 0 and 1 as false and true values. 1843 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 1844 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 1845 1846 // We use an inverted condition, because the conditional select is inverted 1847 // too. This will allow it to be selected to a single instruction: 1848 // CSINC Wd, WZR, WZR, invert(cond). 
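// Illustrative example (assumed codegen): lowering
//   {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
// is expected to produce roughly
//   adds w0, w0, w1
//   cset w1, hs
// where "cset w1, hs" is the CSINC form mentioned above and HS tests the
// carry flag produced by ADDS.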
1849 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 1850 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 1851 CCVal, Overflow); 1852 1853 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 1854 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 1855 } 1856 1857 // Prefetch operands are: 1858 // 1: Address to prefetch 1859 // 2: bool isWrite 1860 // 3: int locality (0 = no locality ... 3 = extreme locality) 1861 // 4: bool isDataCache 1862 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 1863 SDLoc DL(Op); 1864 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 1865 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 1866 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 1867 1868 bool IsStream = !Locality; 1869 // When the locality number is set 1870 if (Locality) { 1871 // The front-end should have filtered out the out-of-range values 1872 assert(Locality <= 3 && "Prefetch locality out-of-range"); 1873 // The locality degree is the opposite of the cache speed. 1874 // Put the number the other way around. 1875 // The encoding starts at 0 for level 1 1876 Locality = 3 - Locality; 1877 } 1878 1879 // built the mask value encoding the expected behavior. 1880 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 1881 (!IsData << 3) | // IsDataCache bit 1882 (Locality << 1) | // Cache level bits 1883 (unsigned)IsStream; // Stream bit 1884 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 1885 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); 1886 } 1887 1888 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 1889 SelectionDAG &DAG) const { 1890 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1891 1892 RTLIB::Libcall LC; 1893 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1894 1895 return LowerF128Call(Op, DAG, LC); 1896 } 1897 1898 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 1899 SelectionDAG &DAG) const { 1900 if (Op.getOperand(0).getValueType() != MVT::f128) { 1901 // It's legal except when f128 is involved 1902 return Op; 1903 } 1904 1905 RTLIB::Libcall LC; 1906 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1907 1908 // FP_ROUND node has a second operand indicating whether it is known to be 1909 // precise. That doesn't take part in the LibCall so we can't directly use 1910 // LowerF128Call. 1911 SDValue SrcVal = Op.getOperand(0); 1912 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 1913 SDLoc(Op)).first; 1914 } 1915 1916 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 1917 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1918 // Any additional optimization in this function should be recorded 1919 // in the cost tables. 1920 EVT InVT = Op.getOperand(0).getValueType(); 1921 EVT VT = Op.getValueType(); 1922 unsigned NumElts = InVT.getVectorNumElements(); 1923 1924 // f16 vectors are promoted to f32 before a conversion. 
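// For instance (illustrative), (v4i16 (fp_to_sint (v4f16 %x))) is rewritten
// here as (v4i16 (fp_to_sint (v4f32 (fp_extend %x)))), so the conversion
// itself only ever sees f32 lanes.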
1925 if (InVT.getVectorElementType() == MVT::f16) { 1926 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); 1927 SDLoc dl(Op); 1928 return DAG.getNode( 1929 Op.getOpcode(), dl, Op.getValueType(), 1930 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); 1931 } 1932 1933 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1934 SDLoc dl(Op); 1935 SDValue Cv = 1936 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 1937 Op.getOperand(0)); 1938 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 1939 } 1940 1941 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1942 SDLoc dl(Op); 1943 MVT ExtVT = 1944 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 1945 VT.getVectorNumElements()); 1946 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 1947 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 1948 } 1949 1950 // Type changing conversions are illegal. 1951 return Op; 1952 } 1953 1954 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 1955 SelectionDAG &DAG) const { 1956 if (Op.getOperand(0).getValueType().isVector()) 1957 return LowerVectorFP_TO_INT(Op, DAG); 1958 1959 // f16 conversions are promoted to f32. 1960 if (Op.getOperand(0).getValueType() == MVT::f16) { 1961 SDLoc dl(Op); 1962 return DAG.getNode( 1963 Op.getOpcode(), dl, Op.getValueType(), 1964 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); 1965 } 1966 1967 if (Op.getOperand(0).getValueType() != MVT::f128) { 1968 // It's legal except when f128 is involved 1969 return Op; 1970 } 1971 1972 RTLIB::Libcall LC; 1973 if (Op.getOpcode() == ISD::FP_TO_SINT) 1974 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1975 else 1976 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1977 1978 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 1979 return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; 1980 } 1981 1982 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 1983 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1984 // Any additional optimization in this function should be recorded 1985 // in the cost tables. 1986 EVT VT = Op.getValueType(); 1987 SDLoc dl(Op); 1988 SDValue In = Op.getOperand(0); 1989 EVT InVT = In.getValueType(); 1990 1991 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1992 MVT CastVT = 1993 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 1994 InVT.getVectorNumElements()); 1995 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); 1996 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); 1997 } 1998 1999 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 2000 unsigned CastOpc = 2001 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2002 EVT CastVT = VT.changeVectorElementTypeToInteger(); 2003 In = DAG.getNode(CastOpc, dl, CastVT, In); 2004 return DAG.getNode(Op.getOpcode(), dl, VT, In); 2005 } 2006 2007 return Op; 2008 } 2009 2010 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 2011 SelectionDAG &DAG) const { 2012 if (Op.getValueType().isVector()) 2013 return LowerVectorINT_TO_FP(Op, DAG); 2014 2015 // f16 conversions are promoted to f32. 2016 if (Op.getValueType() == MVT::f16) { 2017 SDLoc dl(Op); 2018 return DAG.getNode( 2019 ISD::FP_ROUND, dl, MVT::f16, 2020 DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), 2021 DAG.getIntPtrConstant(0, dl)); 2022 } 2023 2024 // i128 conversions are libcalls. 
2025 if (Op.getOperand(0).getValueType() == MVT::i128) 2026 return SDValue(); 2027 2028 // Other conversions are legal, unless it's to the completely software-based 2029 // fp128. 2030 if (Op.getValueType() != MVT::f128) 2031 return Op; 2032 2033 RTLIB::Libcall LC; 2034 if (Op.getOpcode() == ISD::SINT_TO_FP) 2035 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2036 else 2037 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2038 2039 return LowerF128Call(Op, DAG, LC); 2040 } 2041 2042 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 2043 SelectionDAG &DAG) const { 2044 // For iOS, we want to call an alternative entry point: __sincos_stret, 2045 // which returns the values in two S / D registers. 2046 SDLoc dl(Op); 2047 SDValue Arg = Op.getOperand(0); 2048 EVT ArgVT = Arg.getValueType(); 2049 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 2050 2051 ArgListTy Args; 2052 ArgListEntry Entry; 2053 2054 Entry.Node = Arg; 2055 Entry.Ty = ArgTy; 2056 Entry.isSExt = false; 2057 Entry.isZExt = false; 2058 Args.push_back(Entry); 2059 2060 const char *LibcallName = 2061 (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; 2062 SDValue Callee = 2063 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); 2064 2065 StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 2066 TargetLowering::CallLoweringInfo CLI(DAG); 2067 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 2068 .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); 2069 2070 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2071 return CallResult.first; 2072 } 2073 2074 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 2075 if (Op.getValueType() != MVT::f16) 2076 return SDValue(); 2077 2078 assert(Op.getOperand(0).getValueType() == MVT::i16); 2079 SDLoc DL(Op); 2080 2081 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 2082 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 2083 return SDValue( 2084 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, 2085 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 2086 0); 2087 } 2088 2089 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 2090 if (OrigVT.getSizeInBits() >= 64) 2091 return OrigVT; 2092 2093 assert(OrigVT.isSimple() && "Expecting a simple value type"); 2094 2095 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 2096 switch (OrigSimpleTy) { 2097 default: llvm_unreachable("Unexpected Vector Type"); 2098 case MVT::v2i8: 2099 case MVT::v2i16: 2100 return MVT::v2i32; 2101 case MVT::v4i8: 2102 return MVT::v4i16; 2103 } 2104 } 2105 2106 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 2107 const EVT &OrigTy, 2108 const EVT &ExtTy, 2109 unsigned ExtOpcode) { 2110 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 2111 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 2112 // 64-bits we need to insert a new extension so that it will be 64-bits. 2113 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 2114 if (OrigTy.getSizeInBits() >= 64) 2115 return N; 2116 2117 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
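// Example (illustrative): a v4i8 operand that the input DAG extended straight
// to v4i32 gets an intermediate extension to v4i16 here, so it matches the
// 64-bit operand shape expected by the (v4i32 (s/umull v4i16, v4i16)) pattern.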
2118 EVT NewVT = getExtensionTo64Bits(OrigTy); 2119 2120 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 2121 } 2122 2123 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 2124 bool isSigned) { 2125 EVT VT = N->getValueType(0); 2126 2127 if (N->getOpcode() != ISD::BUILD_VECTOR) 2128 return false; 2129 2130 for (const SDValue &Elt : N->op_values()) { 2131 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 2132 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 2133 unsigned HalfSize = EltSize / 2; 2134 if (isSigned) { 2135 if (!isIntN(HalfSize, C->getSExtValue())) 2136 return false; 2137 } else { 2138 if (!isUIntN(HalfSize, C->getZExtValue())) 2139 return false; 2140 } 2141 continue; 2142 } 2143 return false; 2144 } 2145 2146 return true; 2147 } 2148 2149 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 2150 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 2151 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 2152 N->getOperand(0)->getValueType(0), 2153 N->getValueType(0), 2154 N->getOpcode()); 2155 2156 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 2157 EVT VT = N->getValueType(0); 2158 SDLoc dl(N); 2159 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 2160 unsigned NumElts = VT.getVectorNumElements(); 2161 MVT TruncVT = MVT::getIntegerVT(EltSize); 2162 SmallVector<SDValue, 8> Ops; 2163 for (unsigned i = 0; i != NumElts; ++i) { 2164 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 2165 const APInt &CInt = C->getAPIntValue(); 2166 // Element types smaller than 32 bits are not legal, so use i32 elements. 2167 // The values are implicitly truncated so sext vs. zext doesn't matter. 2168 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 2169 } 2170 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 2171 } 2172 2173 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 2174 if (N->getOpcode() == ISD::SIGN_EXTEND) 2175 return true; 2176 if (isExtendedBUILD_VECTOR(N, DAG, true)) 2177 return true; 2178 return false; 2179 } 2180 2181 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 2182 if (N->getOpcode() == ISD::ZERO_EXTEND) 2183 return true; 2184 if (isExtendedBUILD_VECTOR(N, DAG, false)) 2185 return true; 2186 return false; 2187 } 2188 2189 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 2190 unsigned Opcode = N->getOpcode(); 2191 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2192 SDNode *N0 = N->getOperand(0).getNode(); 2193 SDNode *N1 = N->getOperand(1).getNode(); 2194 return N0->hasOneUse() && N1->hasOneUse() && 2195 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 2196 } 2197 return false; 2198 } 2199 2200 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 2201 unsigned Opcode = N->getOpcode(); 2202 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2203 SDNode *N0 = N->getOperand(0).getNode(); 2204 SDNode *N1 = N->getOperand(1).getNode(); 2205 return N0->hasOneUse() && N1->hasOneUse() && 2206 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 2207 } 2208 return false; 2209 } 2210 2211 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 2212 // Multiplications are only custom-lowered for 128-bit vectors so that 2213 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
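// Illustrative example: (v2i64 (mul (sext v2i32 %a), (sext v2i32 %b))) is
// turned into AArch64ISD::SMULL on the two v2i32 operands, which should
// select to a single instruction along the lines of
//   smull v0.2d, v1.2s, v2.2s
// whereas a plain v2i64 multiply would otherwise have to be expanded.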
2214 EVT VT = Op.getValueType(); 2215 assert(VT.is128BitVector() && VT.isInteger() && 2216 "unexpected type for custom-lowering ISD::MUL"); 2217 SDNode *N0 = Op.getOperand(0).getNode(); 2218 SDNode *N1 = Op.getOperand(1).getNode(); 2219 unsigned NewOpc = 0; 2220 bool isMLA = false; 2221 bool isN0SExt = isSignExtended(N0, DAG); 2222 bool isN1SExt = isSignExtended(N1, DAG); 2223 if (isN0SExt && isN1SExt) 2224 NewOpc = AArch64ISD::SMULL; 2225 else { 2226 bool isN0ZExt = isZeroExtended(N0, DAG); 2227 bool isN1ZExt = isZeroExtended(N1, DAG); 2228 if (isN0ZExt && isN1ZExt) 2229 NewOpc = AArch64ISD::UMULL; 2230 else if (isN1SExt || isN1ZExt) { 2231 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 2232 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 2233 if (isN1SExt && isAddSubSExt(N0, DAG)) { 2234 NewOpc = AArch64ISD::SMULL; 2235 isMLA = true; 2236 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 2237 NewOpc = AArch64ISD::UMULL; 2238 isMLA = true; 2239 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 2240 std::swap(N0, N1); 2241 NewOpc = AArch64ISD::UMULL; 2242 isMLA = true; 2243 } 2244 } 2245 2246 if (!NewOpc) { 2247 if (VT == MVT::v2i64) 2248 // Fall through to expand this. It is not legal. 2249 return SDValue(); 2250 else 2251 // Other vector multiplications are legal. 2252 return Op; 2253 } 2254 } 2255 2256 // Legalize to a S/UMULL instruction 2257 SDLoc DL(Op); 2258 SDValue Op0; 2259 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 2260 if (!isMLA) { 2261 Op0 = skipExtensionForVectorMULL(N0, DAG); 2262 assert(Op0.getValueType().is64BitVector() && 2263 Op1.getValueType().is64BitVector() && 2264 "unexpected types for extended operands to VMULL"); 2265 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 2266 } 2267 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 2268 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 2269 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 2270 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 2271 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 2272 EVT Op1VT = Op1.getValueType(); 2273 return DAG.getNode(N0->getOpcode(), DL, VT, 2274 DAG.getNode(NewOpc, DL, VT, 2275 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 2276 DAG.getNode(NewOpc, DL, VT, 2277 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 2278 } 2279 2280 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 2281 SelectionDAG &DAG) const { 2282 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2283 SDLoc dl(Op); 2284 switch (IntNo) { 2285 default: return SDValue(); // Don't custom lower most intrinsics. 
2286 case Intrinsic::thread_pointer: { 2287 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2288 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); 2289 } 2290 case Intrinsic::aarch64_neon_smax: 2291 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), 2292 Op.getOperand(1), Op.getOperand(2)); 2293 case Intrinsic::aarch64_neon_umax: 2294 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), 2295 Op.getOperand(1), Op.getOperand(2)); 2296 case Intrinsic::aarch64_neon_smin: 2297 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), 2298 Op.getOperand(1), Op.getOperand(2)); 2299 case Intrinsic::aarch64_neon_umin: 2300 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), 2301 Op.getOperand(1), Op.getOperand(2)); 2302 } 2303 } 2304 2305 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 2306 SelectionDAG &DAG) const { 2307 switch (Op.getOpcode()) { 2308 default: 2309 llvm_unreachable("unimplemented operand"); 2310 return SDValue(); 2311 case ISD::BITCAST: 2312 return LowerBITCAST(Op, DAG); 2313 case ISD::GlobalAddress: 2314 return LowerGlobalAddress(Op, DAG); 2315 case ISD::GlobalTLSAddress: 2316 return LowerGlobalTLSAddress(Op, DAG); 2317 case ISD::SETCC: 2318 return LowerSETCC(Op, DAG); 2319 case ISD::BR_CC: 2320 return LowerBR_CC(Op, DAG); 2321 case ISD::SELECT: 2322 return LowerSELECT(Op, DAG); 2323 case ISD::SELECT_CC: 2324 return LowerSELECT_CC(Op, DAG); 2325 case ISD::JumpTable: 2326 return LowerJumpTable(Op, DAG); 2327 case ISD::ConstantPool: 2328 return LowerConstantPool(Op, DAG); 2329 case ISD::BlockAddress: 2330 return LowerBlockAddress(Op, DAG); 2331 case ISD::VASTART: 2332 return LowerVASTART(Op, DAG); 2333 case ISD::VACOPY: 2334 return LowerVACOPY(Op, DAG); 2335 case ISD::VAARG: 2336 return LowerVAARG(Op, DAG); 2337 case ISD::ADDC: 2338 case ISD::ADDE: 2339 case ISD::SUBC: 2340 case ISD::SUBE: 2341 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 2342 case ISD::SADDO: 2343 case ISD::UADDO: 2344 case ISD::SSUBO: 2345 case ISD::USUBO: 2346 case ISD::SMULO: 2347 case ISD::UMULO: 2348 return LowerXALUO(Op, DAG); 2349 case ISD::FADD: 2350 return LowerF128Call(Op, DAG, RTLIB::ADD_F128); 2351 case ISD::FSUB: 2352 return LowerF128Call(Op, DAG, RTLIB::SUB_F128); 2353 case ISD::FMUL: 2354 return LowerF128Call(Op, DAG, RTLIB::MUL_F128); 2355 case ISD::FDIV: 2356 return LowerF128Call(Op, DAG, RTLIB::DIV_F128); 2357 case ISD::FP_ROUND: 2358 return LowerFP_ROUND(Op, DAG); 2359 case ISD::FP_EXTEND: 2360 return LowerFP_EXTEND(Op, DAG); 2361 case ISD::FRAMEADDR: 2362 return LowerFRAMEADDR(Op, DAG); 2363 case ISD::RETURNADDR: 2364 return LowerRETURNADDR(Op, DAG); 2365 case ISD::INSERT_VECTOR_ELT: 2366 return LowerINSERT_VECTOR_ELT(Op, DAG); 2367 case ISD::EXTRACT_VECTOR_ELT: 2368 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2369 case ISD::BUILD_VECTOR: 2370 return LowerBUILD_VECTOR(Op, DAG); 2371 case ISD::VECTOR_SHUFFLE: 2372 return LowerVECTOR_SHUFFLE(Op, DAG); 2373 case ISD::EXTRACT_SUBVECTOR: 2374 return LowerEXTRACT_SUBVECTOR(Op, DAG); 2375 case ISD::SRA: 2376 case ISD::SRL: 2377 case ISD::SHL: 2378 return LowerVectorSRA_SRL_SHL(Op, DAG); 2379 case ISD::SHL_PARTS: 2380 return LowerShiftLeftParts(Op, DAG); 2381 case ISD::SRL_PARTS: 2382 case ISD::SRA_PARTS: 2383 return LowerShiftRightParts(Op, DAG); 2384 case ISD::CTPOP: 2385 return LowerCTPOP(Op, DAG); 2386 case ISD::FCOPYSIGN: 2387 return LowerFCOPYSIGN(Op, DAG); 2388 case ISD::AND: 2389 return LowerVectorAND(Op, DAG); 2390 case ISD::OR: 2391 return LowerVectorOR(Op, DAG); 2392 case ISD::XOR: 2393 return LowerXOR(Op, DAG); 2394 case 
ISD::PREFETCH: 2395 return LowerPREFETCH(Op, DAG); 2396 case ISD::SINT_TO_FP: 2397 case ISD::UINT_TO_FP: 2398 return LowerINT_TO_FP(Op, DAG); 2399 case ISD::FP_TO_SINT: 2400 case ISD::FP_TO_UINT: 2401 return LowerFP_TO_INT(Op, DAG); 2402 case ISD::FSINCOS: 2403 return LowerFSINCOS(Op, DAG); 2404 case ISD::MUL: 2405 return LowerMUL(Op, DAG); 2406 case ISD::INTRINSIC_WO_CHAIN: 2407 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 2408 } 2409 } 2410 2411 //===----------------------------------------------------------------------===// 2412 // Calling Convention Implementation 2413 //===----------------------------------------------------------------------===// 2414 2415 #include "AArch64GenCallingConv.inc" 2416 2417 /// Selects the correct CCAssignFn for a given CallingConvention value. 2418 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 2419 bool IsVarArg) const { 2420 switch (CC) { 2421 default: 2422 llvm_unreachable("Unsupported calling convention."); 2423 case CallingConv::WebKit_JS: 2424 return CC_AArch64_WebKit_JS; 2425 case CallingConv::GHC: 2426 return CC_AArch64_GHC; 2427 case CallingConv::C: 2428 case CallingConv::Fast: 2429 case CallingConv::PreserveMost: 2430 case CallingConv::CXX_FAST_TLS: 2431 if (!Subtarget->isTargetDarwin()) 2432 return CC_AArch64_AAPCS; 2433 return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; 2434 } 2435 } 2436 2437 SDValue AArch64TargetLowering::LowerFormalArguments( 2438 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2439 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 2440 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2441 MachineFunction &MF = DAG.getMachineFunction(); 2442 MachineFrameInfo *MFI = MF.getFrameInfo(); 2443 2444 // Assign locations to all of the incoming arguments. 2445 SmallVector<CCValAssign, 16> ArgLocs; 2446 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2447 *DAG.getContext()); 2448 2449 // At this point, Ins[].VT may already be promoted to i32. To correctly 2450 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2451 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2452 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 2453 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 2454 // LocVT. 2455 unsigned NumArgs = Ins.size(); 2456 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 2457 unsigned CurArgIdx = 0; 2458 for (unsigned i = 0; i != NumArgs; ++i) { 2459 MVT ValVT = Ins[i].VT; 2460 if (Ins[i].isOrigArg()) { 2461 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); 2462 CurArgIdx = Ins[i].getOrigArgIndex(); 2463 2464 // Get type of the original argument. 2465 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), 2466 /*AllowUnknown*/ true); 2467 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; 2468 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
2469 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 2470 ValVT = MVT::i8; 2471 else if (ActualMVT == MVT::i16) 2472 ValVT = MVT::i16; 2473 } 2474 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2475 bool Res = 2476 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 2477 assert(!Res && "Call operand has unhandled type"); 2478 (void)Res; 2479 } 2480 assert(ArgLocs.size() == Ins.size()); 2481 SmallVector<SDValue, 16> ArgValues; 2482 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2483 CCValAssign &VA = ArgLocs[i]; 2484 2485 if (Ins[i].Flags.isByVal()) { 2486 // Byval is used for HFAs in the PCS, but the system should work in a 2487 // non-compliant manner for larger structs. 2488 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2489 int Size = Ins[i].Flags.getByValSize(); 2490 unsigned NumRegs = (Size + 7) / 8; 2491 2492 // FIXME: This works on big-endian for composite byvals, which are the common 2493 // case. It should also work for fundamental types too. 2494 unsigned FrameIdx = 2495 MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 2496 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); 2497 InVals.push_back(FrameIdxN); 2498 2499 continue; 2500 } 2501 2502 if (VA.isRegLoc()) { 2503 // Arguments stored in registers. 2504 EVT RegVT = VA.getLocVT(); 2505 2506 SDValue ArgValue; 2507 const TargetRegisterClass *RC; 2508 2509 if (RegVT == MVT::i32) 2510 RC = &AArch64::GPR32RegClass; 2511 else if (RegVT == MVT::i64) 2512 RC = &AArch64::GPR64RegClass; 2513 else if (RegVT == MVT::f16) 2514 RC = &AArch64::FPR16RegClass; 2515 else if (RegVT == MVT::f32) 2516 RC = &AArch64::FPR32RegClass; 2517 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 2518 RC = &AArch64::FPR64RegClass; 2519 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 2520 RC = &AArch64::FPR128RegClass; 2521 else 2522 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 2523 2524 // Transform the arguments in physical registers into virtual ones. 2525 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2526 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 2527 2528 // If this is an 8, 16 or 32-bit value, it is really passed promoted 2529 // to 64 bits. Insert an assert[sz]ext to capture this, then 2530 // truncate to the right size. 2531 switch (VA.getLocInfo()) { 2532 default: 2533 llvm_unreachable("Unknown loc info!"); 2534 case CCValAssign::Full: 2535 break; 2536 case CCValAssign::BCvt: 2537 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 2538 break; 2539 case CCValAssign::AExt: 2540 case CCValAssign::SExt: 2541 case CCValAssign::ZExt: 2542 // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt 2543 // nodes after our lowering. 2544 assert(RegVT == Ins[i].VT && "incorrect register location selected"); 2545 break; 2546 } 2547 2548 InVals.push_back(ArgValue); 2549 2550 } else { // VA.isRegLoc() 2551 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 2552 unsigned ArgOffset = VA.getLocMemOffset(); 2553 unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; 2554 2555 uint32_t BEAlign = 0; 2556 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 2557 !Ins[i].Flags.isInConsecutiveRegs()) 2558 BEAlign = 8 - ArgSize; 2559 2560 int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 2561 2562 // Create load nodes to retrieve arguments from the stack. 
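// Example (illustrative): on a big-endian target an i32 passed on the stack
// lives in the high half of its 8-byte slot, so ArgSize == 4 gives
// BEAlign == 4 above and the fixed object (and the load below) is placed at
// ArgOffset + 4, i.e. at the bytes that actually hold the value.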
2563 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 2564 SDValue ArgValue; 2565 2566 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 2567 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 2568 MVT MemVT = VA.getValVT(); 2569 2570 switch (VA.getLocInfo()) { 2571 default: 2572 break; 2573 case CCValAssign::BCvt: 2574 MemVT = VA.getLocVT(); 2575 break; 2576 case CCValAssign::SExt: 2577 ExtType = ISD::SEXTLOAD; 2578 break; 2579 case CCValAssign::ZExt: 2580 ExtType = ISD::ZEXTLOAD; 2581 break; 2582 case CCValAssign::AExt: 2583 ExtType = ISD::EXTLOAD; 2584 break; 2585 } 2586 2587 ArgValue = DAG.getExtLoad( 2588 ExtType, DL, VA.getLocVT(), Chain, FIN, 2589 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 2590 MemVT, false, false, false, 0); 2591 2592 InVals.push_back(ArgValue); 2593 } 2594 } 2595 2596 // varargs 2597 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2598 if (isVarArg) { 2599 if (!Subtarget->isTargetDarwin()) { 2600 // The AAPCS variadic function ABI is identical to the non-variadic 2601 // one. As a result there may be more arguments in registers and we should 2602 // save them for future reference. 2603 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 2604 } 2605 2606 // This will point to the next argument passed via stack. 2607 unsigned StackOffset = CCInfo.getNextStackOffset(); 2608 // We currently pass all varargs at 8-byte alignment. 2609 StackOffset = ((StackOffset + 7) & ~7); 2610 FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); 2611 } 2612 2613 unsigned StackArgSize = CCInfo.getNextStackOffset(); 2614 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2615 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 2616 // This is a non-standard ABI so by fiat I say we're allowed to make full 2617 // use of the stack area to be popped, which must be aligned to 16 bytes in 2618 // any case: 2619 StackArgSize = alignTo(StackArgSize, 16); 2620 2621 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 2622 // a multiple of 16. 2623 FuncInfo->setArgumentStackToRestore(StackArgSize); 2624 2625 // This realignment carries over to the available bytes below. Our own 2626 // callers will guarantee the space is free by giving an aligned value to 2627 // CALLSEQ_START. 2628 } 2629 // Even if we're not expected to free up the space, it's useful to know how 2630 // much is there while considering tail calls (because we can reuse it). 
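// Worked example (illustrative): with GuaranteedTailCallOpt set, a fastcc
// function whose incoming stack arguments occupy 20 bytes rounds StackArgSize
// up to 32; both ArgumentStackToRestore and BytesInStackArgArea then record
// 32, and a later tail call may reuse that whole area for its own outgoing
// arguments.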
2631 FuncInfo->setBytesInStackArgArea(StackArgSize); 2632 2633 return Chain; 2634 } 2635 2636 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 2637 SelectionDAG &DAG, 2638 const SDLoc &DL, 2639 SDValue &Chain) const { 2640 MachineFunction &MF = DAG.getMachineFunction(); 2641 MachineFrameInfo *MFI = MF.getFrameInfo(); 2642 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2643 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2644 2645 SmallVector<SDValue, 8> MemOps; 2646 2647 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 2648 AArch64::X3, AArch64::X4, AArch64::X5, 2649 AArch64::X6, AArch64::X7 }; 2650 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); 2651 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); 2652 2653 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 2654 int GPRIdx = 0; 2655 if (GPRSaveSize != 0) { 2656 GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); 2657 2658 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); 2659 2660 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 2661 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 2662 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 2663 SDValue Store = DAG.getStore( 2664 Val.getValue(1), DL, Val, FIN, 2665 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, 2666 false, 0); 2667 MemOps.push_back(Store); 2668 FIN = 2669 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); 2670 } 2671 } 2672 FuncInfo->setVarArgsGPRIndex(GPRIdx); 2673 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 2674 2675 if (Subtarget->hasFPARMv8()) { 2676 static const MCPhysReg FPRArgRegs[] = { 2677 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 2678 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 2679 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); 2680 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); 2681 2682 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 2683 int FPRIdx = 0; 2684 if (FPRSaveSize != 0) { 2685 FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); 2686 2687 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); 2688 2689 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 2690 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 2691 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 2692 2693 SDValue Store = DAG.getStore( 2694 Val.getValue(1), DL, Val, FIN, 2695 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), 2696 false, false, 0); 2697 MemOps.push_back(Store); 2698 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, 2699 DAG.getConstant(16, DL, PtrVT)); 2700 } 2701 } 2702 FuncInfo->setVarArgsFPRIndex(FPRIdx); 2703 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 2704 } 2705 2706 if (!MemOps.empty()) { 2707 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 2708 } 2709 } 2710 2711 /// LowerCallResult - Lower the result values of a call into the 2712 /// appropriate copies out of appropriate physical registers. 2713 SDValue AArch64TargetLowering::LowerCallResult( 2714 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2715 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 2716 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2717 SDValue ThisVal) const { 2718 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 2719 ? 
RetCC_AArch64_WebKit_JS 2720 : RetCC_AArch64_AAPCS; 2721 // Assign locations to each value returned by this call. 2722 SmallVector<CCValAssign, 16> RVLocs; 2723 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2724 *DAG.getContext()); 2725 CCInfo.AnalyzeCallResult(Ins, RetCC); 2726 2727 // Copy all of the result registers out of their specified physreg. 2728 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2729 CCValAssign VA = RVLocs[i]; 2730 2731 // Pass 'this' value directly from the argument to return value, to avoid 2732 // reg unit interference 2733 if (i == 0 && isThisReturn) { 2734 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 2735 "unexpected return calling convention register assignment"); 2736 InVals.push_back(ThisVal); 2737 continue; 2738 } 2739 2740 SDValue Val = 2741 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 2742 Chain = Val.getValue(1); 2743 InFlag = Val.getValue(2); 2744 2745 switch (VA.getLocInfo()) { 2746 default: 2747 llvm_unreachable("Unknown loc info!"); 2748 case CCValAssign::Full: 2749 break; 2750 case CCValAssign::BCvt: 2751 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 2752 break; 2753 } 2754 2755 InVals.push_back(Val); 2756 } 2757 2758 return Chain; 2759 } 2760 2761 bool AArch64TargetLowering::isEligibleForTailCallOptimization( 2762 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2763 const SmallVectorImpl<ISD::OutputArg> &Outs, 2764 const SmallVectorImpl<SDValue> &OutVals, 2765 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 2766 // For CallingConv::C this function knows whether the ABI needs 2767 // changing. That's not true for other conventions so they will have to opt in 2768 // manually. 2769 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 2770 return false; 2771 2772 MachineFunction &MF = DAG.getMachineFunction(); 2773 const Function *CallerF = MF.getFunction(); 2774 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2775 bool CCMatch = CallerCC == CalleeCC; 2776 2777 // Byval parameters hand the function a pointer directly into the stack area 2778 // we want to reuse during a tail call. Working around this *is* possible (see 2779 // X86) but less efficient and uglier in LowerCall. 2780 for (Function::const_arg_iterator i = CallerF->arg_begin(), 2781 e = CallerF->arg_end(); 2782 i != e; ++i) 2783 if (i->hasByValAttr()) 2784 return false; 2785 2786 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2787 return IsTailCallConvention(CalleeCC) && CCMatch; 2788 } 2789 2790 // Externally-defined functions with weak linkage should not be 2791 // tail-called on AArch64 when the OS does not support dynamic 2792 // pre-emption of symbols, as the AAELF spec requires normal calls 2793 // to undefined weak functions to be replaced with a NOP or jump to the 2794 // next instruction. The behaviour of branch instructions in this 2795 // situation (as used for tail calls) is implementation-defined, so we 2796 // cannot rely on the linker replacing the tail call with a return. 2797 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2798 const GlobalValue *GV = G->getGlobal(); 2799 const Triple &TT = getTargetMachine().getTargetTriple(); 2800 if (GV->hasExternalWeakLinkage() && 2801 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2802 return false; 2803 } 2804 2805 // Now we search for cases where we can use a tail call without changing the 2806 // ABI. 
Sibcall is used in some places (particularly gcc) to refer to this 2807 // concept. 2808 2809 // I want anyone implementing a new calling convention to think long and hard 2810 // about this assert. 2811 assert((!isVarArg || CalleeCC == CallingConv::C) && 2812 "Unexpected variadic calling convention"); 2813 2814 LLVMContext &C = *DAG.getContext(); 2815 if (isVarArg && !Outs.empty()) { 2816 // At least two cases here: if caller is fastcc then we can't have any 2817 // memory arguments (we'd be expected to clean up the stack afterwards). If 2818 // caller is C then we could potentially use its argument area. 2819 2820 // FIXME: for now we take the most conservative of these in both cases: 2821 // disallow all variadic memory operands. 2822 SmallVector<CCValAssign, 16> ArgLocs; 2823 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2824 2825 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); 2826 for (const CCValAssign &ArgLoc : ArgLocs) 2827 if (!ArgLoc.isRegLoc()) 2828 return false; 2829 } 2830 2831 // Check that the call results are passed in the same way. 2832 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2833 CCAssignFnForCall(CalleeCC, isVarArg), 2834 CCAssignFnForCall(CallerCC, isVarArg))) 2835 return false; 2836 // The callee has to preserve all registers the caller needs to preserve. 2837 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 2838 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2839 if (!CCMatch) { 2840 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2841 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2842 return false; 2843 } 2844 2845 // Nothing more to check if the callee is taking no arguments 2846 if (Outs.empty()) 2847 return true; 2848 2849 SmallVector<CCValAssign, 16> ArgLocs; 2850 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2851 2852 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2853 2854 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2855 2856 // If the stack arguments for this call do not fit into our own save area then 2857 // the call cannot be made tail. 2858 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) 2859 return false; 2860 2861 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2862 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2863 return false; 2864 2865 return true; 2866 } 2867 2868 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 2869 SelectionDAG &DAG, 2870 MachineFrameInfo *MFI, 2871 int ClobberedFI) const { 2872 SmallVector<SDValue, 8> ArgChains; 2873 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); 2874 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; 2875 2876 // Include the original chain at the beginning of the list. When this is 2877 // used by target LowerCall hooks, this helps legalize find the 2878 // CALLSEQ_BEGIN node. 
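// Example of the overlap test below (illustrative): if the outgoing argument
// store clobbers the incoming slot covering bytes [16, 23] and the entry
// block already loads from a fixed object covering bytes [20, 27], the two
// ranges intersect, so that load's chain is added here and the clobbering
// store ends up ordered after the load.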
2879 ArgChains.push_back(Chain); 2880 2881 // Add a chain value for each stack argument corresponding 2882 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 2883 UE = DAG.getEntryNode().getNode()->use_end(); 2884 U != UE; ++U) 2885 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 2886 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 2887 if (FI->getIndex() < 0) { 2888 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); 2889 int64_t InLastByte = InFirstByte; 2890 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; 2891 2892 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 2893 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 2894 ArgChains.push_back(SDValue(L, 1)); 2895 } 2896 2897 // Build a tokenfactor for all the chains. 2898 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 2899 } 2900 2901 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 2902 bool TailCallOpt) const { 2903 return CallCC == CallingConv::Fast && TailCallOpt; 2904 } 2905 2906 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { 2907 return CallCC == CallingConv::Fast || 2908 CallCC == CallingConv::PreserveMost; 2909 } 2910 2911 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, 2912 /// and add input and output parameter nodes. 2913 SDValue 2914 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 2915 SmallVectorImpl<SDValue> &InVals) const { 2916 SelectionDAG &DAG = CLI.DAG; 2917 SDLoc &DL = CLI.DL; 2918 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 2919 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 2920 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 2921 SDValue Chain = CLI.Chain; 2922 SDValue Callee = CLI.Callee; 2923 bool &IsTailCall = CLI.IsTailCall; 2924 CallingConv::ID CallConv = CLI.CallConv; 2925 bool IsVarArg = CLI.IsVarArg; 2926 2927 MachineFunction &MF = DAG.getMachineFunction(); 2928 bool IsThisReturn = false; 2929 2930 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2931 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2932 bool IsSibCall = false; 2933 2934 if (IsTailCall) { 2935 // Check if it's really possible to do a tail call. 2936 IsTailCall = isEligibleForTailCallOptimization( 2937 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); 2938 if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) 2939 report_fatal_error("failed to perform tail call elimination on a call " 2940 "site marked musttail"); 2941 2942 // A sibling call is one where we're under the usual C ABI and not planning 2943 // to change that but can still do a tail call: 2944 if (!TailCallOpt && IsTailCall) 2945 IsSibCall = true; 2946 2947 if (IsTailCall) 2948 ++NumTailCalls; 2949 } 2950 2951 // Analyze operands of the call, assigning locations to each operand. 2952 SmallVector<CCValAssign, 16> ArgLocs; 2953 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, 2954 *DAG.getContext()); 2955 2956 if (IsVarArg) { 2957 // Handle fixed and variable vector arguments differently. 2958 // Variable vector arguments always go into memory. 
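  // Note: Outs[i].IsFixed is true for the named parameters; only arguments
  // matching the callee's "..." use the variadic assignment function. In
  // practice this distinction mainly matters for Darwin, whose variadic ABI
  // passes all anonymous arguments on the stack.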
2959 unsigned NumArgs = Outs.size(); 2960 2961 for (unsigned i = 0; i != NumArgs; ++i) { 2962 MVT ArgVT = Outs[i].VT; 2963 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 2964 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, 2965 /*IsVarArg=*/ !Outs[i].IsFixed); 2966 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); 2967 assert(!Res && "Call operand has unhandled type"); 2968 (void)Res; 2969 } 2970 } else { 2971 // At this point, Outs[].VT may already be promoted to i32. To correctly 2972 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2973 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2974 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here 2975 // we use a special version of AnalyzeCallOperands to pass in ValVT and 2976 // LocVT. 2977 unsigned NumArgs = Outs.size(); 2978 for (unsigned i = 0; i != NumArgs; ++i) { 2979 MVT ValVT = Outs[i].VT; 2980 // Get type of the original argument. 2981 EVT ActualVT = getValueType(DAG.getDataLayout(), 2982 CLI.getArgs()[Outs[i].OrigArgIndex].Ty, 2983 /*AllowUnknown*/ true); 2984 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; 2985 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 2986 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 2987 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 2988 ValVT = MVT::i8; 2989 else if (ActualMVT == MVT::i16) 2990 ValVT = MVT::i16; 2991 2992 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2993 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); 2994 assert(!Res && "Call operand has unhandled type"); 2995 (void)Res; 2996 } 2997 } 2998 2999 // Get a count of how many bytes are to be pushed on the stack. 3000 unsigned NumBytes = CCInfo.getNextStackOffset(); 3001 3002 if (IsSibCall) { 3003 // Since we're not changing the ABI to make this a tail call, the memory 3004 // operands are already available in the caller's incoming argument space. 3005 NumBytes = 0; 3006 } 3007 3008 // FPDiff is the byte offset of the call's argument area from the callee's. 3009 // Stores to callee stack arguments will be placed in FixedStackSlots offset 3010 // by this amount for a tail call. In a sibling call it must be 0 because the 3011 // caller will deallocate the entire stack and the callee still expects its 3012 // arguments to begin at SP+0. Completely unused for non-tail calls. 3013 int FPDiff = 0; 3014 3015 if (IsTailCall && !IsSibCall) { 3016 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 3017 3018 // Since callee will pop argument stack as a tail call, we must keep the 3019 // popped size 16-byte aligned. 3020 NumBytes = alignTo(NumBytes, 16); 3021 3022 // FPDiff will be negative if this tail call requires more space than we 3023 // would automatically have in our incoming argument space. Positive if we 3024 // can actually shrink the stack. 3025 FPDiff = NumReusableBytes - NumBytes; 3026 3027 // The stack pointer must be 16-byte aligned at all times it's used for a 3028 // memory operation, which in practice means at *all* times and in 3029 // particular across call boundaries. Therefore our own arguments started at 3030 // a 16-byte aligned SP and the delta applied for the tail call should 3031 // satisfy the same constraint. 3032 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 3033 } 3034 3035 // Adjust the stack pointer for the new arguments... 
3036 // These operations are automatically eliminated by the prolog/epilog pass 3037 if (!IsSibCall) 3038 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, 3039 true), 3040 DL); 3041 3042 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 3043 getPointerTy(DAG.getDataLayout())); 3044 3045 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 3046 SmallVector<SDValue, 8> MemOpChains; 3047 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3048 3049 // Walk the register/memloc assignments, inserting copies/loads. 3050 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; 3051 ++i, ++realArgIdx) { 3052 CCValAssign &VA = ArgLocs[i]; 3053 SDValue Arg = OutVals[realArgIdx]; 3054 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 3055 3056 // Promote the value if needed. 3057 switch (VA.getLocInfo()) { 3058 default: 3059 llvm_unreachable("Unknown loc info!"); 3060 case CCValAssign::Full: 3061 break; 3062 case CCValAssign::SExt: 3063 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3064 break; 3065 case CCValAssign::ZExt: 3066 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3067 break; 3068 case CCValAssign::AExt: 3069 if (Outs[realArgIdx].ArgVT == MVT::i1) { 3070 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 3071 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 3072 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 3073 } 3074 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3075 break; 3076 case CCValAssign::BCvt: 3077 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3078 break; 3079 case CCValAssign::FPExt: 3080 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 3081 break; 3082 } 3083 3084 if (VA.isRegLoc()) { 3085 if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { 3086 assert(VA.getLocVT() == MVT::i64 && 3087 "unexpected calling convention register assignment"); 3088 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 3089 "unexpected use of 'returned'"); 3090 IsThisReturn = true; 3091 } 3092 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 3093 } else { 3094 assert(VA.isMemLoc()); 3095 3096 SDValue DstAddr; 3097 MachinePointerInfo DstInfo; 3098 3099 // FIXME: This works on big-endian for composite byvals, which are the 3100 // common case. It should also work for fundamental types too. 3101 uint32_t BEAlign = 0; 3102 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 3103 : VA.getValVT().getSizeInBits(); 3104 OpSize = (OpSize + 7) / 8; 3105 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 3106 !Flags.isInConsecutiveRegs()) { 3107 if (OpSize < 8) 3108 BEAlign = 8 - OpSize; 3109 } 3110 unsigned LocMemOffset = VA.getLocMemOffset(); 3111 int32_t Offset = LocMemOffset + BEAlign; 3112 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3113 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3114 3115 if (IsTailCall) { 3116 Offset = Offset + FPDiff; 3117 int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 3118 3119 DstAddr = DAG.getFrameIndex(FI, PtrVT); 3120 DstInfo = 3121 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 3122 3123 // Make sure any stack arguments overlapping with where we're storing 3124 // are loaded before this eventual operation. Otherwise they'll be 3125 // clobbered. 
3126 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 3127 } else { 3128 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3129 3130 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3131 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), 3132 LocMemOffset); 3133 } 3134 3135 if (Outs[i].Flags.isByVal()) { 3136 SDValue SizeNode = 3137 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 3138 SDValue Cpy = DAG.getMemcpy( 3139 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), 3140 /*isVol = */ false, /*AlwaysInline = */ false, 3141 /*isTailCall = */ false, 3142 DstInfo, MachinePointerInfo()); 3143 3144 MemOpChains.push_back(Cpy); 3145 } else { 3146 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 3147 // promoted to a legal register type i32, we should truncate Arg back to 3148 // i1/i8/i16. 3149 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 3150 VA.getValVT() == MVT::i16) 3151 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 3152 3153 SDValue Store = 3154 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); 3155 MemOpChains.push_back(Store); 3156 } 3157 } 3158 } 3159 3160 if (!MemOpChains.empty()) 3161 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 3162 3163 // Build a sequence of copy-to-reg nodes chained together with token chain 3164 // and flag operands which copy the outgoing args into the appropriate regs. 3165 SDValue InFlag; 3166 for (auto &RegToPass : RegsToPass) { 3167 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 3168 RegToPass.second, InFlag); 3169 InFlag = Chain.getValue(1); 3170 } 3171 3172 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 3173 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 3174 // node so that legalize doesn't hack it. 3175 if (getTargetMachine().getCodeModel() == CodeModel::Large && 3176 Subtarget->isTargetMachO()) { 3177 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3178 const GlobalValue *GV = G->getGlobal(); 3179 bool InternalLinkage = GV->hasInternalLinkage(); 3180 if (InternalLinkage) 3181 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 3182 else { 3183 Callee = 3184 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); 3185 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 3186 } 3187 } else if (ExternalSymbolSDNode *S = 3188 dyn_cast<ExternalSymbolSDNode>(Callee)) { 3189 const char *Sym = S->getSymbol(); 3190 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); 3191 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 3192 } 3193 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3194 const GlobalValue *GV = G->getGlobal(); 3195 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 3196 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 3197 const char *Sym = S->getSymbol(); 3198 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); 3199 } 3200 3201 // We don't usually want to end the call-sequence here because we would tidy 3202 // the frame up *after* the call, however in the ABI-changing tail-call case 3203 // we've carefully laid out the parameters so that when sp is reset they'll be 3204 // in the correct location. 
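  // For an ABI-changing (non-sibling) tail call we therefore pop the callee's
  // argument area now; the TC_RETURN built further down carries FPDiff so that
  // emitEpilogue can make the final SP adjustment.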
3205   if (IsTailCall && !IsSibCall) {
3206     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3207                                DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3208     InFlag = Chain.getValue(1);
3209   }
3210
3211   std::vector<SDValue> Ops;
3212   Ops.push_back(Chain);
3213   Ops.push_back(Callee);
3214
3215   if (IsTailCall) {
3216     // Each tail call may have to adjust the stack by a different amount, so
3217     // this information must travel along with the operation for eventual
3218     // consumption by emitEpilogue.
3219     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3220   }
3221
3222   // Add argument registers to the end of the list so that they are known live
3223   // into the call.
3224   for (auto &RegToPass : RegsToPass)
3225     Ops.push_back(DAG.getRegister(RegToPass.first,
3226                                   RegToPass.second.getValueType()));
3227
3228   // Add a register mask operand representing the call-preserved registers.
3229   const uint32_t *Mask;
3230   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3231   if (IsThisReturn) {
3232     // For 'this' returns, use the X0-preserving mask if applicable
3233     Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3234     if (!Mask) {
3235       IsThisReturn = false;
3236       Mask = TRI->getCallPreservedMask(MF, CallConv);
3237     }
3238   } else
3239     Mask = TRI->getCallPreservedMask(MF, CallConv);
3240
3241   assert(Mask && "Missing call preserved mask for calling convention");
3242   Ops.push_back(DAG.getRegisterMask(Mask));
3243
3244   if (InFlag.getNode())
3245     Ops.push_back(InFlag);
3246
3247   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3248
3249   // If we're doing a tail call, use a TC_RETURN here rather than an
3250   // actual call instruction.
3251   if (IsTailCall) {
3252     MF.getFrameInfo()->setHasTailCall();
3253     return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3254   }
3255
3256   // Returns a chain and a flag for retval copy to use.
3257   Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3258   InFlag = Chain.getValue(1);
3259
3260   uint64_t CalleePopBytes =
3261       DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3262
3263   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3264                              DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3265                              InFlag, DL);
3266   if (!Ins.empty())
3267     InFlag = Chain.getValue(1);
3268
3269   // Handle result values, copying them out of physregs into vregs that we
3270   // return.
3271   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3272                          InVals, IsThisReturn,
3273                          IsThisReturn ? OutVals[0] : SDValue());
3274 }
3275
3276 bool AArch64TargetLowering::CanLowerReturn(
3277     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3278     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3279   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3280                           ? RetCC_AArch64_WebKit_JS
3281                           : RetCC_AArch64_AAPCS;
3282   SmallVector<CCValAssign, 16> RVLocs;
3283   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3284   return CCInfo.CheckReturn(Outs, RetCC);
3285 }
3286
3287 SDValue
3288 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3289                                    bool isVarArg,
3290                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
3291                                    const SmallVectorImpl<SDValue> &OutVals,
3292                                    const SDLoc &DL, SelectionDAG &DAG) const {
3293   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3294                           ? RetCC_AArch64_WebKit_JS
3295                           : RetCC_AArch64_AAPCS;
3296   SmallVector<CCValAssign, 16> RVLocs;
3297   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3298                  *DAG.getContext());
3299   CCInfo.AnalyzeReturn(Outs, RetCC);
3300
3301   // Copy the result values into the output registers.
3302   SDValue Flag;
3303   SmallVector<SDValue, 4> RetOps(1, Chain);
3304   for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3305        ++i, ++realRVLocIdx) {
3306     CCValAssign &VA = RVLocs[i];
3307     assert(VA.isRegLoc() && "Can only return in registers!");
3308     SDValue Arg = OutVals[realRVLocIdx];
3309
3310     switch (VA.getLocInfo()) {
3311     default:
3312       llvm_unreachable("Unknown loc info!");
3313     case CCValAssign::Full:
3314       if (Outs[i].ArgVT == MVT::i1) {
3315         // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3316         // value. This is strictly redundant on Darwin (which uses "zeroext
3317         // i1"), but will be optimised out before ISel.
3318         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3319         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3320       }
3321       break;
3322     case CCValAssign::BCvt:
3323       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3324       break;
3325     }
3326
3327     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3328     Flag = Chain.getValue(1);
3329     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3330   }
3331   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3332   const MCPhysReg *I =
3333       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3334   if (I) {
3335     for (; *I; ++I) {
3336       if (AArch64::GPR64RegClass.contains(*I))
3337         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3338       else if (AArch64::FPR64RegClass.contains(*I))
3339         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3340       else
3341         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3342     }
3343   }
3344
3345   RetOps[0] = Chain; // Update chain.
3346
3347   // Add the flag if we have it.
3348   if (Flag.getNode())
3349     RetOps.push_back(Flag);
3350
3351   return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3352 }
3353
3354 //===----------------------------------------------------------------------===//
3355 // Other Lowering Code
3356 //===----------------------------------------------------------------------===//
3357
3358 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
3359                                                   SelectionDAG &DAG) const {
3360   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3361   SDLoc DL(Op);
3362   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
3363   const GlobalValue *GV = GN->getGlobal();
3364   unsigned char OpFlags =
3365       Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
3366
3367   assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
3368          "unexpected offset in global node");
3369
3370   // This also catches the large code model case for Darwin.
3371   if ((OpFlags & AArch64II::MO_GOT) != 0) {
3372     SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
3373     // FIXME: Once remat is capable of dealing with instructions with register
3374     // operands, expand this into two nodes instead of using a wrapper node.
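  // A LOADgot node is normally selected to an ADRP of the GOT slot plus an LDR
  // from it, e.g. (illustrative):
  //   adrp x0, :got:var
  //   ldr  x0, [x0, :got_lo12:var]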
3375 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); 3376 } 3377 3378 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 3379 const unsigned char MO_NC = AArch64II::MO_NC; 3380 return DAG.getNode( 3381 AArch64ISD::WrapperLarge, DL, PtrVT, 3382 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), 3383 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), 3384 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), 3385 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); 3386 } else { 3387 // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and 3388 // the only correct model on Darwin. 3389 SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 3390 OpFlags | AArch64II::MO_PAGE); 3391 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; 3392 SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); 3393 3394 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 3395 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 3396 } 3397 } 3398 3399 /// \brief Convert a TLS address reference into the correct sequence of loads 3400 /// and calls to compute the variable's address (for Darwin, currently) and 3401 /// return an SDValue containing the final node. 3402 3403 /// Darwin only has one TLS scheme which must be capable of dealing with the 3404 /// fully general situation, in the worst case. This means: 3405 /// + "extern __thread" declaration. 3406 /// + Defined in a possibly unknown dynamic library. 3407 /// 3408 /// The general system is that each __thread variable has a [3 x i64] descriptor 3409 /// which contains information used by the runtime to calculate the address. The 3410 /// only part of this the compiler needs to know about is the first xword, which 3411 /// contains a function pointer that must be called with the address of the 3412 /// entire descriptor in "x0". 3413 /// 3414 /// Since this descriptor may be in a different unit, in general even the 3415 /// descriptor must be accessed via an indirect load. The "ideal" code sequence 3416 /// is: 3417 /// adrp x0, _var@TLVPPAGE 3418 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor 3419 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, 3420 /// ; the function pointer 3421 /// blr x1 ; Uses descriptor address in x0 3422 /// ; Address of _var is now in x0. 3423 /// 3424 /// If the address of _var's descriptor *is* known to the linker, then it can 3425 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 3426 /// a slight efficiency gain. 3427 SDValue 3428 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 3429 SelectionDAG &DAG) const { 3430 assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); 3431 3432 SDLoc DL(Op); 3433 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 3434 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3435 3436 SDValue TLVPAddr = 3437 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3438 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 3439 3440 // The first entry in the descriptor is a function pointer that we must call 3441 // to obtain the address of the variable. 
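  // This load is the "ldr x1, [x0]" step of the sequence documented above: it
  // fetches the resolver function pointer from the first word of the
  // descriptor, while DescAddr itself is what gets passed to the call in x0.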
3442 SDValue Chain = DAG.getEntryNode(); 3443 SDValue FuncTLVGet = 3444 DAG.getLoad(MVT::i64, DL, Chain, DescAddr, 3445 MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, 3446 true, true, 8); 3447 Chain = FuncTLVGet.getValue(1); 3448 3449 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3450 MFI->setAdjustsStack(true); 3451 3452 // TLS calls preserve all registers except those that absolutely must be 3453 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3454 // silly). 3455 const uint32_t *Mask = 3456 Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); 3457 3458 // Finally, we can make the call. This is just a degenerate version of a 3459 // normal AArch64 call node: x0 takes the address of the descriptor, and 3460 // returns the address of the variable in this thread. 3461 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 3462 Chain = 3463 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3464 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 3465 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3466 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 3467 } 3468 3469 /// When accessing thread-local variables under either the general-dynamic or 3470 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will 3471 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 3472 /// is a function pointer to carry out the resolution. 3473 /// 3474 /// The sequence is: 3475 /// adrp x0, :tlsdesc:var 3476 /// ldr x1, [x0, #:tlsdesc_lo12:var] 3477 /// add x0, x0, #:tlsdesc_lo12:var 3478 /// .tlsdesccall var 3479 /// blr x1 3480 /// (TPIDR_EL0 offset now in x0) 3481 /// 3482 /// The above sequence must be produced unscheduled, to enable the linker to 3483 /// optimize/relax this sequence. 3484 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 3485 /// above sequence, and expanded really late in the compilation flow, to ensure 3486 /// the sequence is produced as per above. 3487 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, 3488 const SDLoc &DL, 3489 SelectionDAG &DAG) const { 3490 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3491 3492 SDValue Chain = DAG.getEntryNode(); 3493 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3494 3495 Chain = 3496 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); 3497 SDValue Glue = Chain.getValue(1); 3498 3499 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 3500 } 3501 3502 SDValue 3503 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, 3504 SelectionDAG &DAG) const { 3505 assert(Subtarget->isTargetELF() && "This function expects an ELF target"); 3506 assert(getTargetMachine().getCodeModel() == CodeModel::Small && 3507 "ELF TLS only supported in small memory model"); 3508 // Different choices can be made for the maximum size of the TLS area for a 3509 // module. For the small address model, the default TLS size is 16MiB and the 3510 // maximum TLS size is 4GiB. 3511 // FIXME: add -mtls-size command line option and make it control the 16MiB 3512 // vs. 4GiB code sequence generation. 
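  // Summary of the models handled below: LocalExec adds a tprel hi12/lo12
  // offset directly to TPIDR_EL0, InitialExec loads the offset from a GOT
  // entry, and GeneralDynamic/LocalDynamic go through the TLS descriptor call
  // sequence (see LowerELFTLSDescCallSeq above).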
3513 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3514 3515 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 3516 3517 if (DAG.getTarget().Options.EmulatedTLS) 3518 return LowerToTLSEmulatedModel(GA, DAG); 3519 3520 if (!EnableAArch64ELFLocalDynamicTLSGeneration) { 3521 if (Model == TLSModel::LocalDynamic) 3522 Model = TLSModel::GeneralDynamic; 3523 } 3524 3525 SDValue TPOff; 3526 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3527 SDLoc DL(Op); 3528 const GlobalValue *GV = GA->getGlobal(); 3529 3530 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 3531 3532 if (Model == TLSModel::LocalExec) { 3533 SDValue HiVar = DAG.getTargetGlobalAddress( 3534 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 3535 SDValue LoVar = DAG.getTargetGlobalAddress( 3536 GV, DL, PtrVT, 0, 3537 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3538 3539 SDValue TPWithOff_lo = 3540 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 3541 HiVar, 3542 DAG.getTargetConstant(0, DL, MVT::i32)), 3543 0); 3544 SDValue TPWithOff = 3545 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, 3546 LoVar, 3547 DAG.getTargetConstant(0, DL, MVT::i32)), 3548 0); 3549 return TPWithOff; 3550 } else if (Model == TLSModel::InitialExec) { 3551 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3552 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); 3553 } else if (Model == TLSModel::LocalDynamic) { 3554 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 3555 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 3556 // the beginning of the module's TLS region, followed by a DTPREL offset 3557 // calculation. 3558 3559 // These accesses will need deduplicating if there's more than one. 3560 AArch64FunctionInfo *MFI = 3561 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 3562 MFI->incNumLocalDynamicTLSAccesses(); 3563 3564 // The call needs a relocation too for linker relaxation. It doesn't make 3565 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 3566 // the address. 3567 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 3568 AArch64II::MO_TLS); 3569 3570 // Now we can calculate the offset from TPIDR_EL0 to this module's 3571 // thread-local area. 3572 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 3573 3574 // Now use :dtprel_whatever: operations to calculate this variable's offset 3575 // in its thread-storage area. 3576 SDValue HiVar = DAG.getTargetGlobalAddress( 3577 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 3578 SDValue LoVar = DAG.getTargetGlobalAddress( 3579 GV, DL, MVT::i64, 0, 3580 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3581 3582 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, 3583 DAG.getTargetConstant(0, DL, MVT::i32)), 3584 0); 3585 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, 3586 DAG.getTargetConstant(0, DL, MVT::i32)), 3587 0); 3588 } else if (Model == TLSModel::GeneralDynamic) { 3589 // The call needs a relocation too for linker relaxation. It doesn't make 3590 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 3591 // the address. 3592 SDValue SymAddr = 3593 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3594 3595 // Finally we can make a call to calculate the offset from tpidr_el0. 
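    // The descriptor call leaves the variable's offset from TPIDR_EL0 in x0;
    // the final address is formed by the ADD of ThreadBase and TPOff at the
    // end of this function.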
3596 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 3597 } else 3598 llvm_unreachable("Unsupported ELF TLS access model"); 3599 3600 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 3601 } 3602 3603 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 3604 SelectionDAG &DAG) const { 3605 if (Subtarget->isTargetDarwin()) 3606 return LowerDarwinGlobalTLSAddress(Op, DAG); 3607 else if (Subtarget->isTargetELF()) 3608 return LowerELFGlobalTLSAddress(Op, DAG); 3609 3610 llvm_unreachable("Unexpected platform trying to use TLS"); 3611 } 3612 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 3613 SDValue Chain = Op.getOperand(0); 3614 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 3615 SDValue LHS = Op.getOperand(2); 3616 SDValue RHS = Op.getOperand(3); 3617 SDValue Dest = Op.getOperand(4); 3618 SDLoc dl(Op); 3619 3620 // Handle f128 first, since lowering it will result in comparing the return 3621 // value of a libcall against zero, which is just what the rest of LowerBR_CC 3622 // is expecting to deal with. 3623 if (LHS.getValueType() == MVT::f128) { 3624 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3625 3626 // If softenSetCCOperands returned a scalar, we need to compare the result 3627 // against zero to select between true and false values. 3628 if (!RHS.getNode()) { 3629 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 3630 CC = ISD::SETNE; 3631 } 3632 } 3633 3634 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 3635 // instruction. 3636 unsigned Opc = LHS.getOpcode(); 3637 if (LHS.getResNo() == 1 && isOneConstant(RHS) && 3638 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 3639 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 3640 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && 3641 "Unexpected condition code."); 3642 // Only lower legal XALUO ops. 3643 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 3644 return SDValue(); 3645 3646 // The actual operation with overflow check. 3647 AArch64CC::CondCode OFCC; 3648 SDValue Value, Overflow; 3649 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); 3650 3651 if (CC == ISD::SETNE) 3652 OFCC = getInvertedCondCode(OFCC); 3653 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); 3654 3655 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 3656 Overflow); 3657 } 3658 3659 if (LHS.getValueType().isInteger()) { 3660 assert((LHS.getValueType() == RHS.getValueType()) && 3661 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 3662 3663 // If the RHS of the comparison is zero, we can potentially fold this 3664 // to a specialized branch. 3665 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); 3666 if (RHSC && RHSC->getZExtValue() == 0) { 3667 if (CC == ISD::SETEQ) { 3668 // See if we can use a TBZ to fold in an AND as well. 3669 // TBZ has a smaller branch displacement than CBZ. If the offset is 3670 // out of bounds, a late MI-layer pass rewrites branches. 3671 // 403.gcc is an example that hits this case. 
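        // For example (illustrative): a branch on "(x & 8) == 0" can be
        // emitted as a single "tbz" testing bit 3 of x.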
3672 if (LHS.getOpcode() == ISD::AND && 3673 isa<ConstantSDNode>(LHS.getOperand(1)) && 3674 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 3675 SDValue Test = LHS.getOperand(0); 3676 uint64_t Mask = LHS.getConstantOperandVal(1); 3677 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, 3678 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 3679 Dest); 3680 } 3681 3682 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); 3683 } else if (CC == ISD::SETNE) { 3684 // See if we can use a TBZ to fold in an AND as well. 3685 // TBZ has a smaller branch displacement than CBZ. If the offset is 3686 // out of bounds, a late MI-layer pass rewrites branches. 3687 // 403.gcc is an example that hits this case. 3688 if (LHS.getOpcode() == ISD::AND && 3689 isa<ConstantSDNode>(LHS.getOperand(1)) && 3690 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 3691 SDValue Test = LHS.getOperand(0); 3692 uint64_t Mask = LHS.getConstantOperandVal(1); 3693 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, 3694 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 3695 Dest); 3696 } 3697 3698 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); 3699 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { 3700 // Don't combine AND since emitComparison converts the AND to an ANDS 3701 // (a.k.a. TST) and the test in the test bit and branch instruction 3702 // becomes redundant. This would also increase register pressure. 3703 uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; 3704 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, 3705 DAG.getConstant(Mask, dl, MVT::i64), Dest); 3706 } 3707 } 3708 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && 3709 LHS.getOpcode() != ISD::AND) { 3710 // Don't combine AND since emitComparison converts the AND to an ANDS 3711 // (a.k.a. TST) and the test in the test bit and branch instruction 3712 // becomes redundant. This would also increase register pressure. 3713 uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; 3714 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, 3715 DAG.getConstant(Mask, dl, MVT::i64), Dest); 3716 } 3717 3718 SDValue CCVal; 3719 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 3720 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 3721 Cmp); 3722 } 3723 3724 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 3725 3726 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 3727 // clean. Some of them require two branches to implement. 
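  // For example, SETUEQ (unordered or equal) maps to EQ plus VS, i.e. two
  // conditional branches testing the same FCMP flags.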
3728 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 3729 AArch64CC::CondCode CC1, CC2; 3730 changeFPCCToAArch64CC(CC, CC1, CC2); 3731 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 3732 SDValue BR1 = 3733 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); 3734 if (CC2 != AArch64CC::AL) { 3735 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 3736 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, 3737 Cmp); 3738 } 3739 3740 return BR1; 3741 } 3742 3743 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, 3744 SelectionDAG &DAG) const { 3745 EVT VT = Op.getValueType(); 3746 SDLoc DL(Op); 3747 3748 SDValue In1 = Op.getOperand(0); 3749 SDValue In2 = Op.getOperand(1); 3750 EVT SrcVT = In2.getValueType(); 3751 3752 if (SrcVT.bitsLT(VT)) 3753 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); 3754 else if (SrcVT.bitsGT(VT)) 3755 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); 3756 3757 EVT VecVT; 3758 EVT EltVT; 3759 uint64_t EltMask; 3760 SDValue VecVal1, VecVal2; 3761 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { 3762 EltVT = MVT::i32; 3763 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); 3764 EltMask = 0x80000000ULL; 3765 3766 if (!VT.isVector()) { 3767 VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, 3768 DAG.getUNDEF(VecVT), In1); 3769 VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, 3770 DAG.getUNDEF(VecVT), In2); 3771 } else { 3772 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 3773 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 3774 } 3775 } else if (VT == MVT::f64 || VT == MVT::v2f64) { 3776 EltVT = MVT::i64; 3777 VecVT = MVT::v2i64; 3778 3779 // We want to materialize a mask with the high bit set, but the AdvSIMD 3780 // immediate moves cannot materialize that in a single instruction for 3781 // 64-bit elements. Instead, materialize zero and then negate it. 3782 EltMask = 0; 3783 3784 if (!VT.isVector()) { 3785 VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, 3786 DAG.getUNDEF(VecVT), In1); 3787 VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, 3788 DAG.getUNDEF(VecVT), In2); 3789 } else { 3790 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 3791 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 3792 } 3793 } else { 3794 llvm_unreachable("Invalid type for copysign!"); 3795 } 3796 3797 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); 3798 3799 // If we couldn't materialize the mask above, then the mask vector will be 3800 // the zero vector, and we need to negate it here. 
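  // FNEG of +0.0 flips only the sign bit, so negating the all-zeroes vector
  // produces 0x8000000000000000 in each 64-bit lane, which is exactly the mask
  // the immediate moves could not encode.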
3801 if (VT == MVT::f64 || VT == MVT::v2f64) { 3802 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); 3803 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); 3804 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); 3805 } 3806 3807 SDValue Sel = 3808 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); 3809 3810 if (VT == MVT::f32) 3811 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); 3812 else if (VT == MVT::f64) 3813 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); 3814 else 3815 return DAG.getNode(ISD::BITCAST, DL, VT, Sel); 3816 } 3817 3818 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { 3819 if (DAG.getMachineFunction().getFunction()->hasFnAttribute( 3820 Attribute::NoImplicitFloat)) 3821 return SDValue(); 3822 3823 if (!Subtarget->hasNEON()) 3824 return SDValue(); 3825 3826 // While there is no integer popcount instruction, it can 3827 // be more efficiently lowered to the following sequence that uses 3828 // AdvSIMD registers/instructions as long as the copies to/from 3829 // the AdvSIMD registers are cheap. 3830 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 3831 // CNT V0.8B, V0.8B // 8xbyte pop-counts 3832 // ADDV B0, V0.8B // sum 8xbyte pop-counts 3833 // UMOV X0, V0.B[0] // copy byte result back to integer reg 3834 SDValue Val = Op.getOperand(0); 3835 SDLoc DL(Op); 3836 EVT VT = Op.getValueType(); 3837 3838 if (VT == MVT::i32) 3839 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 3840 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 3841 3842 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 3843 SDValue UaddLV = DAG.getNode( 3844 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 3845 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 3846 3847 if (VT == MVT::i64) 3848 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 3849 return UaddLV; 3850 } 3851 3852 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 3853 3854 if (Op.getValueType().isVector()) 3855 return LowerVSETCC(Op, DAG); 3856 3857 SDValue LHS = Op.getOperand(0); 3858 SDValue RHS = Op.getOperand(1); 3859 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 3860 SDLoc dl(Op); 3861 3862 // We chose ZeroOrOneBooleanContents, so use zero and one. 3863 EVT VT = Op.getValueType(); 3864 SDValue TVal = DAG.getConstant(1, dl, VT); 3865 SDValue FVal = DAG.getConstant(0, dl, VT); 3866 3867 // Handle f128 first, since one possible outcome is a normal integer 3868 // comparison which gets picked up by the next if statement. 3869 if (LHS.getValueType() == MVT::f128) { 3870 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3871 3872 // If softenSetCCOperands returned a scalar, use it. 3873 if (!RHS.getNode()) { 3874 assert(LHS.getValueType() == Op.getValueType() && 3875 "Unexpected setcc expansion!"); 3876 return LHS; 3877 } 3878 } 3879 3880 if (LHS.getValueType().isInteger()) { 3881 SDValue CCVal; 3882 SDValue Cmp = 3883 getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); 3884 3885 // Note that we inverted the condition above, so we reverse the order of 3886 // the true and false operands here. This will allow the setcc to be 3887 // matched to a single CSINC instruction. 3888 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); 3889 } 3890 3891 // Now we know we're dealing with FP values. 
3892   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3893
3894   // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
3895   // and do the comparison.
3896   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3897
3898   AArch64CC::CondCode CC1, CC2;
3899   changeFPCCToAArch64CC(CC, CC1, CC2);
3900   if (CC2 == AArch64CC::AL) {
3901     changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
3902     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
3903
3904     // Note that we inverted the condition above, so we reverse the order of
3905     // the true and false operands here. This will allow the setcc to be
3906     // matched to a single CSINC instruction.
3907     return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
3908   } else {
3909     // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
3910     // totally clean. Some of them require two CSELs to implement. As is in
3911     // this case, we emit the first CSEL and then emit a second using the output
3912     // of the first as the RHS. We're effectively OR'ing the two CC's together.
3913
3914     // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
3915     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
3916     SDValue CS1 =
3917         DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
3918
3919     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
3920     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
3921   }
3922 }
3923
3924 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
3925                                               SDValue RHS, SDValue TVal,
3926                                               SDValue FVal, const SDLoc &dl,
3927                                               SelectionDAG &DAG) const {
3928   // Handle f128 first, because it will result in a comparison of some RTLIB
3929   // call result against zero.
3930   if (LHS.getValueType() == MVT::f128) {
3931     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
3932
3933     // If softenSetCCOperands returned a scalar, we need to compare the result
3934     // against zero to select between true and false values.
3935     if (!RHS.getNode()) {
3936       RHS = DAG.getConstant(0, dl, LHS.getValueType());
3937       CC = ISD::SETNE;
3938     }
3939   }
3940
3941   // Also handle f16, for which we need to do a f32 comparison.
3942   if (LHS.getValueType() == MVT::f16) {
3943     LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3944     RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3945   }
3946
3947   // Next, handle integers.
3948   if (LHS.getValueType().isInteger()) {
3949     assert((LHS.getValueType() == RHS.getValueType()) &&
3950            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
3951
3952     unsigned Opcode = AArch64ISD::CSEL;
3953
3954     // If both the TVal and the FVal are constants, see if we can swap them in
3955     // order to form a CSINV or CSINC out of them.
3956     ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3957     ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3958
3959     if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
3960       std::swap(TVal, FVal);
3961       std::swap(CTVal, CFVal);
3962       CC = ISD::getSetCCInverse(CC, true);
3963     } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
3964       std::swap(TVal, FVal);
3965       std::swap(CTVal, CFVal);
3966       CC = ISD::getSetCCInverse(CC, true);
3967     } else if (TVal.getOpcode() == ISD::XOR) {
3968       // If TVal is a NOT we want to swap TVal and FVal so that we can match
3969       // with a CSINV rather than a CSEL.
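      // For example (illustrative): "c ? ~x : y" becomes CSEL(y, ~x, !c) after
      // the swap, which instruction selection can then fold into
      // "csinv dst, y, x, !c".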
3970 if (isAllOnesConstant(TVal.getOperand(1))) { 3971 std::swap(TVal, FVal); 3972 std::swap(CTVal, CFVal); 3973 CC = ISD::getSetCCInverse(CC, true); 3974 } 3975 } else if (TVal.getOpcode() == ISD::SUB) { 3976 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so 3977 // that we can match with a CSNEG rather than a CSEL. 3978 if (isNullConstant(TVal.getOperand(0))) { 3979 std::swap(TVal, FVal); 3980 std::swap(CTVal, CFVal); 3981 CC = ISD::getSetCCInverse(CC, true); 3982 } 3983 } else if (CTVal && CFVal) { 3984 const int64_t TrueVal = CTVal->getSExtValue(); 3985 const int64_t FalseVal = CFVal->getSExtValue(); 3986 bool Swap = false; 3987 3988 // If both TVal and FVal are constants, see if FVal is the 3989 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC 3990 // instead of a CSEL in that case. 3991 if (TrueVal == ~FalseVal) { 3992 Opcode = AArch64ISD::CSINV; 3993 } else if (TrueVal == -FalseVal) { 3994 Opcode = AArch64ISD::CSNEG; 3995 } else if (TVal.getValueType() == MVT::i32) { 3996 // If our operands are only 32-bit wide, make sure we use 32-bit 3997 // arithmetic for the check whether we can use CSINC. This ensures that 3998 // the addition in the check will wrap around properly in case there is 3999 // an overflow (which would not be the case if we do the check with 4000 // 64-bit arithmetic). 4001 const uint32_t TrueVal32 = CTVal->getZExtValue(); 4002 const uint32_t FalseVal32 = CFVal->getZExtValue(); 4003 4004 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { 4005 Opcode = AArch64ISD::CSINC; 4006 4007 if (TrueVal32 > FalseVal32) { 4008 Swap = true; 4009 } 4010 } 4011 // 64-bit check whether we can use CSINC. 4012 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { 4013 Opcode = AArch64ISD::CSINC; 4014 4015 if (TrueVal > FalseVal) { 4016 Swap = true; 4017 } 4018 } 4019 4020 // Swap TVal and FVal if necessary. 4021 if (Swap) { 4022 std::swap(TVal, FVal); 4023 std::swap(CTVal, CFVal); 4024 CC = ISD::getSetCCInverse(CC, true); 4025 } 4026 4027 if (Opcode != AArch64ISD::CSEL) { 4028 // Drop FVal since we can get its value by simply inverting/negating 4029 // TVal. 4030 FVal = TVal; 4031 } 4032 } 4033 4034 SDValue CCVal; 4035 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 4036 4037 EVT VT = TVal.getValueType(); 4038 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); 4039 } 4040 4041 // Now we know we're dealing with FP values. 4042 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 4043 assert(LHS.getValueType() == RHS.getValueType()); 4044 EVT VT = TVal.getValueType(); 4045 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 4046 4047 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 4048 // clean. Some of them require two CSELs to implement. 4049 AArch64CC::CondCode CC1, CC2; 4050 changeFPCCToAArch64CC(CC, CC1, CC2); 4051 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 4052 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 4053 4054 // If we need a second CSEL, emit it, using the output of the first as the 4055 // RHS. We're effectively OR'ing the two CC's together. 4056 if (CC2 != AArch64CC::AL) { 4057 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 4058 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 4059 } 4060 4061 // Otherwise, return the output of the first CSEL. 
4062 return CS1; 4063 } 4064 4065 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, 4066 SelectionDAG &DAG) const { 4067 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4068 SDValue LHS = Op.getOperand(0); 4069 SDValue RHS = Op.getOperand(1); 4070 SDValue TVal = Op.getOperand(2); 4071 SDValue FVal = Op.getOperand(3); 4072 SDLoc DL(Op); 4073 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4074 } 4075 4076 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, 4077 SelectionDAG &DAG) const { 4078 SDValue CCVal = Op->getOperand(0); 4079 SDValue TVal = Op->getOperand(1); 4080 SDValue FVal = Op->getOperand(2); 4081 SDLoc DL(Op); 4082 4083 unsigned Opc = CCVal.getOpcode(); 4084 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select 4085 // instruction. 4086 if (CCVal.getResNo() == 1 && 4087 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4088 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 4089 // Only lower legal XALUO ops. 4090 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) 4091 return SDValue(); 4092 4093 AArch64CC::CondCode OFCC; 4094 SDValue Value, Overflow; 4095 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); 4096 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); 4097 4098 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, 4099 CCVal, Overflow); 4100 } 4101 4102 // Lower it the same way as we would lower a SELECT_CC node. 4103 ISD::CondCode CC; 4104 SDValue LHS, RHS; 4105 if (CCVal.getOpcode() == ISD::SETCC) { 4106 LHS = CCVal.getOperand(0); 4107 RHS = CCVal.getOperand(1); 4108 CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get(); 4109 } else { 4110 LHS = CCVal; 4111 RHS = DAG.getConstant(0, DL, CCVal.getValueType()); 4112 CC = ISD::SETNE; 4113 } 4114 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4115 } 4116 4117 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, 4118 SelectionDAG &DAG) const { 4119 // Jump table entries as PC relative offsets. No additional tweaking 4120 // is necessary here. Just get the address of the jump table. 4121 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4122 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4123 SDLoc DL(Op); 4124 4125 if (getTargetMachine().getCodeModel() == CodeModel::Large && 4126 !Subtarget->isTargetMachO()) { 4127 const unsigned char MO_NC = AArch64II::MO_NC; 4128 return DAG.getNode( 4129 AArch64ISD::WrapperLarge, DL, PtrVT, 4130 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), 4131 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), 4132 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), 4133 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 4134 AArch64II::MO_G0 | MO_NC)); 4135 } 4136 4137 SDValue Hi = 4138 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); 4139 SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 4140 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4141 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4142 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4143 } 4144 4145 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, 4146 SelectionDAG &DAG) const { 4147 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4148 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4149 SDLoc DL(Op); 4150 4151 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 4152 // Use the GOT for the large code model on iOS. 
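    // LOADgot keeps this to an ADRP + LDR through the GOT rather than the
    // four-instruction MOVZ/MOVK sequence that WrapperLarge expands to below.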
4153 if (Subtarget->isTargetMachO()) { 4154 SDValue GotAddr = DAG.getTargetConstantPool( 4155 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 4156 AArch64II::MO_GOT); 4157 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); 4158 } 4159 4160 const unsigned char MO_NC = AArch64II::MO_NC; 4161 return DAG.getNode( 4162 AArch64ISD::WrapperLarge, DL, PtrVT, 4163 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4164 CP->getOffset(), AArch64II::MO_G3), 4165 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4166 CP->getOffset(), AArch64II::MO_G2 | MO_NC), 4167 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4168 CP->getOffset(), AArch64II::MO_G1 | MO_NC), 4169 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4170 CP->getOffset(), AArch64II::MO_G0 | MO_NC)); 4171 } else { 4172 // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on 4173 // ELF, the only valid one on Darwin. 4174 SDValue Hi = 4175 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4176 CP->getOffset(), AArch64II::MO_PAGE); 4177 SDValue Lo = DAG.getTargetConstantPool( 4178 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 4179 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4180 4181 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4182 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4183 } 4184 } 4185 4186 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 4187 SelectionDAG &DAG) const { 4188 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 4189 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4190 SDLoc DL(Op); 4191 if (getTargetMachine().getCodeModel() == CodeModel::Large && 4192 !Subtarget->isTargetMachO()) { 4193 const unsigned char MO_NC = AArch64II::MO_NC; 4194 return DAG.getNode( 4195 AArch64ISD::WrapperLarge, DL, PtrVT, 4196 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), 4197 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), 4198 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), 4199 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); 4200 } else { 4201 SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); 4202 SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | 4203 AArch64II::MO_NC); 4204 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4205 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4206 } 4207 } 4208 4209 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 4210 SelectionDAG &DAG) const { 4211 AArch64FunctionInfo *FuncInfo = 4212 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 4213 4214 SDLoc DL(Op); 4215 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), 4216 getPointerTy(DAG.getDataLayout())); 4217 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4218 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 4219 MachinePointerInfo(SV), false, false, 0); 4220 } 4221 4222 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 4223 SelectionDAG &DAG) const { 4224 // The layout of the va_list struct is specified in the AArch64 Procedure Call 4225 // Standard, section B.3. 
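  // For reference, the expected layout (offsets match the stores below):
  //   struct va_list {
  //     void *__stack;   // offset 0
  //     void *__gr_top;  // offset 8
  //     void *__vr_top;  // offset 16
  //     int   __gr_offs; // offset 24
  //     int   __vr_offs; // offset 28
  //   };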
4226 MachineFunction &MF = DAG.getMachineFunction(); 4227 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 4228 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4229 SDLoc DL(Op); 4230 4231 SDValue Chain = Op.getOperand(0); 4232 SDValue VAList = Op.getOperand(1); 4233 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4234 SmallVector<SDValue, 4> MemOps; 4235 4236 // void *__stack at offset 0 4237 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 4238 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 4239 MachinePointerInfo(SV), false, false, 8)); 4240 4241 // void *__gr_top at offset 8 4242 int GPRSize = FuncInfo->getVarArgsGPRSize(); 4243 if (GPRSize > 0) { 4244 SDValue GRTop, GRTopAddr; 4245 4246 GRTopAddr = 4247 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); 4248 4249 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 4250 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 4251 DAG.getConstant(GPRSize, DL, PtrVT)); 4252 4253 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 4254 MachinePointerInfo(SV, 8), false, false, 8)); 4255 } 4256 4257 // void *__vr_top at offset 16 4258 int FPRSize = FuncInfo->getVarArgsFPRSize(); 4259 if (FPRSize > 0) { 4260 SDValue VRTop, VRTopAddr; 4261 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4262 DAG.getConstant(16, DL, PtrVT)); 4263 4264 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 4265 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 4266 DAG.getConstant(FPRSize, DL, PtrVT)); 4267 4268 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 4269 MachinePointerInfo(SV, 16), false, false, 8)); 4270 } 4271 4272 // int __gr_offs at offset 24 4273 SDValue GROffsAddr = 4274 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); 4275 MemOps.push_back(DAG.getStore(Chain, DL, 4276 DAG.getConstant(-GPRSize, DL, MVT::i32), 4277 GROffsAddr, MachinePointerInfo(SV, 24), false, 4278 false, 4)); 4279 4280 // int __vr_offs at offset 28 4281 SDValue VROffsAddr = 4282 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); 4283 MemOps.push_back(DAG.getStore(Chain, DL, 4284 DAG.getConstant(-FPRSize, DL, MVT::i32), 4285 VROffsAddr, MachinePointerInfo(SV, 28), false, 4286 false, 4)); 4287 4288 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 4289 } 4290 4291 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 4292 SelectionDAG &DAG) const { 4293 return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) 4294 : LowerAAPCS_VASTART(Op, DAG); 4295 } 4296 4297 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 4298 SelectionDAG &DAG) const { 4299 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 4300 // pointer. 4301 SDLoc DL(Op); 4302 unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; 4303 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 4304 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 4305 4306 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), 4307 Op.getOperand(2), 4308 DAG.getConstant(VaListSize, DL, MVT::i32), 4309 8, false, false, false, MachinePointerInfo(DestSV), 4310 MachinePointerInfo(SrcSV)); 4311 } 4312 4313 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 4314 assert(Subtarget->isTargetDarwin() && 4315 "automatic va_arg instruction only works on Darwin"); 4316 4317 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4318 EVT VT = Op.getValueType(); 4319 SDLoc DL(Op); 4320 SDValue Chain = Op.getOperand(0); 4321 SDValue Addr = Op.getOperand(1); 4322 unsigned Align = Op.getConstantOperandVal(3); 4323 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4324 4325 SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), 4326 false, false, false, 0); 4327 Chain = VAList.getValue(1); 4328 4329 if (Align > 8) { 4330 assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); 4331 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4332 DAG.getConstant(Align - 1, DL, PtrVT)); 4333 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 4334 DAG.getConstant(-(int64_t)Align, DL, PtrVT)); 4335 } 4336 4337 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 4338 uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 4339 4340 // Scalar integer and FP values smaller than 64 bits are implicitly extended 4341 // up to 64 bits. At the very least, we have to increase the striding of the 4342 // vaargs list to match this, and for FP values we need to introduce 4343 // FP_ROUND nodes as well. 4344 if (VT.isInteger() && !VT.isVector()) 4345 ArgSize = 8; 4346 bool NeedFPTrunc = false; 4347 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 4348 ArgSize = 8; 4349 NeedFPTrunc = true; 4350 } 4351 4352 // Increment the pointer, VAList, to the next vaarg 4353 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4354 DAG.getConstant(ArgSize, DL, PtrVT)); 4355 // Store the incremented VAList to the legalized pointer 4356 SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), 4357 false, false, 0); 4358 4359 // Load the actual argument out of the pointer VAList 4360 if (NeedFPTrunc) { 4361 // Load the value as an f64. 4362 SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, 4363 MachinePointerInfo(), false, false, false, 0); 4364 // Round the value down to an f32. 4365 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 4366 DAG.getIntPtrConstant(1, DL)); 4367 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 4368 // Merge the rounded value with the chain output of the load. 
4369 return DAG.getMergeValues(Ops, DL); 4370 } 4371 4372 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, 4373 false, false, 0); 4374 } 4375 4376 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 4377 SelectionDAG &DAG) const { 4378 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4379 MFI->setFrameAddressIsTaken(true); 4380 4381 EVT VT = Op.getValueType(); 4382 SDLoc DL(Op); 4383 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4384 SDValue FrameAddr = 4385 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 4386 while (Depth--) 4387 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 4388 MachinePointerInfo(), false, false, false, 0); 4389 return FrameAddr; 4390 } 4391 4392 // FIXME? Maybe this could be a TableGen attribute on some registers and 4393 // this table could be generated automatically from RegInfo. 4394 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, 4395 SelectionDAG &DAG) const { 4396 unsigned Reg = StringSwitch<unsigned>(RegName) 4397 .Case("sp", AArch64::SP) 4398 .Default(0); 4399 if (Reg) 4400 return Reg; 4401 report_fatal_error(Twine("Invalid register name \"" 4402 + StringRef(RegName) + "\".")); 4403 } 4404 4405 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 4406 SelectionDAG &DAG) const { 4407 MachineFunction &MF = DAG.getMachineFunction(); 4408 MachineFrameInfo *MFI = MF.getFrameInfo(); 4409 MFI->setReturnAddressIsTaken(true); 4410 4411 EVT VT = Op.getValueType(); 4412 SDLoc DL(Op); 4413 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4414 if (Depth) { 4415 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 4416 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 4417 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 4418 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), 4419 MachinePointerInfo(), false, false, false, 0); 4420 } 4421 4422 // Return LR, which contains the return address. Mark it an implicit live-in. 4423 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); 4424 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 4425 } 4426 4427 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 4428 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 4429 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, 4430 SelectionDAG &DAG) const { 4431 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4432 EVT VT = Op.getValueType(); 4433 unsigned VTBits = VT.getSizeInBits(); 4434 SDLoc dl(Op); 4435 SDValue ShOpLo = Op.getOperand(0); 4436 SDValue ShOpHi = Op.getOperand(1); 4437 SDValue ShAmt = Op.getOperand(2); 4438 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 4439 4440 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 4441 4442 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 4443 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 4444 SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 4445 4446 // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which 4447 // is "undef". We wanted 0, so CSEL it directly. 
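// For orientation, the value assembled by the remainder of this function is,
// conceptually (a sketch; ">>op" is the SRA or SRL selected above, ">>u" is
// always a logical shift):
//   ShAmt < 64:   Lo = (ShOpLo >>u ShAmt) | (ShOpHi << (64 - ShAmt))
//                 Hi =  ShOpHi >>op ShAmt
//   ShAmt >= 64:  Lo =  ShOpHi >>op (ShAmt - 64)
//                 Hi =  SRA ? (ShOpHi >>op 63) : 0
// with CSELs choosing between the two forms instead of branching.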
4448 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), 4449 ISD::SETEQ, dl, DAG); 4450 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); 4451 HiBitsForLo = 4452 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), 4453 HiBitsForLo, CCVal, Cmp); 4454 4455 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 4456 DAG.getConstant(VTBits, dl, MVT::i64)); 4457 4458 SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 4459 SDValue LoForNormalShift = 4460 DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); 4461 4462 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, 4463 dl, DAG); 4464 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); 4465 SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 4466 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 4467 LoForNormalShift, CCVal, Cmp); 4468 4469 // AArch64 shifts larger than the register width are wrapped rather than 4470 // clamped, so we can't just emit "hi >> x". 4471 SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 4472 SDValue HiForBigShift = 4473 Opc == ISD::SRA 4474 ? DAG.getNode(Opc, dl, VT, ShOpHi, 4475 DAG.getConstant(VTBits - 1, dl, MVT::i64)) 4476 : DAG.getConstant(0, dl, VT); 4477 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, 4478 HiForNormalShift, CCVal, Cmp); 4479 4480 SDValue Ops[2] = { Lo, Hi }; 4481 return DAG.getMergeValues(Ops, dl); 4482 } 4483 4484 4485 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 4486 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 4487 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, 4488 SelectionDAG &DAG) const { 4489 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4490 EVT VT = Op.getValueType(); 4491 unsigned VTBits = VT.getSizeInBits(); 4492 SDLoc dl(Op); 4493 SDValue ShOpLo = Op.getOperand(0); 4494 SDValue ShOpHi = Op.getOperand(1); 4495 SDValue ShAmt = Op.getOperand(2); 4496 4497 assert(Op.getOpcode() == ISD::SHL_PARTS); 4498 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 4499 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 4500 SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 4501 4502 // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which 4503 // is "undef". We wanted 0, so CSEL it directly. 4504 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), 4505 ISD::SETEQ, dl, DAG); 4506 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); 4507 LoBitsForHi = 4508 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), 4509 LoBitsForHi, CCVal, Cmp); 4510 4511 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 4512 DAG.getConstant(VTBits, dl, MVT::i64)); 4513 SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 4514 SDValue HiForNormalShift = 4515 DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); 4516 4517 SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 4518 4519 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, 4520 dl, DAG); 4521 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); 4522 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, 4523 HiForNormalShift, CCVal, Cmp); 4524 4525 // AArch64 shifts of larger than register sizes are wrapped rather than 4526 // clamped, so we can't just emit "lo << a" if a is too big. 
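// For a shift amount of 64 or more the intended result is simply (a sketch):
//   Lo = 0
//   Hi = ShOpLo << (ShAmt - 64)
// which is what the "big shift" values computed here and above encode; the
// CSELs (one above for Hi, one just below for Lo) then pick between this and
// the normal-shift form using the ExtraShAmt >= 0 comparison.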
4527 SDValue LoForBigShift = DAG.getConstant(0, dl, VT); 4528 SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4529 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 4530 LoForNormalShift, CCVal, Cmp); 4531 4532 SDValue Ops[2] = { Lo, Hi }; 4533 return DAG.getMergeValues(Ops, dl); 4534 } 4535 4536 bool AArch64TargetLowering::isOffsetFoldingLegal( 4537 const GlobalAddressSDNode *GA) const { 4538 // The AArch64 target doesn't support folding offsets into global addresses. 4539 return false; 4540 } 4541 4542 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 4543 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. 4544 // FIXME: We should be able to handle f128 as well with a clever lowering. 4545 if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) 4546 return true; 4547 4548 if (VT == MVT::f64) 4549 return AArch64_AM::getFP64Imm(Imm) != -1; 4550 else if (VT == MVT::f32) 4551 return AArch64_AM::getFP32Imm(Imm) != -1; 4552 return false; 4553 } 4554 4555 //===----------------------------------------------------------------------===// 4556 // AArch64 Optimization Hooks 4557 //===----------------------------------------------------------------------===// 4558 4559 /// getEstimate - Return the appropriate estimate DAG for either the reciprocal 4560 /// or the reciprocal square root. 4561 static SDValue getEstimate(const AArch64Subtarget &ST, 4562 const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, 4563 const SDValue &Operand, unsigned &ExtraSteps) { 4564 if (!ST.hasNEON()) 4565 return SDValue(); 4566 4567 EVT VT = Operand.getValueType(); 4568 4569 std::string RecipOp; 4570 RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt"; 4571 RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp; 4572 RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f"; 4573 4574 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; 4575 if (!Recips.isEnabled(RecipOp)) 4576 return SDValue(); 4577 4578 ExtraSteps = Recips.getRefinementSteps(RecipOp); 4579 return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); 4580 } 4581 4582 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, 4583 DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { 4584 return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); 4585 } 4586 4587 SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, 4588 DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { 4589 UseOneConst = true; 4590 return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); 4591 } 4592 4593 //===----------------------------------------------------------------------===// 4594 // AArch64 Inline Assembly Support 4595 //===----------------------------------------------------------------------===// 4596 4597 // Table of Constraints 4598 // TODO: This is the current set of constraints supported by ARM for the 4599 // compiler, not all of them may make sense, e.g. S may be difficult to support. 
4600 // 4601 // r - A general register 4602 // w - An FP/SIMD register of some size in the range v0-v31 4603 // x - An FP/SIMD register of some size in the range v0-v15 4604 // I - Constant that can be used with an ADD instruction 4605 // J - Constant that can be used with a SUB instruction 4606 // K - Constant that can be used with a 32-bit logical instruction 4607 // L - Constant that can be used with a 64-bit logical instruction 4608 // M - Constant that can be used as a 32-bit MOV immediate 4609 // N - Constant that can be used as a 64-bit MOV immediate 4610 // Q - A memory reference with base register and no offset 4611 // S - A symbolic address 4612 // Y - Floating point constant zero 4613 // Z - Integer constant zero 4614 // 4615 // Note that general register operands will be output using their 64-bit x 4616 // register name, whatever the size of the variable, unless the asm operand 4617 // is prefixed by the %w modifier. Floating-point and SIMD register operands 4618 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 4619 // %q modifier. 4620 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { 4621 // At this point, we have to lower this constraint to something else, so we 4622 // lower it to an "r" or "w". However, by doing this we will force the result 4623 // to be in register, while the X constraint is much more permissive. 4624 // 4625 // Although we are correct (we are free to emit anything, without 4626 // constraints), we might break use cases that would expect us to be more 4627 // efficient and emit something else. 4628 if (!Subtarget->hasFPARMv8()) 4629 return "r"; 4630 4631 if (ConstraintVT.isFloatingPoint()) 4632 return "w"; 4633 4634 if (ConstraintVT.isVector() && 4635 (ConstraintVT.getSizeInBits() == 64 || 4636 ConstraintVT.getSizeInBits() == 128)) 4637 return "w"; 4638 4639 return "r"; 4640 } 4641 4642 /// getConstraintType - Given a constraint letter, return the type of 4643 /// constraint it is for this target. 4644 AArch64TargetLowering::ConstraintType 4645 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 4646 if (Constraint.size() == 1) { 4647 switch (Constraint[0]) { 4648 default: 4649 break; 4650 case 'z': 4651 return C_Other; 4652 case 'x': 4653 case 'w': 4654 return C_RegisterClass; 4655 // An address with a single base register. Due to the way we 4656 // currently handle addresses it is the same as 'r'. 4657 case 'Q': 4658 return C_Memory; 4659 } 4660 } 4661 return TargetLowering::getConstraintType(Constraint); 4662 } 4663 4664 /// Examine constraint type and operand type and determine a weight value. 4665 /// This object must already have been set up with the operand type 4666 /// and the current alternative constraint selected. 4667 TargetLowering::ConstraintWeight 4668 AArch64TargetLowering::getSingleConstraintMatchWeight( 4669 AsmOperandInfo &info, const char *constraint) const { 4670 ConstraintWeight weight = CW_Invalid; 4671 Value *CallOperandVal = info.CallOperandVal; 4672 // If we don't have a value, we can't do a match, 4673 // but allow it at the lowest weight. 4674 if (!CallOperandVal) 4675 return CW_Default; 4676 Type *type = CallOperandVal->getType(); 4677 // Look at the constraint type. 
4678 switch (*constraint) { 4679 default: 4680 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 4681 break; 4682 case 'x': 4683 case 'w': 4684 if (type->isFloatingPointTy() || type->isVectorTy()) 4685 weight = CW_Register; 4686 break; 4687 case 'z': 4688 weight = CW_Constant; 4689 break; 4690 } 4691 return weight; 4692 } 4693 4694 std::pair<unsigned, const TargetRegisterClass *> 4695 AArch64TargetLowering::getRegForInlineAsmConstraint( 4696 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 4697 if (Constraint.size() == 1) { 4698 switch (Constraint[0]) { 4699 case 'r': 4700 if (VT.getSizeInBits() == 64) 4701 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 4702 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 4703 case 'w': 4704 if (VT.getSizeInBits() == 32) 4705 return std::make_pair(0U, &AArch64::FPR32RegClass); 4706 if (VT.getSizeInBits() == 64) 4707 return std::make_pair(0U, &AArch64::FPR64RegClass); 4708 if (VT.getSizeInBits() == 128) 4709 return std::make_pair(0U, &AArch64::FPR128RegClass); 4710 break; 4711 // The instructions that this constraint is designed for can 4712 // only take 128-bit registers so just use that regclass. 4713 case 'x': 4714 if (VT.getSizeInBits() == 128) 4715 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 4716 break; 4717 } 4718 } 4719 if (StringRef("{cc}").equals_lower(Constraint)) 4720 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 4721 4722 // Use the default implementation in TargetLowering to convert the register 4723 // constraint into a member of a register class. 4724 std::pair<unsigned, const TargetRegisterClass *> Res; 4725 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4726 4727 // Not found as a standard register? 4728 if (!Res.second) { 4729 unsigned Size = Constraint.size(); 4730 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 4731 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 4732 int RegNo; 4733 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); 4734 if (!Failed && RegNo >= 0 && RegNo <= 31) { 4735 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. 4736 // By default we'll emit v0-v31 for this unless there's a modifier where 4737 // we'll emit the correct register as well. 4738 if (VT != MVT::Other && VT.getSizeInBits() == 64) { 4739 Res.first = AArch64::FPR64RegClass.getRegister(RegNo); 4740 Res.second = &AArch64::FPR64RegClass; 4741 } else { 4742 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 4743 Res.second = &AArch64::FPR128RegClass; 4744 } 4745 } 4746 } 4747 } 4748 4749 return Res; 4750 } 4751 4752 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 4753 /// vector. If it is invalid, don't add anything to Ops. 4754 void AArch64TargetLowering::LowerAsmOperandForConstraint( 4755 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 4756 SelectionDAG &DAG) const { 4757 SDValue Result; 4758 4759 // Currently only support length 1 constraints. 4760 if (Constraint.length() != 1) 4761 return; 4762 4763 char ConstraintLetter = Constraint[0]; 4764 switch (ConstraintLetter) { 4765 default: 4766 break; 4767 4768 // This set of constraints deal with valid constants for various instructions. 4769 // Validate and return a target constant for them if we can. 4770 case 'z': { 4771 // 'z' maps to xzr or wzr so it needs an input of 0. 
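// For illustration: a constant zero tied to a "z" operand is rewritten below
// into a direct use of XZR (for i64) or WZR (for i32); anything else is
// rejected by the early return, i.e. no operand is added to Ops.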
4772 if (!isNullConstant(Op)) 4773 return; 4774 4775 if (Op.getValueType() == MVT::i64) 4776 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 4777 else 4778 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 4779 break; 4780 } 4781 4782 case 'I': 4783 case 'J': 4784 case 'K': 4785 case 'L': 4786 case 'M': 4787 case 'N': 4788 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4789 if (!C) 4790 return; 4791 4792 // Grab the value and do some validation. 4793 uint64_t CVal = C->getZExtValue(); 4794 switch (ConstraintLetter) { 4795 // The I constraint applies only to simple ADD or SUB immediate operands: 4796 // i.e. 0 to 4095 with optional shift by 12 4797 // The J constraint applies only to ADD or SUB immediates that would be 4798 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 4799 // instruction [or vice versa], in other words -1 to -4095 with optional 4800 // left shift by 12. 4801 case 'I': 4802 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 4803 break; 4804 return; 4805 case 'J': { 4806 uint64_t NVal = -C->getSExtValue(); 4807 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 4808 CVal = C->getSExtValue(); 4809 break; 4810 } 4811 return; 4812 } 4813 // The K and L constraints apply *only* to logical immediates, including 4814 // what used to be the MOVI alias for ORR (though the MOVI alias has now 4815 // been removed and MOV should be used). So these constraints have to 4816 // distinguish between bit patterns that are valid 32-bit or 64-bit 4817 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 4818 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 4819 // versa. 4820 case 'K': 4821 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4822 break; 4823 return; 4824 case 'L': 4825 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4826 break; 4827 return; 4828 // The M and N constraints are a superset of K and L respectively, for use 4829 // with the MOV (immediate) alias. As well as the logical immediates they 4830 // also match 32 or 64-bit immediates that can be loaded either using a 4831 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 4832 // (M) or 64-bit 0x1234000000000000 (N) etc. 4833 // As a note some of this code is liberally stolen from the asm parser. 4834 case 'M': { 4835 if (!isUInt<32>(CVal)) 4836 return; 4837 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4838 break; 4839 if ((CVal & 0xFFFF) == CVal) 4840 break; 4841 if ((CVal & 0xFFFF0000ULL) == CVal) 4842 break; 4843 uint64_t NCVal = ~(uint32_t)CVal; 4844 if ((NCVal & 0xFFFFULL) == NCVal) 4845 break; 4846 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4847 break; 4848 return; 4849 } 4850 case 'N': { 4851 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4852 break; 4853 if ((CVal & 0xFFFFULL) == CVal) 4854 break; 4855 if ((CVal & 0xFFFF0000ULL) == CVal) 4856 break; 4857 if ((CVal & 0xFFFF00000000ULL) == CVal) 4858 break; 4859 if ((CVal & 0xFFFF000000000000ULL) == CVal) 4860 break; 4861 uint64_t NCVal = ~CVal; 4862 if ((NCVal & 0xFFFFULL) == NCVal) 4863 break; 4864 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4865 break; 4866 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 4867 break; 4868 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 4869 break; 4870 return; 4871 } 4872 default: 4873 return; 4874 } 4875 4876 // All assembler immediates are 64-bit integers. 
4877 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); 4878 break; 4879 } 4880 4881 if (Result.getNode()) { 4882 Ops.push_back(Result); 4883 return; 4884 } 4885 4886 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 4887 } 4888 4889 //===----------------------------------------------------------------------===// 4890 // AArch64 Advanced SIMD Support 4891 //===----------------------------------------------------------------------===// 4892 4893 /// WidenVector - Given a value in the V64 register class, produce the 4894 /// equivalent value in the V128 register class. 4895 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 4896 EVT VT = V64Reg.getValueType(); 4897 unsigned NarrowSize = VT.getVectorNumElements(); 4898 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4899 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 4900 SDLoc DL(V64Reg); 4901 4902 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 4903 V64Reg, DAG.getConstant(0, DL, MVT::i32)); 4904 } 4905 4906 /// getExtFactor - Determine the adjustment factor for the position when 4907 /// generating an "extract from vector registers" instruction. 4908 static unsigned getExtFactor(SDValue &V) { 4909 EVT EltType = V.getValueType().getVectorElementType(); 4910 return EltType.getSizeInBits() / 8; 4911 } 4912 4913 /// NarrowVector - Given a value in the V128 register class, produce the 4914 /// equivalent value in the V64 register class. 4915 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 4916 EVT VT = V128Reg.getValueType(); 4917 unsigned WideSize = VT.getVectorNumElements(); 4918 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4919 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 4920 SDLoc DL(V128Reg); 4921 4922 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 4923 } 4924 4925 // Gather data to see if the operation can be modelled as a 4926 // shuffle in combination with VEXTs. 4927 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 4928 SelectionDAG &DAG) const { 4929 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 4930 SDLoc dl(Op); 4931 EVT VT = Op.getValueType(); 4932 unsigned NumElts = VT.getVectorNumElements(); 4933 4934 struct ShuffleSourceInfo { 4935 SDValue Vec; 4936 unsigned MinElt; 4937 unsigned MaxElt; 4938 4939 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 4940 // be compatible with the shuffle we intend to construct. As a result 4941 // ShuffleVec will be some sliding window into the original Vec. 4942 SDValue ShuffleVec; 4943 4944 // Code should guarantee that element i in Vec starts at element "WindowBase 4945 // + i * WindowScale in ShuffleVec". 4946 int WindowBase; 4947 int WindowScale; 4948 4949 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 4950 ShuffleSourceInfo(SDValue Vec) 4951 : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), 4952 WindowScale(1) {} 4953 }; 4954 4955 // First gather all vectors used as an immediate source for this BUILD_VECTOR 4956 // node. 4957 SmallVector<ShuffleSourceInfo, 2> Sources; 4958 for (unsigned i = 0; i < NumElts; ++i) { 4959 SDValue V = Op.getOperand(i); 4960 if (V.isUndef()) 4961 continue; 4962 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 4963 !isa<ConstantSDNode>(V.getOperand(1))) { 4964 // A shuffle can only come from building a vector from various 4965 // elements of other vectors, provided their indices are constant. 
4966 return SDValue(); 4967 } 4968 4969 // Add this element source to the list if it's not already there. 4970 SDValue SourceVec = V.getOperand(0); 4971 auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); 4972 if (Source == Sources.end()) 4973 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 4974 4975 // Update the minimum and maximum lane number seen. 4976 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 4977 Source->MinElt = std::min(Source->MinElt, EltNo); 4978 Source->MaxElt = std::max(Source->MaxElt, EltNo); 4979 } 4980 4981 // Currently only do something sane when at most two source vectors 4982 // are involved. 4983 if (Sources.size() > 2) 4984 return SDValue(); 4985 4986 // Find out the smallest element size among result and two sources, and use 4987 // it as element size to build the shuffle_vector. 4988 EVT SmallestEltTy = VT.getVectorElementType(); 4989 for (auto &Source : Sources) { 4990 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 4991 if (SrcEltTy.bitsLT(SmallestEltTy)) { 4992 SmallestEltTy = SrcEltTy; 4993 } 4994 } 4995 unsigned ResMultiplier = 4996 VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); 4997 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 4998 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 4999 5000 // If the source vector is too wide or too narrow, we may nevertheless be able 5001 // to construct a compatible shuffle either by concatenating it with UNDEF or 5002 // extracting a suitable range of elements. 5003 for (auto &Src : Sources) { 5004 EVT SrcVT = Src.ShuffleVec.getValueType(); 5005 5006 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 5007 continue; 5008 5009 // This stage of the search produces a source with the same element type as 5010 // the original, but with a total width matching the BUILD_VECTOR output. 5011 EVT EltVT = SrcVT.getVectorElementType(); 5012 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 5013 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 5014 5015 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 5016 assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); 5017 // We can pad out the smaller vector for free, so if it's part of a 5018 // shuffle... 
5019 Src.ShuffleVec = 5020 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 5021 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 5022 continue; 5023 } 5024 5025 assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); 5026 5027 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 5028 // Span too large for a VEXT to cope 5029 return SDValue(); 5030 } 5031 5032 if (Src.MinElt >= NumSrcElts) { 5033 // The extraction can just take the second half 5034 Src.ShuffleVec = 5035 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5036 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 5037 Src.WindowBase = -NumSrcElts; 5038 } else if (Src.MaxElt < NumSrcElts) { 5039 // The extraction can just take the first half 5040 Src.ShuffleVec = 5041 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5042 DAG.getConstant(0, dl, MVT::i64)); 5043 } else { 5044 // An actual VEXT is needed 5045 SDValue VEXTSrc1 = 5046 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5047 DAG.getConstant(0, dl, MVT::i64)); 5048 SDValue VEXTSrc2 = 5049 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5050 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 5051 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 5052 5053 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 5054 VEXTSrc2, 5055 DAG.getConstant(Imm, dl, MVT::i32)); 5056 Src.WindowBase = -Src.MinElt; 5057 } 5058 } 5059 5060 // Another possible incompatibility occurs from the vector element types. We 5061 // can fix this by bitcasting the source vectors to the same type we intend 5062 // for the shuffle. 5063 for (auto &Src : Sources) { 5064 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 5065 if (SrcEltTy == SmallestEltTy) 5066 continue; 5067 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 5068 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 5069 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 5070 Src.WindowBase *= Src.WindowScale; 5071 } 5072 5073 // Final sanity check before we try to actually produce a shuffle. 5074 DEBUG( 5075 for (auto Src : Sources) 5076 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 5077 ); 5078 5079 // The stars all align, our next step is to produce the mask for the shuffle. 5080 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 5081 int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); 5082 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 5083 SDValue Entry = Op.getOperand(i); 5084 if (Entry.isUndef()) 5085 continue; 5086 5087 auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); 5088 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 5089 5090 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 5091 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 5092 // segment. 5093 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 5094 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 5095 VT.getVectorElementType().getSizeInBits()); 5096 int LanesDefined = BitsDefined / BitsPerShuffleLane; 5097 5098 // This source is expected to fill ResMultiplier lanes of the final shuffle, 5099 // starting at the appropriate offset. 
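// A small worked example (illustrative only): rebuilding a v4i32 from elements
// of v8i16 sources makes SmallestEltTy i16, so ShuffleVT is v8i16 and
// ResMultiplier is 2. Each BUILD_VECTOR operand then owns a two-lane slot of
// the mask starting at i * 2, but only the first LanesDefined lanes of that
// slot (here one, because of the implicit any_ext noted above) receive a real
// index; the rest stay -1, i.e. undef.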
5100 int *LaneMask = &Mask[i * ResMultiplier]; 5101 5102 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 5103 ExtractBase += NumElts * (Src - Sources.begin()); 5104 for (int j = 0; j < LanesDefined; ++j) 5105 LaneMask[j] = ExtractBase + j; 5106 } 5107 5108 // Final check before we try to produce nonsense... 5109 if (!isShuffleMaskLegal(Mask, ShuffleVT)) 5110 return SDValue(); 5111 5112 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 5113 for (unsigned i = 0; i < Sources.size(); ++i) 5114 ShuffleOps[i] = Sources[i].ShuffleVec; 5115 5116 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 5117 ShuffleOps[1], Mask); 5118 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 5119 } 5120 5121 // Check if an EXT instruction can handle the shuffle mask when the 5122 // vector sources of the shuffle are the same. 5123 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 5124 unsigned NumElts = VT.getVectorNumElements(); 5125 5126 // Assume that the first shuffle index is not UNDEF. Fail if it is. 5127 if (M[0] < 0) 5128 return false; 5129 5130 Imm = M[0]; 5131 5132 // If this is a VEXT shuffle, the immediate value is the index of the first 5133 // element. The other shuffle indices must be the successive elements after 5134 // the first one. 5135 unsigned ExpectedElt = Imm; 5136 for (unsigned i = 1; i < NumElts; ++i) { 5137 // Increment the expected index. If it wraps around, just follow it 5138 // back to index zero and keep going. 5139 ++ExpectedElt; 5140 if (ExpectedElt == NumElts) 5141 ExpectedElt = 0; 5142 5143 if (M[i] < 0) 5144 continue; // ignore UNDEF indices 5145 if (ExpectedElt != static_cast<unsigned>(M[i])) 5146 return false; 5147 } 5148 5149 return true; 5150 } 5151 5152 // Check if an EXT instruction can handle the shuffle mask when the 5153 // vector sources of the shuffle are different. 5154 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, 5155 unsigned &Imm) { 5156 // Look for the first non-undef element. 5157 const int *FirstRealElt = std::find_if(M.begin(), M.end(), 5158 [](int Elt) {return Elt >= 0;}); 5159 5160 // Benefit from APInt to handle overflow when calculating the expected element. 5161 unsigned NumElts = VT.getVectorNumElements(); 5162 unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); 5163 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); 5164 // The following shuffle indices must be the successive elements after the 5165 // first real element. 5166 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), 5167 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); 5168 if (FirstWrongElt != M.end()) 5169 return false; 5170 5171 // The index of an EXT is the first element if it is not UNDEF. 5172 // Watch out for the beginning UNDEFs. The EXT index should be the expected 5173 // value of the first element. E.g. 5174 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. 5175 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. 5176 // ExpectedElt is the last mask index plus 1. 5177 Imm = ExpectedElt.getZExtValue(); 5178 5179 // There are two different cases that require reversing the input vectors. 5180 // For example, for vector <4 x i32> we have the following cases, 5181 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) 5182 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) 5183 // For both cases, we finally use mask <5, 6, 7, 0>, which requires 5184 // reversing the two input vectors.
5185 if (Imm < NumElts) 5186 ReverseEXT = true; 5187 else 5188 Imm -= NumElts; 5189 5190 return true; 5191 } 5192 5193 /// isREVMask - Check if a vector shuffle corresponds to a REV 5194 /// instruction with the specified blocksize. (The order of the elements 5195 /// within each block of the vector is reversed.) 5196 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 5197 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && 5198 "Only possible block sizes for REV are: 16, 32, 64"); 5199 5200 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5201 if (EltSz == 64) 5202 return false; 5203 5204 unsigned NumElts = VT.getVectorNumElements(); 5205 unsigned BlockElts = M[0] + 1; 5206 // If the first shuffle index is UNDEF, be optimistic. 5207 if (M[0] < 0) 5208 BlockElts = BlockSize / EltSz; 5209 5210 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 5211 return false; 5212 5213 for (unsigned i = 0; i < NumElts; ++i) { 5214 if (M[i] < 0) 5215 continue; // ignore UNDEF indices 5216 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) 5217 return false; 5218 } 5219 5220 return true; 5221 } 5222 5223 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5224 unsigned NumElts = VT.getVectorNumElements(); 5225 WhichResult = (M[0] == 0 ? 0 : 1); 5226 unsigned Idx = WhichResult * NumElts / 2; 5227 for (unsigned i = 0; i != NumElts; i += 2) { 5228 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 5229 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) 5230 return false; 5231 Idx += 1; 5232 } 5233 5234 return true; 5235 } 5236 5237 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5238 unsigned NumElts = VT.getVectorNumElements(); 5239 WhichResult = (M[0] == 0 ? 0 : 1); 5240 for (unsigned i = 0; i != NumElts; ++i) { 5241 if (M[i] < 0) 5242 continue; // ignore UNDEF indices 5243 if ((unsigned)M[i] != 2 * i + WhichResult) 5244 return false; 5245 } 5246 5247 return true; 5248 } 5249 5250 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5251 unsigned NumElts = VT.getVectorNumElements(); 5252 WhichResult = (M[0] == 0 ? 0 : 1); 5253 for (unsigned i = 0; i < NumElts; i += 2) { 5254 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 5255 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) 5256 return false; 5257 } 5258 return true; 5259 } 5260 5261 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of 5262 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5263 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 5264 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5265 unsigned NumElts = VT.getVectorNumElements(); 5266 WhichResult = (M[0] == 0 ? 0 : 1); 5267 unsigned Idx = WhichResult * NumElts / 2; 5268 for (unsigned i = 0; i != NumElts; i += 2) { 5269 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 5270 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 5271 return false; 5272 Idx += 1; 5273 } 5274 5275 return true; 5276 } 5277 5278 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 5279 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5280 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 5281 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5282 unsigned Half = VT.getVectorNumElements() / 2; 5283 WhichResult = (M[0] == 0 ? 
0 : 1); 5284 for (unsigned j = 0; j != 2; ++j) { 5285 unsigned Idx = WhichResult; 5286 for (unsigned i = 0; i != Half; ++i) { 5287 int MIdx = M[i + j * Half]; 5288 if (MIdx >= 0 && (unsigned)MIdx != Idx) 5289 return false; 5290 Idx += 2; 5291 } 5292 } 5293 5294 return true; 5295 } 5296 5297 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 5298 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5299 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 5300 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5301 unsigned NumElts = VT.getVectorNumElements(); 5302 WhichResult = (M[0] == 0 ? 0 : 1); 5303 for (unsigned i = 0; i < NumElts; i += 2) { 5304 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 5305 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 5306 return false; 5307 } 5308 return true; 5309 } 5310 5311 static bool isINSMask(ArrayRef<int> M, int NumInputElements, 5312 bool &DstIsLeft, int &Anomaly) { 5313 if (M.size() != static_cast<size_t>(NumInputElements)) 5314 return false; 5315 5316 int NumLHSMatch = 0, NumRHSMatch = 0; 5317 int LastLHSMismatch = -1, LastRHSMismatch = -1; 5318 5319 for (int i = 0; i < NumInputElements; ++i) { 5320 if (M[i] == -1) { 5321 ++NumLHSMatch; 5322 ++NumRHSMatch; 5323 continue; 5324 } 5325 5326 if (M[i] == i) 5327 ++NumLHSMatch; 5328 else 5329 LastLHSMismatch = i; 5330 5331 if (M[i] == i + NumInputElements) 5332 ++NumRHSMatch; 5333 else 5334 LastRHSMismatch = i; 5335 } 5336 5337 if (NumLHSMatch == NumInputElements - 1) { 5338 DstIsLeft = true; 5339 Anomaly = LastLHSMismatch; 5340 return true; 5341 } else if (NumRHSMatch == NumInputElements - 1) { 5342 DstIsLeft = false; 5343 Anomaly = LastRHSMismatch; 5344 return true; 5345 } 5346 5347 return false; 5348 } 5349 5350 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 5351 if (VT.getSizeInBits() != 128) 5352 return false; 5353 5354 unsigned NumElts = VT.getVectorNumElements(); 5355 5356 for (int I = 0, E = NumElts / 2; I != E; I++) { 5357 if (Mask[I] != I) 5358 return false; 5359 } 5360 5361 int Offset = NumElts / 2; 5362 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 5363 if (Mask[I] != I + SplitLHS * Offset) 5364 return false; 5365 } 5366 5367 return true; 5368 } 5369 5370 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 5371 SDLoc DL(Op); 5372 EVT VT = Op.getValueType(); 5373 SDValue V0 = Op.getOperand(0); 5374 SDValue V1 = Op.getOperand(1); 5375 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 5376 5377 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 5378 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 5379 return SDValue(); 5380 5381 bool SplitV0 = V0.getValueType().getSizeInBits() == 128; 5382 5383 if (!isConcatMask(Mask, VT, SplitV0)) 5384 return SDValue(); 5385 5386 EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 5387 VT.getVectorNumElements() / 2); 5388 if (SplitV0) { 5389 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, 5390 DAG.getConstant(0, DL, MVT::i64)); 5391 } 5392 if (V1.getValueType().getSizeInBits() == 128) { 5393 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, 5394 DAG.getConstant(0, DL, MVT::i64)); 5395 } 5396 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); 5397 } 5398 5399 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 5400 /// the specified operations to build the shuffle. 
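/// Each table entry is a packed 32-bit word, decoded below: bits [31:30] hold
/// the cost (consumed by the caller before it gets here), bits [29:26] the
/// operation to emit, and bits [25:13] / [12:0] the 13-bit table IDs of the
/// left and right operand shuffles, which are expanded recursively.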
5401 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 5402 SDValue RHS, SelectionDAG &DAG, 5403 const SDLoc &dl) { 5404 unsigned OpNum = (PFEntry >> 26) & 0x0F; 5405 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); 5406 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); 5407 5408 enum { 5409 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 5410 OP_VREV, 5411 OP_VDUP0, 5412 OP_VDUP1, 5413 OP_VDUP2, 5414 OP_VDUP3, 5415 OP_VEXT1, 5416 OP_VEXT2, 5417 OP_VEXT3, 5418 OP_VUZPL, // VUZP, left result 5419 OP_VUZPR, // VUZP, right result 5420 OP_VZIPL, // VZIP, left result 5421 OP_VZIPR, // VZIP, right result 5422 OP_VTRNL, // VTRN, left result 5423 OP_VTRNR // VTRN, right result 5424 }; 5425 5426 if (OpNum == OP_COPY) { 5427 if (LHSID == (1 * 9 + 2) * 9 + 3) 5428 return LHS; 5429 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); 5430 return RHS; 5431 } 5432 5433 SDValue OpLHS, OpRHS; 5434 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 5435 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 5436 EVT VT = OpLHS.getValueType(); 5437 5438 switch (OpNum) { 5439 default: 5440 llvm_unreachable("Unknown shuffle opcode!"); 5441 case OP_VREV: 5442 // VREV divides the vector in half and swaps within the half. 5443 if (VT.getVectorElementType() == MVT::i32 || 5444 VT.getVectorElementType() == MVT::f32) 5445 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); 5446 // vrev <4 x i16> -> REV32 5447 if (VT.getVectorElementType() == MVT::i16 || 5448 VT.getVectorElementType() == MVT::f16) 5449 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); 5450 // vrev <4 x i8> -> REV16 5451 assert(VT.getVectorElementType() == MVT::i8); 5452 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); 5453 case OP_VDUP0: 5454 case OP_VDUP1: 5455 case OP_VDUP2: 5456 case OP_VDUP3: { 5457 EVT EltTy = VT.getVectorElementType(); 5458 unsigned Opcode; 5459 if (EltTy == MVT::i8) 5460 Opcode = AArch64ISD::DUPLANE8; 5461 else if (EltTy == MVT::i16 || EltTy == MVT::f16) 5462 Opcode = AArch64ISD::DUPLANE16; 5463 else if (EltTy == MVT::i32 || EltTy == MVT::f32) 5464 Opcode = AArch64ISD::DUPLANE32; 5465 else if (EltTy == MVT::i64 || EltTy == MVT::f64) 5466 Opcode = AArch64ISD::DUPLANE64; 5467 else 5468 llvm_unreachable("Invalid vector element type?"); 5469 5470 if (VT.getSizeInBits() == 64) 5471 OpLHS = WidenVector(OpLHS, DAG); 5472 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); 5473 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); 5474 } 5475 case OP_VEXT1: 5476 case OP_VEXT2: 5477 case OP_VEXT3: { 5478 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); 5479 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, 5480 DAG.getConstant(Imm, dl, MVT::i32)); 5481 } 5482 case OP_VUZPL: 5483 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, 5484 OpRHS); 5485 case OP_VUZPR: 5486 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, 5487 OpRHS); 5488 case OP_VZIPL: 5489 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, 5490 OpRHS); 5491 case OP_VZIPR: 5492 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 5493 OpRHS); 5494 case OP_VTRNL: 5495 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 5496 OpRHS); 5497 case OP_VTRNR: 5498 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 5499 OpRHS); 5500 } 5501 } 5502 5503 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> 
ShuffleMask, 5504 SelectionDAG &DAG) { 5505 // Check to see if we can use the TBL instruction. 5506 SDValue V1 = Op.getOperand(0); 5507 SDValue V2 = Op.getOperand(1); 5508 SDLoc DL(Op); 5509 5510 EVT EltVT = Op.getValueType().getVectorElementType(); 5511 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 5512 5513 SmallVector<SDValue, 8> TBLMask; 5514 for (int Val : ShuffleMask) { 5515 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5516 unsigned Offset = Byte + Val * BytesPerElt; 5517 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); 5518 } 5519 } 5520 5521 MVT IndexVT = MVT::v8i8; 5522 unsigned IndexLen = 8; 5523 if (Op.getValueType().getSizeInBits() == 128) { 5524 IndexVT = MVT::v16i8; 5525 IndexLen = 16; 5526 } 5527 5528 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 5529 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 5530 5531 SDValue Shuffle; 5532 if (V2.getNode()->isUndef()) { 5533 if (IndexLen == 8) 5534 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 5535 Shuffle = DAG.getNode( 5536 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5537 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 5538 DAG.getBuildVector(IndexVT, DL, 5539 makeArrayRef(TBLMask.data(), IndexLen))); 5540 } else { 5541 if (IndexLen == 8) { 5542 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 5543 Shuffle = DAG.getNode( 5544 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5545 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 5546 DAG.getBuildVector(IndexVT, DL, 5547 makeArrayRef(TBLMask.data(), IndexLen))); 5548 } else { 5549 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 5550 // cannot currently represent the register constraints on the input 5551 // table registers. 5552 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 5553 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], 5554 // IndexLen)); 5555 Shuffle = DAG.getNode( 5556 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5557 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, 5558 V2Cst, DAG.getBuildVector(IndexVT, DL, 5559 makeArrayRef(TBLMask.data(), IndexLen))); 5560 } 5561 } 5562 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 5563 } 5564 5565 static unsigned getDUPLANEOp(EVT EltType) { 5566 if (EltType == MVT::i8) 5567 return AArch64ISD::DUPLANE8; 5568 if (EltType == MVT::i16 || EltType == MVT::f16) 5569 return AArch64ISD::DUPLANE16; 5570 if (EltType == MVT::i32 || EltType == MVT::f32) 5571 return AArch64ISD::DUPLANE32; 5572 if (EltType == MVT::i64 || EltType == MVT::f64) 5573 return AArch64ISD::DUPLANE64; 5574 5575 llvm_unreachable("Invalid vector element type?"); 5576 } 5577 5578 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 5579 SelectionDAG &DAG) const { 5580 SDLoc dl(Op); 5581 EVT VT = Op.getValueType(); 5582 5583 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 5584 5585 // Convert shuffles that are directly supported on NEON to target-specific 5586 // DAG nodes, instead of keeping them as shuffles and matching them again 5587 // during code selection. This is more efficient and avoids the possibility 5588 // of inconsistencies between legalization and selection. 5589 ArrayRef<int> ShuffleMask = SVN->getMask(); 5590 5591 SDValue V1 = Op.getOperand(0); 5592 SDValue V2 = Op.getOperand(1); 5593 5594 if (SVN->isSplat()) { 5595 int Lane = SVN->getSplatIndex(); 5596 // If this is undef splat, generate it via "just" vdup, if possible. 
5597 if (Lane == -1) 5598 Lane = 0; 5599 5600 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) 5601 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), 5602 V1.getOperand(0)); 5603 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- 5604 // constant. If so, we can just reference the lane's definition directly. 5605 if (V1.getOpcode() == ISD::BUILD_VECTOR && 5606 !isa<ConstantSDNode>(V1.getOperand(Lane))) 5607 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); 5608 5609 // Otherwise, duplicate from the lane of the input vector. 5610 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); 5611 5612 // SelectionDAGBuilder may have "helpfully" already extracted or concatenated 5613 // to make a vector of the same size as this SHUFFLE. We can ignore the 5614 // extract entirely, and canonicalise the concat using WidenVector. 5615 if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { 5616 Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue(); 5617 V1 = V1.getOperand(0); 5618 } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { 5619 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; 5620 Lane -= Idx * VT.getVectorNumElements() / 2; 5621 V1 = WidenVector(V1.getOperand(Idx), DAG); 5622 } else if (VT.getSizeInBits() == 64) 5623 V1 = WidenVector(V1, DAG); 5624 5625 return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64)); 5626 } 5627 5628 if (isREVMask(ShuffleMask, VT, 64)) 5629 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); 5630 if (isREVMask(ShuffleMask, VT, 32)) 5631 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); 5632 if (isREVMask(ShuffleMask, VT, 16)) 5633 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); 5634 5635 bool ReverseEXT = false; 5636 unsigned Imm; 5637 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { 5638 if (ReverseEXT) 5639 std::swap(V1, V2); 5640 Imm *= getExtFactor(V1); 5641 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, 5642 DAG.getConstant(Imm, dl, MVT::i32)); 5643 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { 5644 Imm *= getExtFactor(V1); 5645 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, 5646 DAG.getConstant(Imm, dl, MVT::i32)); 5647 } 5648 5649 unsigned WhichResult; 5650 if (isZIPMask(ShuffleMask, VT, WhichResult)) { 5651 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 5652 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 5653 } 5654 if (isUZPMask(ShuffleMask, VT, WhichResult)) { 5655 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 5656 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 5657 } 5658 if (isTRNMask(ShuffleMask, VT, WhichResult)) { 5659 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 5660 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 5661 } 5662 5663 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 5664 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 5665 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5666 } 5667 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 5668 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 5669 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5670 } 5671 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 5672 unsigned Opc = (WhichResult == 0) ?
AArch64ISD::TRN1 : AArch64ISD::TRN2; 5673 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5674 } 5675 5676 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) 5677 return Concat; 5678 5679 bool DstIsLeft; 5680 int Anomaly; 5681 int NumInputElements = V1.getValueType().getVectorNumElements(); 5682 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { 5683 SDValue DstVec = DstIsLeft ? V1 : V2; 5684 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); 5685 5686 SDValue SrcVec = V1; 5687 int SrcLane = ShuffleMask[Anomaly]; 5688 if (SrcLane >= NumInputElements) { 5689 SrcVec = V2; 5690 SrcLane -= VT.getVectorNumElements(); 5691 } 5692 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); 5693 5694 EVT ScalarVT = VT.getVectorElementType(); 5695 5696 if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) 5697 ScalarVT = MVT::i32; 5698 5699 return DAG.getNode( 5700 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 5701 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), 5702 DstLaneV); 5703 } 5704 5705 // If the shuffle is not directly supported and it has 4 elements, use 5706 // the PerfectShuffle-generated table to synthesize it from other shuffles. 5707 unsigned NumElts = VT.getVectorNumElements(); 5708 if (NumElts == 4) { 5709 unsigned PFIndexes[4]; 5710 for (unsigned i = 0; i != 4; ++i) { 5711 if (ShuffleMask[i] < 0) 5712 PFIndexes[i] = 8; 5713 else 5714 PFIndexes[i] = ShuffleMask[i]; 5715 } 5716 5717 // Compute the index in the perfect shuffle table. 5718 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 5719 PFIndexes[2] * 9 + PFIndexes[3]; 5720 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 5721 unsigned Cost = (PFEntry >> 30); 5722 5723 if (Cost <= 4) 5724 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 5725 } 5726 5727 return GenerateTBL(Op, ShuffleMask, DAG); 5728 } 5729 5730 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, 5731 APInt &UndefBits) { 5732 EVT VT = BVN->getValueType(0); 5733 APInt SplatBits, SplatUndef; 5734 unsigned SplatBitSize; 5735 bool HasAnyUndefs; 5736 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 5737 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; 5738 5739 for (unsigned i = 0; i < NumSplats; ++i) { 5740 CnstBits <<= SplatBitSize; 5741 UndefBits <<= SplatBitSize; 5742 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); 5743 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); 5744 } 5745 5746 return true; 5747 } 5748 5749 return false; 5750 } 5751 5752 SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, 5753 SelectionDAG &DAG) const { 5754 BuildVectorSDNode *BVN = 5755 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 5756 SDValue LHS = Op.getOperand(0); 5757 SDLoc dl(Op); 5758 EVT VT = Op.getValueType(); 5759 5760 if (!BVN) 5761 return Op; 5762 5763 APInt CnstBits(VT.getSizeInBits(), 0); 5764 APInt UndefBits(VT.getSizeInBits(), 0); 5765 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 5766 // We only have BIC vector immediate instruction, which is and-not. 5767 CnstBits = ~CnstBits; 5768 5769 // We make use of a little bit of goto ickiness in order to avoid having to 5770 // duplicate the immediate matching logic for the undef toggled case. 
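// Concretely, the matching block below runs at most twice: first with the
// (inverted) constant bits computed above and, if no BIC-style immediate
// matches, once more with CnstBits re-derived from the undef bits
// (CnstBits = ~UndefBits) before giving up and falling through to the plain
// AND at the end of the function.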
5771 bool SecondTry = false; 5772 AttemptModImm: 5773 5774 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 5775 CnstBits = CnstBits.zextOrTrunc(64); 5776 uint64_t CnstVal = CnstBits.getZExtValue(); 5777 5778 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 5779 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 5780 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5781 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5782 DAG.getConstant(CnstVal, dl, MVT::i32), 5783 DAG.getConstant(0, dl, MVT::i32)); 5784 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5785 } 5786 5787 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 5788 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 5789 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5790 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5791 DAG.getConstant(CnstVal, dl, MVT::i32), 5792 DAG.getConstant(8, dl, MVT::i32)); 5793 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5794 } 5795 5796 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 5797 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 5798 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5799 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5800 DAG.getConstant(CnstVal, dl, MVT::i32), 5801 DAG.getConstant(16, dl, MVT::i32)); 5802 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5803 } 5804 5805 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 5806 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 5807 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5808 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5809 DAG.getConstant(CnstVal, dl, MVT::i32), 5810 DAG.getConstant(24, dl, MVT::i32)); 5811 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5812 } 5813 5814 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 5815 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 5816 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 5817 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5818 DAG.getConstant(CnstVal, dl, MVT::i32), 5819 DAG.getConstant(0, dl, MVT::i32)); 5820 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5821 } 5822 5823 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 5824 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 5825 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 5826 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5827 DAG.getConstant(CnstVal, dl, MVT::i32), 5828 DAG.getConstant(8, dl, MVT::i32)); 5829 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5830 } 5831 } 5832 5833 if (SecondTry) 5834 goto FailedModImm; 5835 SecondTry = true; 5836 CnstBits = ~UndefBits; 5837 goto AttemptModImm; 5838 } 5839 5840 // We can always fall back to a non-immediate AND. 
5841 FailedModImm: 5842 return Op; 5843 } 5844 5845 // Specialized code to quickly find if PotentialBVec is a BuildVector that 5846 // consists of only the same constant int value, returned in reference arg 5847 // ConstVal 5848 static bool isAllConstantBuildVector(const SDValue &PotentialBVec, 5849 uint64_t &ConstVal) { 5850 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); 5851 if (!Bvec) 5852 return false; 5853 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); 5854 if (!FirstElt) 5855 return false; 5856 EVT VT = Bvec->getValueType(0); 5857 unsigned NumElts = VT.getVectorNumElements(); 5858 for (unsigned i = 1; i < NumElts; ++i) 5859 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) 5860 return false; 5861 ConstVal = FirstElt->getZExtValue(); 5862 return true; 5863 } 5864 5865 static unsigned getIntrinsicID(const SDNode *N) { 5866 unsigned Opcode = N->getOpcode(); 5867 switch (Opcode) { 5868 default: 5869 return Intrinsic::not_intrinsic; 5870 case ISD::INTRINSIC_WO_CHAIN: { 5871 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 5872 if (IID < Intrinsic::num_intrinsics) 5873 return IID; 5874 return Intrinsic::not_intrinsic; 5875 } 5876 } 5877 } 5878 5879 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), 5880 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a 5881 // BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. 5882 // Also, logical shift right -> sri, with the same structure. 5883 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { 5884 EVT VT = N->getValueType(0); 5885 5886 if (!VT.isVector()) 5887 return SDValue(); 5888 5889 SDLoc DL(N); 5890 5891 // Is the first op an AND? 5892 const SDValue And = N->getOperand(0); 5893 if (And.getOpcode() != ISD::AND) 5894 return SDValue(); 5895 5896 // Is the second op an shl or lshr? 5897 SDValue Shift = N->getOperand(1); 5898 // This will have been turned into: AArch64ISD::VSHL vector, #shift 5899 // or AArch64ISD::VLSHR vector, #shift 5900 unsigned ShiftOpc = Shift.getOpcode(); 5901 if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) 5902 return SDValue(); 5903 bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; 5904 5905 // Is the shift amount constant? 5906 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 5907 if (!C2node) 5908 return SDValue(); 5909 5910 // Is the and mask vector all constant? 5911 uint64_t C1; 5912 if (!isAllConstantBuildVector(And.getOperand(1), C1)) 5913 return SDValue(); 5914 5915 // Is C1 == ~C2, taking into account how much one can shift elements of a 5916 // particular size? 5917 uint64_t C2 = C2node->getZExtValue(); 5918 unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); 5919 if (C2 > ElemSizeInBits) 5920 return SDValue(); 5921 unsigned ElemMask = (1 << ElemSizeInBits) - 1; 5922 if ((C1 & ElemMask) != (~C2 & ElemMask)) 5923 return SDValue(); 5924 5925 SDValue X = And.getOperand(0); 5926 SDValue Y = Shift.getOperand(0); 5927 5928 unsigned Intrin = 5929 IsShiftRight ? 
Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; 5930 SDValue ResultSLI = 5931 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 5932 DAG.getConstant(Intrin, DL, MVT::i32), X, Y, 5933 Shift.getOperand(1)); 5934 5935 DEBUG(dbgs() << "aarch64-lower: transformed: \n"); 5936 DEBUG(N->dump(&DAG)); 5937 DEBUG(dbgs() << "into: \n"); 5938 DEBUG(ResultSLI->dump(&DAG)); 5939 5940 ++NumShiftInserts; 5941 return ResultSLI; 5942 } 5943 5944 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, 5945 SelectionDAG &DAG) const { 5946 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) 5947 if (EnableAArch64SlrGeneration) { 5948 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) 5949 return Res; 5950 } 5951 5952 BuildVectorSDNode *BVN = 5953 dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); 5954 SDValue LHS = Op.getOperand(1); 5955 SDLoc dl(Op); 5956 EVT VT = Op.getValueType(); 5957 5958 // OR commutes, so try swapping the operands. 5959 if (!BVN) { 5960 LHS = Op.getOperand(0); 5961 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 5962 } 5963 if (!BVN) 5964 return Op; 5965 5966 APInt CnstBits(VT.getSizeInBits(), 0); 5967 APInt UndefBits(VT.getSizeInBits(), 0); 5968 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 5969 // We make use of a little bit of goto ickiness in order to avoid having to 5970 // duplicate the immediate matching logic for the undef toggled case. 5971 bool SecondTry = false; 5972 AttemptModImm: 5973 5974 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 5975 CnstBits = CnstBits.zextOrTrunc(64); 5976 uint64_t CnstVal = CnstBits.getZExtValue(); 5977 5978 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 5979 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 5980 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5981 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5982 DAG.getConstant(CnstVal, dl, MVT::i32), 5983 DAG.getConstant(0, dl, MVT::i32)); 5984 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5985 } 5986 5987 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 5988 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 5989 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5990 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5991 DAG.getConstant(CnstVal, dl, MVT::i32), 5992 DAG.getConstant(8, dl, MVT::i32)); 5993 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5994 } 5995 5996 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 5997 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 5998 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5999 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 6000 DAG.getConstant(CnstVal, dl, MVT::i32), 6001 DAG.getConstant(16, dl, MVT::i32)); 6002 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6003 } 6004 6005 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 6006 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 6007 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6008 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 6009 DAG.getConstant(CnstVal, dl, MVT::i32), 6010 DAG.getConstant(24, dl, MVT::i32)); 6011 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6012 } 6013 6014 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 6015 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 6016 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; 6017 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 6018 DAG.getConstant(CnstVal, dl, MVT::i32), 6019 DAG.getConstant(0, dl, MVT::i32)); 6020 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6021 } 6022 6023 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 6024 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 6025 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6026 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 6027 DAG.getConstant(CnstVal, dl, MVT::i32), 6028 DAG.getConstant(8, dl, MVT::i32)); 6029 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6030 } 6031 } 6032 6033 if (SecondTry) 6034 goto FailedModImm; 6035 SecondTry = true; 6036 CnstBits = UndefBits; 6037 goto AttemptModImm; 6038 } 6039 6040 // We can always fall back to a non-immediate OR. 6041 FailedModImm: 6042 return Op; 6043 } 6044 6045 // Normalize the operands of BUILD_VECTOR. The value of constant operands will 6046 // be truncated to fit element width. 6047 static SDValue NormalizeBuildVector(SDValue Op, 6048 SelectionDAG &DAG) { 6049 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 6050 SDLoc dl(Op); 6051 EVT VT = Op.getValueType(); 6052 EVT EltTy= VT.getVectorElementType(); 6053 6054 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) 6055 return Op; 6056 6057 SmallVector<SDValue, 16> Ops; 6058 for (SDValue Lane : Op->ops()) { 6059 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { 6060 APInt LowBits(EltTy.getSizeInBits(), 6061 CstLane->getZExtValue()); 6062 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); 6063 } 6064 Ops.push_back(Lane); 6065 } 6066 return DAG.getBuildVector(VT, dl, Ops); 6067 } 6068 6069 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, 6070 SelectionDAG &DAG) const { 6071 SDLoc dl(Op); 6072 EVT VT = Op.getValueType(); 6073 Op = NormalizeBuildVector(Op, DAG); 6074 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 6075 6076 APInt CnstBits(VT.getSizeInBits(), 0); 6077 APInt UndefBits(VT.getSizeInBits(), 0); 6078 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 6079 // We make use of a little bit of goto ickiness in order to avoid having to 6080 // duplicate the immediate matching logic for the undef toggled case. 6081 bool SecondTry = false; 6082 AttemptModImm: 6083 6084 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 6085 CnstBits = CnstBits.zextOrTrunc(64); 6086 uint64_t CnstVal = CnstBits.getZExtValue(); 6087 6088 // Certain magic vector constants (used to express things like NOT 6089 // and NEG) are passed through unmodified. This allows codegen patterns 6090 // for these operations to match. Special-purpose patterns will lower 6091 // these immediates to MOVIs if it proves necessary. 6092 if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) 6093 return Op; 6094 6095 // The many faces of MOVI... 6096 if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { 6097 CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); 6098 if (VT.getSizeInBits() == 128) { 6099 SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, 6100 DAG.getConstant(CnstVal, dl, MVT::i32)); 6101 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6102 } 6103 6104 // Support the V64 version via subregister insertion. 
6105 SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, 6106 DAG.getConstant(CnstVal, dl, MVT::i32)); 6107 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6108 } 6109 6110 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 6111 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 6112 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6113 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6114 DAG.getConstant(CnstVal, dl, MVT::i32), 6115 DAG.getConstant(0, dl, MVT::i32)); 6116 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6117 } 6118 6119 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 6120 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 6121 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6122 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6123 DAG.getConstant(CnstVal, dl, MVT::i32), 6124 DAG.getConstant(8, dl, MVT::i32)); 6125 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6126 } 6127 6128 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 6129 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 6130 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6131 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6132 DAG.getConstant(CnstVal, dl, MVT::i32), 6133 DAG.getConstant(16, dl, MVT::i32)); 6134 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6135 } 6136 6137 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 6138 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 6139 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6140 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6141 DAG.getConstant(CnstVal, dl, MVT::i32), 6142 DAG.getConstant(24, dl, MVT::i32)); 6143 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6144 } 6145 6146 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 6147 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 6148 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6149 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6150 DAG.getConstant(CnstVal, dl, MVT::i32), 6151 DAG.getConstant(0, dl, MVT::i32)); 6152 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6153 } 6154 6155 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 6156 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 6157 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6158 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6159 DAG.getConstant(CnstVal, dl, MVT::i32), 6160 DAG.getConstant(8, dl, MVT::i32)); 6161 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6162 } 6163 6164 if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { 6165 CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); 6166 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6167 SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, 6168 DAG.getConstant(CnstVal, dl, MVT::i32), 6169 DAG.getConstant(264, dl, MVT::i32)); 6170 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6171 } 6172 6173 if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { 6174 CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); 6175 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 6176 SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, 6177 DAG.getConstant(CnstVal, dl, MVT::i32), 6178 DAG.getConstant(272, dl, MVT::i32)); 6179 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6180 } 6181 6182 if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { 6183 CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); 6184 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; 6185 SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, 6186 DAG.getConstant(CnstVal, dl, MVT::i32)); 6187 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6188 } 6189 6190 // The few faces of FMOV... 6191 if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { 6192 CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); 6193 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; 6194 SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, 6195 DAG.getConstant(CnstVal, dl, MVT::i32)); 6196 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6197 } 6198 6199 if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && 6200 VT.getSizeInBits() == 128) { 6201 CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); 6202 SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, 6203 DAG.getConstant(CnstVal, dl, MVT::i32)); 6204 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6205 } 6206 6207 // The many faces of MVNI... 6208 CnstVal = ~CnstVal; 6209 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 6210 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 6211 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6212 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6213 DAG.getConstant(CnstVal, dl, MVT::i32), 6214 DAG.getConstant(0, dl, MVT::i32)); 6215 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6216 } 6217 6218 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 6219 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 6220 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6221 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6222 DAG.getConstant(CnstVal, dl, MVT::i32), 6223 DAG.getConstant(8, dl, MVT::i32)); 6224 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6225 } 6226 6227 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 6228 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 6229 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6230 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6231 DAG.getConstant(CnstVal, dl, MVT::i32), 6232 DAG.getConstant(16, dl, MVT::i32)); 6233 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6234 } 6235 6236 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 6237 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 6238 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6239 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6240 DAG.getConstant(CnstVal, dl, MVT::i32), 6241 DAG.getConstant(24, dl, MVT::i32)); 6242 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6243 } 6244 6245 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 6246 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 6247 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; 6248 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6249 DAG.getConstant(CnstVal, dl, MVT::i32), 6250 DAG.getConstant(0, dl, MVT::i32)); 6251 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6252 } 6253 6254 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 6255 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 6256 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6257 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6258 DAG.getConstant(CnstVal, dl, MVT::i32), 6259 DAG.getConstant(8, dl, MVT::i32)); 6260 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6261 } 6262 6263 if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { 6264 CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); 6265 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6266 SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, 6267 DAG.getConstant(CnstVal, dl, MVT::i32), 6268 DAG.getConstant(264, dl, MVT::i32)); 6269 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6270 } 6271 6272 if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { 6273 CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); 6274 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6275 SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, 6276 DAG.getConstant(CnstVal, dl, MVT::i32), 6277 DAG.getConstant(272, dl, MVT::i32)); 6278 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6279 } 6280 } 6281 6282 if (SecondTry) 6283 goto FailedModImm; 6284 SecondTry = true; 6285 CnstBits = UndefBits; 6286 goto AttemptModImm; 6287 } 6288 FailedModImm: 6289 6290 // Scan through the operands to find some interesting properties we can 6291 // exploit: 6292 // 1) If only one value is used, we can use a DUP, or 6293 // 2) if only the low element is not undef, we can just insert that, or 6294 // 3) if only one constant value is used (w/ some non-constant lanes), 6295 // we can splat the constant value into the whole vector then fill 6296 // in the non-constant lanes. 6297 // 4) FIXME: If different constant values are used, but we can intelligently 6298 // select the values we'll be overwriting for the non-constant 6299 // lanes such that we can directly materialize the vector 6300 // some other way (MOVI, e.g.), we can be sneaky. 6301 unsigned NumElts = VT.getVectorNumElements(); 6302 bool isOnlyLowElement = true; 6303 bool usesOnlyOneValue = true; 6304 bool usesOnlyOneConstantValue = true; 6305 bool isConstant = true; 6306 unsigned NumConstantLanes = 0; 6307 SDValue Value; 6308 SDValue ConstantValue; 6309 for (unsigned i = 0; i < NumElts; ++i) { 6310 SDValue V = Op.getOperand(i); 6311 if (V.isUndef()) 6312 continue; 6313 if (i > 0) 6314 isOnlyLowElement = false; 6315 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 6316 isConstant = false; 6317 6318 if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) { 6319 ++NumConstantLanes; 6320 if (!ConstantValue.getNode()) 6321 ConstantValue = V; 6322 else if (ConstantValue != V) 6323 usesOnlyOneConstantValue = false; 6324 } 6325 6326 if (!Value.getNode()) 6327 Value = V; 6328 else if (V != Value) 6329 usesOnlyOneValue = false; 6330 } 6331 6332 if (!Value.getNode()) 6333 return DAG.getUNDEF(VT); 6334 6335 if (isOnlyLowElement) 6336 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 6337 6338 // Use DUP for non-constant splats. For f32 constant splats, reduce to 6339 // i32 and try again. 
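  // Illustrative example (assumed, not from the original comments): the splat
  //   (v4f32 build_vector 1.0, 1.0, 1.0, 1.0)
  // is rewritten lane-by-lane as
  //   (v4i32 build_vector 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000)
  // and re-lowered, giving the MOVI/FMOV immediate matching above a second
  // chance before we fall back to a constant-pool load.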
6340 if (usesOnlyOneValue) { 6341 if (!isConstant) { 6342 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 6343 Value.getValueType() != VT) 6344 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); 6345 6346 // This is actually a DUPLANExx operation, which keeps everything vectory. 6347 6348 // DUPLANE works on 128-bit vectors, widen it if necessary. 6349 SDValue Lane = Value.getOperand(1); 6350 Value = Value.getOperand(0); 6351 if (Value.getValueType().getSizeInBits() == 64) 6352 Value = WidenVector(Value, DAG); 6353 6354 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 6355 return DAG.getNode(Opcode, dl, VT, Value, Lane); 6356 } 6357 6358 if (VT.getVectorElementType().isFloatingPoint()) { 6359 SmallVector<SDValue, 8> Ops; 6360 EVT EltTy = VT.getVectorElementType(); 6361 assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && 6362 "Unsupported floating-point vector type"); 6363 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); 6364 for (unsigned i = 0; i < NumElts; ++i) 6365 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); 6366 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); 6367 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 6368 Val = LowerBUILD_VECTOR(Val, DAG); 6369 if (Val.getNode()) 6370 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6371 } 6372 } 6373 6374 // If there was only one constant value used and for more than one lane, 6375 // start by splatting that value, then replace the non-constant lanes. This 6376 // is better than the default, which will perform a separate initialization 6377 // for each lane. 6378 if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { 6379 SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); 6380 // Now insert the non-constant lanes. 6381 for (unsigned i = 0; i < NumElts; ++i) { 6382 SDValue V = Op.getOperand(i); 6383 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 6384 if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) { 6385 // Note that type legalization likely mucked about with the VT of the 6386 // source operand, so we may have to convert it here before inserting. 6387 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); 6388 } 6389 } 6390 return Val; 6391 } 6392 6393 // If all elements are constants and the case above didn't get hit, fall back 6394 // to the default expansion, which will generate a load from the constant 6395 // pool. 6396 if (isConstant) 6397 return SDValue(); 6398 6399 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 6400 if (NumElts >= 4) { 6401 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 6402 return shuffle; 6403 } 6404 6405 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 6406 // know the default expansion would otherwise fall back on something even 6407 // worse. For a vector with one or two non-undef values, that's 6408 // scalar_to_vector for the elements followed by a shuffle (provided the 6409 // shuffle is valid for the target) and materialization element by element 6410 // on the stack followed by a load for everything else. 
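  // Minimal sketch of the intended lowering (illustrative, value names
  // assumed): for (v2f64 build_vector %a, %b) where %a is neither undef nor a
  // load, the code below produces roughly
  //   %vec0 = INSERT_SUBREG(undef, %a, dsub)   // lane 0 without a
  //                                            // read-modify-write of the
  //                                            // full vector register
  //   %vec1 = INSERT_VECTOR_ELT(%vec0, %b, 1)
  // instead of materializing the vector on the stack and reloading it.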
6411 if (!isConstant && !usesOnlyOneValue) { 6412 SDValue Vec = DAG.getUNDEF(VT); 6413 SDValue Op0 = Op.getOperand(0); 6414 unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); 6415 unsigned i = 0; 6416 // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to 6417 // a) Avoid a RMW dependency on the full vector register, and 6418 // b) Allow the register coalescer to fold away the copy if the 6419 // value is already in an S or D register. 6420 // Do not do this for UNDEF/LOAD nodes because we have better patterns 6421 // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. 6422 if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && 6423 (ElemSize == 32 || ElemSize == 64)) { 6424 unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; 6425 MachineSDNode *N = 6426 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, 6427 DAG.getTargetConstant(SubIdx, dl, MVT::i32)); 6428 Vec = SDValue(N, 0); 6429 ++i; 6430 } 6431 for (; i < NumElts; ++i) { 6432 SDValue V = Op.getOperand(i); 6433 if (V.isUndef()) 6434 continue; 6435 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 6436 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 6437 } 6438 return Vec; 6439 } 6440 6441 // Just use the default expansion. We failed to find a better alternative. 6442 return SDValue(); 6443 } 6444 6445 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 6446 SelectionDAG &DAG) const { 6447 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); 6448 6449 // Check for non-constant or out of range lane. 6450 EVT VT = Op.getOperand(0).getValueType(); 6451 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 6452 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 6453 return SDValue(); 6454 6455 6456 // Insertion/extraction are legal for V128 types. 6457 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 6458 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 6459 VT == MVT::v8f16) 6460 return Op; 6461 6462 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 6463 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) 6464 return SDValue(); 6465 6466 // For V64 types, we perform insertion by expanding the value 6467 // to a V128 type and perform the insertion on that. 6468 SDLoc DL(Op); 6469 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 6470 EVT WideTy = WideVec.getValueType(); 6471 6472 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, 6473 Op.getOperand(1), Op.getOperand(2)); 6474 // Re-narrow the resultant vector. 6475 return NarrowVector(Node, DAG); 6476 } 6477 6478 SDValue 6479 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6480 SelectionDAG &DAG) const { 6481 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); 6482 6483 // Check for non-constant or out of range lane. 6484 EVT VT = Op.getOperand(0).getValueType(); 6485 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6486 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 6487 return SDValue(); 6488 6489 6490 // Insertion/extraction are legal for V128 types. 
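  // For example (illustrative, not from the original comments): extracting
  // lane 1 of a v4i32 value maps onto a single lane move from the 128-bit
  // register, whereas a v2i32 source is first widened to v4i32 below and the
  // extract is performed on the widened value.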
6491 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 6492 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 6493 VT == MVT::v8f16) 6494 return Op; 6495 6496 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 6497 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) 6498 return SDValue(); 6499 6500 // For V64 types, we perform extraction by expanding the value 6501 // to a V128 type and perform the extraction on that. 6502 SDLoc DL(Op); 6503 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 6504 EVT WideTy = WideVec.getValueType(); 6505 6506 EVT ExtrTy = WideTy.getVectorElementType(); 6507 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) 6508 ExtrTy = MVT::i32; 6509 6510 // For extractions, we just return the result directly. 6511 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, 6512 Op.getOperand(1)); 6513 } 6514 6515 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 6516 SelectionDAG &DAG) const { 6517 EVT VT = Op.getOperand(0).getValueType(); 6518 SDLoc dl(Op); 6519 // Just in case... 6520 if (!VT.isVector()) 6521 return SDValue(); 6522 6523 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6524 if (!Cst) 6525 return SDValue(); 6526 unsigned Val = Cst->getZExtValue(); 6527 6528 unsigned Size = Op.getValueType().getSizeInBits(); 6529 6530 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. 6531 if (Val == 0) 6532 return Op; 6533 6534 // If this is extracting the upper 64-bits of a 128-bit vector, we match 6535 // that directly. 6536 if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) 6537 return Op; 6538 6539 return SDValue(); 6540 } 6541 6542 bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 6543 EVT VT) const { 6544 if (VT.getVectorNumElements() == 4 && 6545 (VT.is128BitVector() || VT.is64BitVector())) { 6546 unsigned PFIndexes[4]; 6547 for (unsigned i = 0; i != 4; ++i) { 6548 if (M[i] < 0) 6549 PFIndexes[i] = 8; 6550 else 6551 PFIndexes[i] = M[i]; 6552 } 6553 6554 // Compute the index in the perfect shuffle table. 6555 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 6556 PFIndexes[2] * 9 + PFIndexes[3]; 6557 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6558 unsigned Cost = (PFEntry >> 30); 6559 6560 if (Cost <= 4) 6561 return true; 6562 } 6563 6564 bool DummyBool; 6565 int DummyInt; 6566 unsigned DummyUnsigned; 6567 6568 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || 6569 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || 6570 isEXTMask(M, VT, DummyBool, DummyUnsigned) || 6571 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. 6572 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || 6573 isZIPMask(M, VT, DummyUnsigned) || 6574 isTRN_v_undef_Mask(M, VT, DummyUnsigned) || 6575 isUZP_v_undef_Mask(M, VT, DummyUnsigned) || 6576 isZIP_v_undef_Mask(M, VT, DummyUnsigned) || 6577 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || 6578 isConcatMask(M, VT, VT.getSizeInBits() == 128)); 6579 } 6580 6581 /// getVShiftImm - Check if this is a valid build_vector for the immediate 6582 /// operand of a vector shift operation, where all the elements of the 6583 /// build_vector must have the same constant integer value. 6584 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6585 // Ignore bit_converts. 
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                    HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits / 2 for a narrowing right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}

SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  int64_t Cnt;

  if (!Op.getOperand(1).getValueType().isVector())
    return Op;
  unsigned EltSize = VT.getVectorElementType().getSizeInBits();

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, DL, MVT::i32));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
                                       MVT::i32),
                       Op.getOperand(0), Op.getOperand(1));
  case ISD::SRA:
  case ISD::SRL:
    // Right shift immediate.
    if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
      unsigned Opc =
          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
      return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, DL, MVT::i32));
    }

    // Right shift register. Note that there is no shift-right-register
    // instruction, but the shift-left-register instruction takes a signed
    // value, where negative numbers specify a right shift.
    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
                                                : Intrinsic::aarch64_neon_ushl;
    // Negate the shift amount.
    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
    SDValue NegShiftLeft =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
                    NegShift);
    return NegShiftLeft;
  }

  return SDValue();
}

static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                    const SDLoc &dl, SelectionDAG &DAG) {
  EVT SrcVT = LHS.getValueType();
  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
         "function only supposed to emit natural comparisons");

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
  bool IsZero = IsCnst && (CnstBits == 0);

  if (SrcVT.getVectorElementType().isFloatingPoint()) {
    switch (CC) {
    default:
      return SDValue();
    case AArch64CC::NE: {
      SDValue Fcmeq;
      if (IsZero)
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      else
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
    }
    case AArch64CC::EQ:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
    case AArch64CC::GE:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
    case AArch64CC::GT:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
    case AArch64CC::LS:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
    case AArch64CC::LT:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
      // Fallthrough.
6718 case AArch64CC::MI: 6719 if (IsZero) 6720 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); 6721 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); 6722 } 6723 } 6724 6725 switch (CC) { 6726 default: 6727 return SDValue(); 6728 case AArch64CC::NE: { 6729 SDValue Cmeq; 6730 if (IsZero) 6731 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 6732 else 6733 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 6734 return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); 6735 } 6736 case AArch64CC::EQ: 6737 if (IsZero) 6738 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 6739 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 6740 case AArch64CC::GE: 6741 if (IsZero) 6742 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); 6743 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); 6744 case AArch64CC::GT: 6745 if (IsZero) 6746 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); 6747 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); 6748 case AArch64CC::LE: 6749 if (IsZero) 6750 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); 6751 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); 6752 case AArch64CC::LS: 6753 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); 6754 case AArch64CC::LO: 6755 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); 6756 case AArch64CC::LT: 6757 if (IsZero) 6758 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); 6759 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); 6760 case AArch64CC::HI: 6761 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); 6762 case AArch64CC::HS: 6763 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); 6764 } 6765 } 6766 6767 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, 6768 SelectionDAG &DAG) const { 6769 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6770 SDValue LHS = Op.getOperand(0); 6771 SDValue RHS = Op.getOperand(1); 6772 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); 6773 SDLoc dl(Op); 6774 6775 if (LHS.getValueType().getVectorElementType().isInteger()) { 6776 assert(LHS.getValueType() == RHS.getValueType()); 6777 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 6778 SDValue Cmp = 6779 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); 6780 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 6781 } 6782 6783 if (LHS.getValueType().getVectorElementType() == MVT::f16) 6784 return SDValue(); 6785 6786 assert(LHS.getValueType().getVectorElementType() == MVT::f32 || 6787 LHS.getValueType().getVectorElementType() == MVT::f64); 6788 6789 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 6790 // clean. Some of them require two branches to implement. 6791 AArch64CC::CondCode CC1, CC2; 6792 bool ShouldInvert; 6793 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); 6794 6795 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; 6796 SDValue Cmp = 6797 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); 6798 if (!Cmp.getNode()) 6799 return SDValue(); 6800 6801 if (CC2 != AArch64CC::AL) { 6802 SDValue Cmp2 = 6803 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); 6804 if (!Cmp2.getNode()) 6805 return SDValue(); 6806 6807 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); 6808 } 6809 6810 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 6811 6812 if (ShouldInvert) 6813 return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); 6814 6815 return Cmp; 6816 } 6817 6818 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 6819 /// MemIntrinsicNodes. 
The associated MachineMemOperands record the alignment 6820 /// specified in the intrinsic calls. 6821 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 6822 const CallInst &I, 6823 unsigned Intrinsic) const { 6824 auto &DL = I.getModule()->getDataLayout(); 6825 switch (Intrinsic) { 6826 case Intrinsic::aarch64_neon_ld2: 6827 case Intrinsic::aarch64_neon_ld3: 6828 case Intrinsic::aarch64_neon_ld4: 6829 case Intrinsic::aarch64_neon_ld1x2: 6830 case Intrinsic::aarch64_neon_ld1x3: 6831 case Intrinsic::aarch64_neon_ld1x4: 6832 case Intrinsic::aarch64_neon_ld2lane: 6833 case Intrinsic::aarch64_neon_ld3lane: 6834 case Intrinsic::aarch64_neon_ld4lane: 6835 case Intrinsic::aarch64_neon_ld2r: 6836 case Intrinsic::aarch64_neon_ld3r: 6837 case Intrinsic::aarch64_neon_ld4r: { 6838 Info.opc = ISD::INTRINSIC_W_CHAIN; 6839 // Conservatively set memVT to the entire set of vectors loaded. 6840 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 6841 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 6842 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 6843 Info.offset = 0; 6844 Info.align = 0; 6845 Info.vol = false; // volatile loads with NEON intrinsics not supported 6846 Info.readMem = true; 6847 Info.writeMem = false; 6848 return true; 6849 } 6850 case Intrinsic::aarch64_neon_st2: 6851 case Intrinsic::aarch64_neon_st3: 6852 case Intrinsic::aarch64_neon_st4: 6853 case Intrinsic::aarch64_neon_st1x2: 6854 case Intrinsic::aarch64_neon_st1x3: 6855 case Intrinsic::aarch64_neon_st1x4: 6856 case Intrinsic::aarch64_neon_st2lane: 6857 case Intrinsic::aarch64_neon_st3lane: 6858 case Intrinsic::aarch64_neon_st4lane: { 6859 Info.opc = ISD::INTRINSIC_VOID; 6860 // Conservatively set memVT to the entire set of vectors stored. 
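    // Illustrative example (operand layout assumed from the loop below): each
    // 128-bit vector operand that the loop counts contributes 128 / 64 = 2 to
    // NumElts, and counting stops at the trailing pointer operand, so memVT
    // conservatively covers the whole block of vectors written by the stN.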
6861 unsigned NumElts = 0; 6862 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 6863 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 6864 if (!ArgTy->isVectorTy()) 6865 break; 6866 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 6867 } 6868 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 6869 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 6870 Info.offset = 0; 6871 Info.align = 0; 6872 Info.vol = false; // volatile stores with NEON intrinsics not supported 6873 Info.readMem = false; 6874 Info.writeMem = true; 6875 return true; 6876 } 6877 case Intrinsic::aarch64_ldaxr: 6878 case Intrinsic::aarch64_ldxr: { 6879 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 6880 Info.opc = ISD::INTRINSIC_W_CHAIN; 6881 Info.memVT = MVT::getVT(PtrTy->getElementType()); 6882 Info.ptrVal = I.getArgOperand(0); 6883 Info.offset = 0; 6884 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 6885 Info.vol = true; 6886 Info.readMem = true; 6887 Info.writeMem = false; 6888 return true; 6889 } 6890 case Intrinsic::aarch64_stlxr: 6891 case Intrinsic::aarch64_stxr: { 6892 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 6893 Info.opc = ISD::INTRINSIC_W_CHAIN; 6894 Info.memVT = MVT::getVT(PtrTy->getElementType()); 6895 Info.ptrVal = I.getArgOperand(1); 6896 Info.offset = 0; 6897 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 6898 Info.vol = true; 6899 Info.readMem = false; 6900 Info.writeMem = true; 6901 return true; 6902 } 6903 case Intrinsic::aarch64_ldaxp: 6904 case Intrinsic::aarch64_ldxp: { 6905 Info.opc = ISD::INTRINSIC_W_CHAIN; 6906 Info.memVT = MVT::i128; 6907 Info.ptrVal = I.getArgOperand(0); 6908 Info.offset = 0; 6909 Info.align = 16; 6910 Info.vol = true; 6911 Info.readMem = true; 6912 Info.writeMem = false; 6913 return true; 6914 } 6915 case Intrinsic::aarch64_stlxp: 6916 case Intrinsic::aarch64_stxp: { 6917 Info.opc = ISD::INTRINSIC_W_CHAIN; 6918 Info.memVT = MVT::i128; 6919 Info.ptrVal = I.getArgOperand(2); 6920 Info.offset = 0; 6921 Info.align = 16; 6922 Info.vol = true; 6923 Info.readMem = false; 6924 Info.writeMem = true; 6925 return true; 6926 } 6927 default: 6928 break; 6929 } 6930 6931 return false; 6932 } 6933 6934 // Truncations from 64-bit GPR to 32-bit GPR is free. 6935 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 6936 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 6937 return false; 6938 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 6939 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 6940 return NumBits1 > NumBits2; 6941 } 6942 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 6943 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 6944 return false; 6945 unsigned NumBits1 = VT1.getSizeInBits(); 6946 unsigned NumBits2 = VT2.getSizeInBits(); 6947 return NumBits1 > NumBits2; 6948 } 6949 6950 /// Check if it is profitable to hoist instruction in then/else to if. 6951 /// Not profitable if I and it's user can form a FMA instruction 6952 /// because we prefer FMSUB/FMADD. 
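/// For example (an illustrative scenario, not from the original comment): if
/// both arms of a branch compute t = a * b and one arm then uses c + t while
/// the other uses c - t, hoisting only the fmul into the common block would
/// separate it from its fadd/fsub user and block fmadd/fmsub formation during
/// per-block instruction selection.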
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
  if (I->getOpcode() != Instruction::FMul)
    return true;

  if (I->getNumUses() != 1)
    return true;

  Instruction *User = I->user_back();

  if (User &&
      !(User->getOpcode() == Instruction::FSub ||
        User->getOpcode() == Instruction::FAdd))
    return true;

  const TargetOptions &Options = getTargetMachine().Options;
  const DataLayout &DL = I->getModule()->getDataLayout();
  EVT VT = getValueType(DL, User->getOperand(0)->getType());

  return !(isFMAFasterThanFMulAndFAdd(VT) &&
           isOperationLegalOrCustom(ISD::FMA, VT) &&
           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
            Options.UnsafeFPMath));
}

// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}

bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2)) {
    return true;
  }

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
          VT1.getSizeInBits() <= 32);
}

bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
  if (isa<FPExtInst>(Ext))
    return false;

  // Vector types are not free.
  if (Ext->getType()->isVectorTy())
    return false;

  for (const Use &U : Ext->uses()) {
    // The extension is free if we can fold it with a left shift in an
    // addressing mode or an arithmetic operation: add, sub, and cmp.

    // Is there a shift?
    const Instruction *Instr = cast<Instruction>(U.getUser());

    // Is this a constant shift?
    switch (Instr->getOpcode()) {
    case Instruction::Shl:
      if (!isa<ConstantInt>(Instr->getOperand(1)))
        return false;
      break;
    case Instruction::GetElementPtr: {
      gep_type_iterator GTI = gep_type_begin(Instr);
      auto &DL = Ext->getModule()->getDataLayout();
      std::advance(GTI, U.getOperandNo());
      Type *IdxTy = *GTI;
      // This extension will end up with a shift because of the scaling factor.
      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
      // Get the shift amount based on the scaling factor:
      // log2(sizeof(IdxTy)) - log2(8).
      uint64_t ShiftAmt =
          countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
      // Is the constant foldable in the shift of the addressing mode?
      // I.e., shift amount is between 1 and 4 inclusive.
      if (ShiftAmt == 0 || ShiftAmt > 4)
        return false;
      break;
    }
    case Instruction::Trunc:
      // Check if this is a noop.
      // trunc(sext ty1 to ty2) to ty1.
7050 if (Instr->getType() == Ext->getOperand(0)->getType()) 7051 continue; 7052 // FALL THROUGH. 7053 default: 7054 return false; 7055 } 7056 7057 // At this point we can use the bfm family, so this extension is free 7058 // for that use. 7059 } 7060 return true; 7061 } 7062 7063 bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, 7064 unsigned &RequiredAligment) const { 7065 if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) 7066 return false; 7067 // Cyclone supports unaligned accesses. 7068 RequiredAligment = 0; 7069 unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); 7070 return NumBits == 32 || NumBits == 64; 7071 } 7072 7073 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, 7074 unsigned &RequiredAligment) const { 7075 if (!LoadedType.isSimple() || 7076 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) 7077 return false; 7078 // Cyclone supports unaligned accesses. 7079 RequiredAligment = 0; 7080 unsigned NumBits = LoadedType.getSizeInBits(); 7081 return NumBits == 32 || NumBits == 64; 7082 } 7083 7084 /// \brief Lower an interleaved load into a ldN intrinsic. 7085 /// 7086 /// E.g. Lower an interleaved load (Factor = 2): 7087 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr 7088 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 7089 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 7090 /// 7091 /// Into: 7092 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) 7093 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 7094 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 7095 bool AArch64TargetLowering::lowerInterleavedLoad( 7096 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 7097 ArrayRef<unsigned> Indices, unsigned Factor) const { 7098 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 7099 "Invalid interleave factor"); 7100 assert(!Shuffles.empty() && "Empty shufflevector input"); 7101 assert(Shuffles.size() == Indices.size() && 7102 "Unmatched number of shufflevectors and indices"); 7103 7104 const DataLayout &DL = LI->getModule()->getDataLayout(); 7105 7106 VectorType *VecTy = Shuffles[0]->getType(); 7107 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 7108 7109 // Skip if we do not have NEON and skip illegal vector types. 7110 if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) 7111 return false; 7112 7113 // A pointer vector can not be the return type of the ldN intrinsics. Need to 7114 // load integer vectors first and then convert to pointer vectors. 7115 Type *EltTy = VecTy->getVectorElementType(); 7116 if (EltTy->isPointerTy()) 7117 VecTy = 7118 VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); 7119 7120 Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); 7121 Type *Tys[2] = {VecTy, PtrTy}; 7122 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, 7123 Intrinsic::aarch64_neon_ld3, 7124 Intrinsic::aarch64_neon_ld4}; 7125 Function *LdNFunc = 7126 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 7127 7128 IRBuilder<> Builder(LI); 7129 Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); 7130 7131 CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); 7132 7133 // Replace uses of each shufflevector with the corresponding vector loaded 7134 // by ldN. 
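  // Continuing the Factor = 2 example above (illustrative): the <0, 2, 4, 6>
  // shufflevector is replaced by sub-vector 0 of the ld2 result and the
  // <1, 3, 5, 7> shufflevector by sub-vector 1.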
7135 for (unsigned i = 0; i < Shuffles.size(); i++) { 7136 ShuffleVectorInst *SVI = Shuffles[i]; 7137 unsigned Index = Indices[i]; 7138 7139 Value *SubVec = Builder.CreateExtractValue(LdN, Index); 7140 7141 // Convert the integer vector to pointer vector if the element is pointer. 7142 if (EltTy->isPointerTy()) 7143 SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); 7144 7145 SVI->replaceAllUsesWith(SubVec); 7146 } 7147 7148 return true; 7149 } 7150 7151 /// \brief Get a mask consisting of sequential integers starting from \p Start. 7152 /// 7153 /// I.e. <Start, Start + 1, ..., Start + NumElts - 1> 7154 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, 7155 unsigned NumElts) { 7156 SmallVector<Constant *, 16> Mask; 7157 for (unsigned i = 0; i < NumElts; i++) 7158 Mask.push_back(Builder.getInt32(Start + i)); 7159 7160 return ConstantVector::get(Mask); 7161 } 7162 7163 /// \brief Lower an interleaved store into a stN intrinsic. 7164 /// 7165 /// E.g. Lower an interleaved store (Factor = 3): 7166 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 7167 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 7168 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 7169 /// 7170 /// Into: 7171 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 7172 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 7173 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 7174 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 7175 /// 7176 /// Note that the new shufflevectors will be removed and we'll only generate one 7177 /// st3 instruction in CodeGen. 7178 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, 7179 ShuffleVectorInst *SVI, 7180 unsigned Factor) const { 7181 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 7182 "Invalid interleave factor"); 7183 7184 VectorType *VecTy = SVI->getType(); 7185 assert(VecTy->getVectorNumElements() % Factor == 0 && 7186 "Invalid interleaved store"); 7187 7188 unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; 7189 Type *EltTy = VecTy->getVectorElementType(); 7190 VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); 7191 7192 const DataLayout &DL = SI->getModule()->getDataLayout(); 7193 unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); 7194 7195 // Skip if we do not have NEON and skip illegal vector types. 7196 if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) 7197 return false; 7198 7199 Value *Op0 = SVI->getOperand(0); 7200 Value *Op1 = SVI->getOperand(1); 7201 IRBuilder<> Builder(SI); 7202 7203 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 7204 // vectors to integer vectors. 7205 if (EltTy->isPointerTy()) { 7206 Type *IntTy = DL.getIntPtrType(EltTy); 7207 unsigned NumOpElts = 7208 dyn_cast<VectorType>(Op0->getType())->getVectorNumElements(); 7209 7210 // Convert to the corresponding integer vector. 
7211 Type *IntVecTy = VectorType::get(IntTy, NumOpElts); 7212 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 7213 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 7214 7215 SubVecTy = VectorType::get(IntTy, NumSubElts); 7216 } 7217 7218 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); 7219 Type *Tys[2] = {SubVecTy, PtrTy}; 7220 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, 7221 Intrinsic::aarch64_neon_st3, 7222 Intrinsic::aarch64_neon_st4}; 7223 Function *StNFunc = 7224 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); 7225 7226 SmallVector<Value *, 5> Ops; 7227 7228 // Split the shufflevector operands into sub vectors for the new stN call. 7229 for (unsigned i = 0; i < Factor; i++) 7230 Ops.push_back(Builder.CreateShuffleVector( 7231 Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); 7232 7233 Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); 7234 Builder.CreateCall(StNFunc, Ops); 7235 return true; 7236 } 7237 7238 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, 7239 unsigned AlignCheck) { 7240 return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && 7241 (DstAlign == 0 || DstAlign % AlignCheck == 0)); 7242 } 7243 7244 EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 7245 unsigned SrcAlign, bool IsMemset, 7246 bool ZeroMemset, 7247 bool MemcpyStrSrc, 7248 MachineFunction &MF) const { 7249 // Don't use AdvSIMD to implement 16-byte memset. It would have taken one 7250 // instruction to materialize the v2i64 zero and one store (with restrictive 7251 // addressing mode). Just do two i64 store of zero-registers. 7252 bool Fast; 7253 const Function *F = MF.getFunction(); 7254 if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && 7255 !F->hasFnAttribute(Attribute::NoImplicitFloat) && 7256 (memOpAlign(SrcAlign, DstAlign, 16) || 7257 (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) 7258 return MVT::f128; 7259 7260 if (Size >= 8 && 7261 (memOpAlign(SrcAlign, DstAlign, 8) || 7262 (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast))) 7263 return MVT::i64; 7264 7265 if (Size >= 4 && 7266 (memOpAlign(SrcAlign, DstAlign, 4) || 7267 (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast))) 7268 return MVT::i32; 7269 7270 return MVT::Other; 7271 } 7272 7273 // 12-bit optionally shifted immediates are legal for adds. 7274 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { 7275 // Avoid UB for INT64_MIN. 7276 if (Immed == std::numeric_limits<int64_t>::min()) 7277 return false; 7278 // Same encoding for add/sub, just flip the sign. 7279 Immed = std::abs(Immed); 7280 return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); 7281 } 7282 7283 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid 7284 // immediates is the same as for an add or a sub. 7285 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { 7286 return isLegalAddImmediate(Immed); 7287 } 7288 7289 /// isLegalAddressingMode - Return true if the addressing mode represented 7290 /// by AM is legal for this target, for a load/store of the specified type. 
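/// A rough assembly-level sketch of the five modes listed below (illustrative
/// examples only, not taken from the original comments):
///   ldr x0, [x1]                 // reg
///   ldur x0, [x1, #-256]         // reg + 9-bit signed offset
///   ldr x0, [x1, #32760]         // reg + 8 * 12-bit unsigned offset (i64)
///   ldr x0, [x1, x2]             // reg1 + reg2
///   ldr x0, [x1, x2, lsl #3]     // reg + 8 * reg (i64)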
7291 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, 7292 const AddrMode &AM, Type *Ty, 7293 unsigned AS) const { 7294 // AArch64 has five basic addressing modes: 7295 // reg 7296 // reg + 9-bit signed offset 7297 // reg + SIZE_IN_BYTES * 12-bit unsigned offset 7298 // reg1 + reg2 7299 // reg + SIZE_IN_BYTES * reg 7300 7301 // No global is ever allowed as a base. 7302 if (AM.BaseGV) 7303 return false; 7304 7305 // No reg+reg+imm addressing. 7306 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) 7307 return false; 7308 7309 // check reg + imm case: 7310 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 7311 uint64_t NumBytes = 0; 7312 if (Ty->isSized()) { 7313 uint64_t NumBits = DL.getTypeSizeInBits(Ty); 7314 NumBytes = NumBits / 8; 7315 if (!isPowerOf2_64(NumBits)) 7316 NumBytes = 0; 7317 } 7318 7319 if (!AM.Scale) { 7320 int64_t Offset = AM.BaseOffs; 7321 7322 // 9-bit signed offset 7323 if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) 7324 return true; 7325 7326 // 12-bit unsigned offset 7327 unsigned shift = Log2_64(NumBytes); 7328 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && 7329 // Must be a multiple of NumBytes (NumBytes is a power of 2) 7330 (Offset >> shift) << shift == Offset) 7331 return true; 7332 return false; 7333 } 7334 7335 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 7336 7337 return !AM.Scale || AM.Scale == 1 || 7338 (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); 7339 } 7340 7341 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, 7342 const AddrMode &AM, Type *Ty, 7343 unsigned AS) const { 7344 // Scaling factors are not free at all. 7345 // Operands | Rt Latency 7346 // ------------------------------------------- 7347 // Rt, [Xn, Xm] | 4 7348 // ------------------------------------------- 7349 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 7350 // Rt, [Xn, Wm, <extend> #imm] | 7351 if (isLegalAddressingMode(DL, AM, Ty, AS)) 7352 // Scale represents reg2 * scale, thus account for 1 if 7353 // it is not equal to 0 or 1. 7354 return AM.Scale != 0 && AM.Scale != 1; 7355 return -1; 7356 } 7357 7358 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 7359 VT = VT.getScalarType(); 7360 7361 if (!VT.isSimple()) 7362 return false; 7363 7364 switch (VT.getSimpleVT().SimpleTy) { 7365 case MVT::f32: 7366 case MVT::f64: 7367 return true; 7368 default: 7369 break; 7370 } 7371 7372 return false; 7373 } 7374 7375 const MCPhysReg * 7376 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { 7377 // LR is a callee-save register, but we must treat it as clobbered by any call 7378 // site. Hence we include LR in the scratch registers, which are in turn added 7379 // as implicit-defs for stackmaps and patchpoints. 7380 static const MCPhysReg ScratchRegs[] = { 7381 AArch64::X16, AArch64::X17, AArch64::LR, 0 7382 }; 7383 return ScratchRegs; 7384 } 7385 7386 bool 7387 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { 7388 EVT VT = N->getValueType(0); 7389 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine 7390 // it with shift to let it be lowered to UBFX. 
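  // For example (illustrative): (and (srl x, 8), 0xff) extracts bits [15:8]
  // and is matched as a single UBFX; commuting the AND with the shift would
  // hide that pattern from the UBFX matcher.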
7391 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && 7392 isa<ConstantSDNode>(N->getOperand(1))) { 7393 uint64_t TruncMask = N->getConstantOperandVal(1); 7394 if (isMask_64(TruncMask) && 7395 N->getOperand(0).getOpcode() == ISD::SRL && 7396 isa<ConstantSDNode>(N->getOperand(0)->getOperand(1))) 7397 return false; 7398 } 7399 return true; 7400 } 7401 7402 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 7403 Type *Ty) const { 7404 assert(Ty->isIntegerTy()); 7405 7406 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 7407 if (BitSize == 0) 7408 return false; 7409 7410 int64_t Val = Imm.getSExtValue(); 7411 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) 7412 return true; 7413 7414 if ((int64_t)Val < 0) 7415 Val = ~Val; 7416 if (BitSize == 32) 7417 Val &= (1LL << 32) - 1; 7418 7419 unsigned LZ = countLeadingZeros((uint64_t)Val); 7420 unsigned Shift = (63 - LZ) / 16; 7421 // MOVZ is free so return true for one or fewer MOVK. 7422 return Shift < 3; 7423 } 7424 7425 /// Turn vector tests of the signbit in the form of: 7426 /// xor (sra X, elt_size(X)-1), -1 7427 /// into: 7428 /// cmge X, X, #0 7429 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, 7430 const AArch64Subtarget *Subtarget) { 7431 EVT VT = N->getValueType(0); 7432 if (!Subtarget->hasNEON() || !VT.isVector()) 7433 return SDValue(); 7434 7435 // There must be a shift right algebraic before the xor, and the xor must be a 7436 // 'not' operation. 7437 SDValue Shift = N->getOperand(0); 7438 SDValue Ones = N->getOperand(1); 7439 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || 7440 !ISD::isBuildVectorAllOnes(Ones.getNode())) 7441 return SDValue(); 7442 7443 // The shift should be smearing the sign bit across each vector element. 7444 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 7445 EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); 7446 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) 7447 return SDValue(); 7448 7449 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); 7450 } 7451 7452 // Generate SUBS and CSEL for integer abs. 7453 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 7454 EVT VT = N->getValueType(0); 7455 7456 SDValue N0 = N->getOperand(0); 7457 SDValue N1 = N->getOperand(1); 7458 SDLoc DL(N); 7459 7460 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 7461 // and change it to SUB and CSEL. 7462 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 7463 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && 7464 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) 7465 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 7466 if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { 7467 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 7468 N0.getOperand(0)); 7469 // Generate SUBS & CSEL. 
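// That is: compute Neg = 0 - x, set the flags by comparing x with zero, and
// CSEL x (on PL, i.e. x >= 0) against Neg, which yields abs(x).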
7470 SDValue Cmp =
7471 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
7472 N0.getOperand(0), DAG.getConstant(0, DL, VT));
7473 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
7474 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
7475 SDValue(Cmp.getNode(), 1));
7476 }
7477 return SDValue();
7478 }
7479
7480 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
7481 TargetLowering::DAGCombinerInfo &DCI,
7482 const AArch64Subtarget *Subtarget) {
7483 if (DCI.isBeforeLegalizeOps())
7484 return SDValue();
7485
7486 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
7487 return Cmp;
7488
7489 return performIntegerAbsCombine(N, DAG);
7490 }
7491
7492 SDValue
7493 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
7494 SelectionDAG &DAG,
7495 std::vector<SDNode *> *Created) const {
7496 AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
7497 if (isIntDivCheap(N->getValueType(0), Attr))
7498 return SDValue(N,0); // Lower SDIV as SDIV
7499
7500 // fold (sdiv X, pow2)
7501 EVT VT = N->getValueType(0);
7502 if ((VT != MVT::i32 && VT != MVT::i64) ||
7503 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
7504 return SDValue();
7505
7506 SDLoc DL(N);
7507 SDValue N0 = N->getOperand(0);
7508 unsigned Lg2 = Divisor.countTrailingZeros();
7509 SDValue Zero = DAG.getConstant(0, DL, VT);
7510 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
7511
7512 // Add (N0 < 0) ? Pow2 - 1 : 0;
7513 SDValue CCVal;
7514 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
7515 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
7516 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
7517
7518 if (Created) {
7519 Created->push_back(Cmp.getNode());
7520 Created->push_back(Add.getNode());
7521 Created->push_back(CSel.getNode());
7522 }
7523
7524 // Divide by pow2.
7525 SDValue SRA =
7526 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
7527
7528 // If we're dividing by a positive value, we're done. Otherwise, we must
7529 // negate the result.
7530 if (Divisor.isNonNegative())
7531 return SRA;
7532
7533 if (Created)
7534 Created->push_back(SRA.getNode());
7535 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
7536 }
7537
7538 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
7539 TargetLowering::DAGCombinerInfo &DCI,
7540 const AArch64Subtarget *Subtarget) {
7541 if (DCI.isBeforeLegalizeOps())
7542 return SDValue();
7543
7544 // Multiplication of a power of two plus/minus one can be done more
7545 // cheaply as shift+add/sub. For now, this is true unilaterally. If
7546 // future CPUs have a cheaper MADD instruction, this may need to be
7547 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
7548 // 64-bit is 5 cycles, so this is always a win.
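// For example, (mul x, 9) becomes (add (shl x, 3), x) and (mul x, -15)
// becomes (sub x, (shl x, 4)), matching the four cases handled below.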
7549 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 7550 const APInt &Value = C->getAPIntValue(); 7551 EVT VT = N->getValueType(0); 7552 SDLoc DL(N); 7553 if (Value.isNonNegative()) { 7554 // (mul x, 2^N + 1) => (add (shl x, N), x) 7555 APInt VM1 = Value - 1; 7556 if (VM1.isPowerOf2()) { 7557 SDValue ShiftedVal = 7558 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 7559 DAG.getConstant(VM1.logBase2(), DL, MVT::i64)); 7560 return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, 7561 N->getOperand(0)); 7562 } 7563 // (mul x, 2^N - 1) => (sub (shl x, N), x) 7564 APInt VP1 = Value + 1; 7565 if (VP1.isPowerOf2()) { 7566 SDValue ShiftedVal = 7567 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 7568 DAG.getConstant(VP1.logBase2(), DL, MVT::i64)); 7569 return DAG.getNode(ISD::SUB, DL, VT, ShiftedVal, 7570 N->getOperand(0)); 7571 } 7572 } else { 7573 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 7574 APInt VNP1 = -Value + 1; 7575 if (VNP1.isPowerOf2()) { 7576 SDValue ShiftedVal = 7577 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 7578 DAG.getConstant(VNP1.logBase2(), DL, MVT::i64)); 7579 return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), 7580 ShiftedVal); 7581 } 7582 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 7583 APInt VNM1 = -Value - 1; 7584 if (VNM1.isPowerOf2()) { 7585 SDValue ShiftedVal = 7586 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 7587 DAG.getConstant(VNM1.logBase2(), DL, MVT::i64)); 7588 SDValue Add = 7589 DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0)); 7590 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Add); 7591 } 7592 } 7593 } 7594 return SDValue(); 7595 } 7596 7597 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 7598 SelectionDAG &DAG) { 7599 // Take advantage of vector comparisons producing 0 or -1 in each lane to 7600 // optimize away operation when it's from a constant. 7601 // 7602 // The general transformation is: 7603 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 7604 // AND(VECTOR_CMP(x,y), constant2) 7605 // constant2 = UNARYOP(constant) 7606 7607 // Early exit if this isn't a vector operation, the operand of the 7608 // unary operation isn't a bitwise AND, or if the sizes of the operations 7609 // aren't the same. 7610 EVT VT = N->getValueType(0); 7611 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 7612 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 7613 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 7614 return SDValue(); 7615 7616 // Now check that the other operand of the AND is a constant. We could 7617 // make the transformation for non-constant splats as well, but it's unclear 7618 // that would be a benefit as it would not eliminate any operations, just 7619 // perform one more step in scalar code before moving to the vector unit. 7620 if (BuildVectorSDNode *BV = 7621 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 7622 // Bail out if the vector isn't a constant. 7623 if (!BV->isConstant()) 7624 return SDValue(); 7625 7626 // Everything checks out. Build up the new and improved node. 7627 SDLoc DL(N); 7628 EVT IntVT = BV->getValueType(0); 7629 // Create a new constant of the appropriate type for the transformed 7630 // DAG. 7631 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 7632 // The AND node needs bitcasts to/from an integer vector type around it. 
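// For example, (sint_to_fp (and (setcc ...), splat(1))) becomes
// (bitcast (and (setcc ...), (bitcast splat(1.0)))); since every setcc lane
// is all-ones or all-zeros, ANDing with the converted constant's bit pattern
// is equivalent to converting after the AND.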
7633 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
7634 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
7635 N->getOperand(0)->getOperand(0), MaskConst);
7636 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
7637 return Res;
7638 }
7639
7640 return SDValue();
7641 }
7642
7643 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
7644 const AArch64Subtarget *Subtarget) {
7645 // First try to optimize away the conversion when it's conditionally from
7646 // a constant. Vectors only.
7647 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
7648 return Res;
7649
7650 EVT VT = N->getValueType(0);
7651 if (VT != MVT::f32 && VT != MVT::f64)
7652 return SDValue();
7653
7654 // Only optimize when the source and destination types have the same width.
7655 if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
7656 return SDValue();
7657
7658 // If the result of an integer load is only used by an integer-to-float
7659 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
7660 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
7661 SDValue N0 = N->getOperand(0);
7662 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
7663 // Do not change the width of a volatile load.
7664 !cast<LoadSDNode>(N0)->isVolatile()) {
7665 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
7666 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
7667 LN0->getPointerInfo(), LN0->isVolatile(),
7668 LN0->isNonTemporal(), LN0->isInvariant(),
7669 LN0->getAlignment());
7670
7671 // Make sure successors of the original load stay after it by updating them
7672 // to use the new Chain.
7673 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
7674
7675 unsigned Opcode =
7676 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
7677 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
7678 }
7679
7680 return SDValue();
7681 }
7682
7683 /// Fold a floating-point multiply by power of two into floating-point to
7684 /// fixed-point conversion.
7685 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
7686 TargetLowering::DAGCombinerInfo &DCI,
7687 const AArch64Subtarget *Subtarget) {
7688 if (!Subtarget->hasNEON())
7689 return SDValue();
7690
7691 SDValue Op = N->getOperand(0);
7692 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
7693 Op.getOpcode() != ISD::FMUL)
7694 return SDValue();
7695
7696 SDValue ConstVec = Op->getOperand(1);
7697 if (!isa<BuildVectorSDNode>(ConstVec))
7698 return SDValue();
7699
7700 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
7701 uint32_t FloatBits = FloatTy.getSizeInBits();
7702 if (FloatBits != 32 && FloatBits != 64)
7703 return SDValue();
7704
7705 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
7706 uint32_t IntBits = IntTy.getSizeInBits();
7707 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
7708 return SDValue();
7709
7710 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
7711 if (IntBits > FloatBits)
7712 return SDValue();
7713
7714 BitVector UndefElements;
7715 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
7716 int32_t Bits = IntBits == 64 ?
64 : 32; 7717 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); 7718 if (C == -1 || C == 0 || C > Bits) 7719 return SDValue(); 7720 7721 MVT ResTy; 7722 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 7723 switch (NumLanes) { 7724 default: 7725 return SDValue(); 7726 case 2: 7727 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 7728 break; 7729 case 4: 7730 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; 7731 break; 7732 } 7733 7734 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) 7735 return SDValue(); 7736 7737 assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && 7738 "Illegal vector type after legalization"); 7739 7740 SDLoc DL(N); 7741 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 7742 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs 7743 : Intrinsic::aarch64_neon_vcvtfp2fxu; 7744 SDValue FixConv = 7745 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, 7746 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), 7747 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); 7748 // We can handle smaller integers by generating an extra trunc. 7749 if (IntBits < FloatBits) 7750 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); 7751 7752 return FixConv; 7753 } 7754 7755 /// Fold a floating-point divide by power of two into fixed-point to 7756 /// floating-point conversion. 7757 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, 7758 const AArch64Subtarget *Subtarget) { 7759 if (!Subtarget->hasNEON()) 7760 return SDValue(); 7761 7762 SDValue Op = N->getOperand(0); 7763 unsigned Opc = Op->getOpcode(); 7764 if (!Op.getValueType().isVector() || 7765 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) 7766 return SDValue(); 7767 7768 SDValue ConstVec = N->getOperand(1); 7769 if (!isa<BuildVectorSDNode>(ConstVec)) 7770 return SDValue(); 7771 7772 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 7773 int32_t IntBits = IntTy.getSizeInBits(); 7774 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 7775 return SDValue(); 7776 7777 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 7778 int32_t FloatBits = FloatTy.getSizeInBits(); 7779 if (FloatBits != 32 && FloatBits != 64) 7780 return SDValue(); 7781 7782 // Avoid conversions where iN is larger than the float (e.g., i64 -> float). 7783 if (IntBits > FloatBits) 7784 return SDValue(); 7785 7786 BitVector UndefElements; 7787 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 7788 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); 7789 if (C == -1 || C == 0 || C > FloatBits) 7790 return SDValue(); 7791 7792 MVT ResTy; 7793 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 7794 switch (NumLanes) { 7795 default: 7796 return SDValue(); 7797 case 2: 7798 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 7799 break; 7800 case 4: 7801 ResTy = MVT::v4i32; 7802 break; 7803 } 7804 7805 SDLoc DL(N); 7806 SDValue ConvInput = Op.getOperand(0); 7807 bool IsSigned = Opc == ISD::SINT_TO_FP; 7808 if (IntBits < FloatBits) 7809 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, 7810 ResTy, ConvInput); 7811 7812 unsigned IntrinsicOpcode = IsSigned ? 
Intrinsic::aarch64_neon_vcvtfxs2fp 7813 : Intrinsic::aarch64_neon_vcvtfxu2fp; 7814 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), 7815 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, 7816 DAG.getConstant(C, DL, MVT::i32)); 7817 } 7818 7819 /// An EXTR instruction is made up of two shifts, ORed together. This helper 7820 /// searches for and classifies those shifts. 7821 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 7822 bool &FromHi) { 7823 if (N.getOpcode() == ISD::SHL) 7824 FromHi = false; 7825 else if (N.getOpcode() == ISD::SRL) 7826 FromHi = true; 7827 else 7828 return false; 7829 7830 if (!isa<ConstantSDNode>(N.getOperand(1))) 7831 return false; 7832 7833 ShiftAmount = N->getConstantOperandVal(1); 7834 Src = N->getOperand(0); 7835 return true; 7836 } 7837 7838 /// EXTR instruction extracts a contiguous chunk of bits from two existing 7839 /// registers viewed as a high/low pair. This function looks for the pattern: 7840 /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an 7841 /// EXTR. Can't quite be done in TableGen because the two immediates aren't 7842 /// independent. 7843 static SDValue tryCombineToEXTR(SDNode *N, 7844 TargetLowering::DAGCombinerInfo &DCI) { 7845 SelectionDAG &DAG = DCI.DAG; 7846 SDLoc DL(N); 7847 EVT VT = N->getValueType(0); 7848 7849 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 7850 7851 if (VT != MVT::i32 && VT != MVT::i64) 7852 return SDValue(); 7853 7854 SDValue LHS; 7855 uint32_t ShiftLHS = 0; 7856 bool LHSFromHi = 0; 7857 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 7858 return SDValue(); 7859 7860 SDValue RHS; 7861 uint32_t ShiftRHS = 0; 7862 bool RHSFromHi = 0; 7863 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 7864 return SDValue(); 7865 7866 // If they're both trying to come from the high part of the register, they're 7867 // not really an EXTR. 7868 if (LHSFromHi == RHSFromHi) 7869 return SDValue(); 7870 7871 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 7872 return SDValue(); 7873 7874 if (LHSFromHi) { 7875 std::swap(LHS, RHS); 7876 std::swap(ShiftLHS, ShiftRHS); 7877 } 7878 7879 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, 7880 DAG.getConstant(ShiftRHS, DL, MVT::i64)); 7881 } 7882 7883 static SDValue tryCombineToBSL(SDNode *N, 7884 TargetLowering::DAGCombinerInfo &DCI) { 7885 EVT VT = N->getValueType(0); 7886 SelectionDAG &DAG = DCI.DAG; 7887 SDLoc DL(N); 7888 7889 if (!VT.isVector()) 7890 return SDValue(); 7891 7892 SDValue N0 = N->getOperand(0); 7893 if (N0.getOpcode() != ISD::AND) 7894 return SDValue(); 7895 7896 SDValue N1 = N->getOperand(1); 7897 if (N1.getOpcode() != ISD::AND) 7898 return SDValue(); 7899 7900 // We only have to look for constant vectors here since the general, variable 7901 // case can be handled in TableGen. 7902 unsigned Bits = VT.getVectorElementType().getSizeInBits(); 7903 uint64_t BitMask = Bits == 64 ? 
-1ULL : ((1ULL << Bits) - 1); 7904 for (int i = 1; i >= 0; --i) 7905 for (int j = 1; j >= 0; --j) { 7906 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); 7907 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); 7908 if (!BVN0 || !BVN1) 7909 continue; 7910 7911 bool FoundMatch = true; 7912 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { 7913 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); 7914 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); 7915 if (!CN0 || !CN1 || 7916 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { 7917 FoundMatch = false; 7918 break; 7919 } 7920 } 7921 7922 if (FoundMatch) 7923 return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), 7924 N0->getOperand(1 - i), N1->getOperand(1 - j)); 7925 } 7926 7927 return SDValue(); 7928 } 7929 7930 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 7931 const AArch64Subtarget *Subtarget) { 7932 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) 7933 SelectionDAG &DAG = DCI.DAG; 7934 EVT VT = N->getValueType(0); 7935 7936 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 7937 return SDValue(); 7938 7939 if (SDValue Res = tryCombineToEXTR(N, DCI)) 7940 return Res; 7941 7942 if (SDValue Res = tryCombineToBSL(N, DCI)) 7943 return Res; 7944 7945 return SDValue(); 7946 } 7947 7948 static SDValue performSRLCombine(SDNode *N, 7949 TargetLowering::DAGCombinerInfo &DCI) { 7950 SelectionDAG &DAG = DCI.DAG; 7951 EVT VT = N->getValueType(0); 7952 if (VT != MVT::i32 && VT != MVT::i64) 7953 return SDValue(); 7954 7955 // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the 7956 // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) 7957 // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. 7958 SDValue N0 = N->getOperand(0); 7959 if (N0.getOpcode() == ISD::BSWAP) { 7960 SDLoc DL(N); 7961 SDValue N1 = N->getOperand(1); 7962 SDValue N00 = N0.getOperand(0); 7963 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 7964 uint64_t ShiftAmt = C->getZExtValue(); 7965 if (VT == MVT::i32 && ShiftAmt == 16 && 7966 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) 7967 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); 7968 if (VT == MVT::i64 && ShiftAmt == 32 && 7969 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) 7970 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); 7971 } 7972 } 7973 return SDValue(); 7974 } 7975 7976 static SDValue performBitcastCombine(SDNode *N, 7977 TargetLowering::DAGCombinerInfo &DCI, 7978 SelectionDAG &DAG) { 7979 // Wait 'til after everything is legalized to try this. That way we have 7980 // legal vector types and such. 7981 if (DCI.isBeforeLegalizeOps()) 7982 return SDValue(); 7983 7984 // Remove extraneous bitcasts around an extract_subvector. 7985 // For example, 7986 // (v4i16 (bitconvert 7987 // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) 7988 // becomes 7989 // (extract_subvector ((v8i16 ...), (i64 4))) 7990 7991 // Only interested in 64-bit vectors as the ultimate result. 7992 EVT VT = N->getValueType(0); 7993 if (!VT.isVector()) 7994 return SDValue(); 7995 if (VT.getSimpleVT().getSizeInBits() != 64) 7996 return SDValue(); 7997 // Is the operand an extract_subvector starting at the beginning or halfway 7998 // point of the vector? A low half may also come through as an 7999 // EXTRACT_SUBREG, so look for that, too. 
8000 SDValue Op0 = N->getOperand(0); 8001 if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && 8002 !(Op0->isMachineOpcode() && 8003 Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) 8004 return SDValue(); 8005 uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue(); 8006 if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { 8007 if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) 8008 return SDValue(); 8009 } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { 8010 if (idx != AArch64::dsub) 8011 return SDValue(); 8012 // The dsub reference is equivalent to a lane zero subvector reference. 8013 idx = 0; 8014 } 8015 // Look through the bitcast of the input to the extract. 8016 if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) 8017 return SDValue(); 8018 SDValue Source = Op0->getOperand(0)->getOperand(0); 8019 // If the source type has twice the number of elements as our destination 8020 // type, we know this is an extract of the high or low half of the vector. 8021 EVT SVT = Source->getValueType(0); 8022 if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) 8023 return SDValue(); 8024 8025 DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); 8026 8027 // Create the simplified form to just extract the low or high half of the 8028 // vector directly rather than bothering with the bitcasts. 8029 SDLoc dl(N); 8030 unsigned NumElements = VT.getVectorNumElements(); 8031 if (idx) { 8032 SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); 8033 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); 8034 } else { 8035 SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); 8036 return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, 8037 Source, SubReg), 8038 0); 8039 } 8040 } 8041 8042 static SDValue performConcatVectorsCombine(SDNode *N, 8043 TargetLowering::DAGCombinerInfo &DCI, 8044 SelectionDAG &DAG) { 8045 SDLoc dl(N); 8046 EVT VT = N->getValueType(0); 8047 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 8048 8049 // Optimize concat_vectors of truncated vectors, where the intermediate 8050 // type is illegal, to avoid said illegality, e.g., 8051 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), 8052 // (v2i16 (truncate (v2i64))))) 8053 // -> 8054 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), 8055 // (v4i32 (bitcast (v2i64))), 8056 // <0, 2, 4, 6>))) 8057 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed 8058 // on both input and result type, so we might generate worse code. 8059 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. 8060 if (N->getNumOperands() == 2 && 8061 N0->getOpcode() == ISD::TRUNCATE && 8062 N1->getOpcode() == ISD::TRUNCATE) { 8063 SDValue N00 = N0->getOperand(0); 8064 SDValue N10 = N1->getOperand(0); 8065 EVT N00VT = N00.getValueType(); 8066 8067 if (N00VT == N10.getValueType() && 8068 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && 8069 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { 8070 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); 8071 SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); 8072 for (size_t i = 0; i < Mask.size(); ++i) 8073 Mask[i] = i * 2; 8074 return DAG.getNode(ISD::TRUNCATE, dl, VT, 8075 DAG.getVectorShuffle( 8076 MidVT, dl, 8077 DAG.getNode(ISD::BITCAST, dl, MidVT, N00), 8078 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); 8079 } 8080 } 8081 8082 // Wait 'til after everything is legalized to try this. 
That way we have 8083 // legal vector types and such. 8084 if (DCI.isBeforeLegalizeOps()) 8085 return SDValue(); 8086 8087 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector 8088 // splat. The indexed instructions are going to be expecting a DUPLANE64, so 8089 // canonicalise to that. 8090 if (N0 == N1 && VT.getVectorNumElements() == 2) { 8091 assert(VT.getVectorElementType().getSizeInBits() == 64); 8092 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), 8093 DAG.getConstant(0, dl, MVT::i64)); 8094 } 8095 8096 // Canonicalise concat_vectors so that the right-hand vector has as few 8097 // bit-casts as possible before its real operation. The primary matching 8098 // destination for these operations will be the narrowing "2" instructions, 8099 // which depend on the operation being performed on this right-hand vector. 8100 // For example, 8101 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) 8102 // becomes 8103 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) 8104 8105 if (N1->getOpcode() != ISD::BITCAST) 8106 return SDValue(); 8107 SDValue RHS = N1->getOperand(0); 8108 MVT RHSTy = RHS.getValueType().getSimpleVT(); 8109 // If the RHS is not a vector, this is not the pattern we're looking for. 8110 if (!RHSTy.isVector()) 8111 return SDValue(); 8112 8113 DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); 8114 8115 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), 8116 RHSTy.getVectorNumElements() * 2); 8117 return DAG.getNode(ISD::BITCAST, dl, VT, 8118 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, 8119 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), 8120 RHS)); 8121 } 8122 8123 static SDValue tryCombineFixedPointConvert(SDNode *N, 8124 TargetLowering::DAGCombinerInfo &DCI, 8125 SelectionDAG &DAG) { 8126 // Wait 'til after everything is legalized to try this. That way we have 8127 // legal vector types and such. 8128 if (DCI.isBeforeLegalizeOps()) 8129 return SDValue(); 8130 // Transform a scalar conversion of a value from a lane extract into a 8131 // lane extract of a vector conversion. E.g., from foo1 to foo2: 8132 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } 8133 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } 8134 // 8135 // The second form interacts better with instruction selection and the 8136 // register allocator to avoid cross-class register copies that aren't 8137 // coalescable due to a lane reference. 8138 8139 // Check the operand and see if it originates from a lane extract. 8140 SDValue Op1 = N->getOperand(1); 8141 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 8142 // Yep, no additional predication needed. Perform the transform. 8143 SDValue IID = N->getOperand(0); 8144 SDValue Shift = N->getOperand(2); 8145 SDValue Vec = Op1.getOperand(0); 8146 SDValue Lane = Op1.getOperand(1); 8147 EVT ResTy = N->getValueType(0); 8148 EVT VecResTy; 8149 SDLoc DL(N); 8150 8151 // The vector width should be 128 bits by the time we get here, even 8152 // if it started as 64 bits (the extract_vector handling will have 8153 // done so). 
8154 assert(Vec.getValueType().getSizeInBits() == 128 && 8155 "unexpected vector size on extract_vector_elt!"); 8156 if (Vec.getValueType() == MVT::v4i32) 8157 VecResTy = MVT::v4f32; 8158 else if (Vec.getValueType() == MVT::v2i64) 8159 VecResTy = MVT::v2f64; 8160 else 8161 llvm_unreachable("unexpected vector type!"); 8162 8163 SDValue Convert = 8164 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); 8165 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); 8166 } 8167 return SDValue(); 8168 } 8169 8170 // AArch64 high-vector "long" operations are formed by performing the non-high 8171 // version on an extract_subvector of each operand which gets the high half: 8172 // 8173 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) 8174 // 8175 // However, there are cases which don't have an extract_high explicitly, but 8176 // have another operation that can be made compatible with one for free. For 8177 // example: 8178 // 8179 // (dupv64 scalar) --> (extract_high (dup128 scalar)) 8180 // 8181 // This routine does the actual conversion of such DUPs, once outer routines 8182 // have determined that everything else is in order. 8183 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold 8184 // similarly here. 8185 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { 8186 switch (N.getOpcode()) { 8187 case AArch64ISD::DUP: 8188 case AArch64ISD::DUPLANE8: 8189 case AArch64ISD::DUPLANE16: 8190 case AArch64ISD::DUPLANE32: 8191 case AArch64ISD::DUPLANE64: 8192 case AArch64ISD::MOVI: 8193 case AArch64ISD::MOVIshift: 8194 case AArch64ISD::MOVIedit: 8195 case AArch64ISD::MOVImsl: 8196 case AArch64ISD::MVNIshift: 8197 case AArch64ISD::MVNImsl: 8198 break; 8199 default: 8200 // FMOV could be supported, but isn't very useful, as it would only occur 8201 // if you passed a bitcast' floating point immediate to an eligible long 8202 // integer op (addl, smull, ...). 8203 return SDValue(); 8204 } 8205 8206 MVT NarrowTy = N.getSimpleValueType(); 8207 if (!NarrowTy.is64BitVector()) 8208 return SDValue(); 8209 8210 MVT ElementTy = NarrowTy.getVectorElementType(); 8211 unsigned NumElems = NarrowTy.getVectorNumElements(); 8212 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); 8213 8214 SDLoc dl(N); 8215 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, 8216 DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), 8217 DAG.getConstant(NumElems, dl, MVT::i64)); 8218 } 8219 8220 static bool isEssentiallyExtractSubvector(SDValue N) { 8221 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) 8222 return true; 8223 8224 return N.getOpcode() == ISD::BITCAST && 8225 N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; 8226 } 8227 8228 /// \brief Helper structure to keep track of ISD::SET_CC operands. 8229 struct GenericSetCCInfo { 8230 const SDValue *Opnd0; 8231 const SDValue *Opnd1; 8232 ISD::CondCode CC; 8233 }; 8234 8235 /// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. 8236 struct AArch64SetCCInfo { 8237 const SDValue *Cmp; 8238 AArch64CC::CondCode CC; 8239 }; 8240 8241 /// \brief Helper structure to keep track of SetCC information. 8242 union SetCCInfo { 8243 GenericSetCCInfo Generic; 8244 AArch64SetCCInfo AArch64; 8245 }; 8246 8247 /// \brief Helper structure to be able to read SetCC information. If set to 8248 /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a 8249 /// GenericSetCCInfo. 
8250 struct SetCCInfoAndKind {
8251 SetCCInfo Info;
8252 bool IsAArch64;
8253 };
8254
8255 /// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
8256 /// an
8257 /// AArch64 lowered one.
8258 /// \p SetCCInfo is filled accordingly.
8259 /// \post SetCCInfo is meaningful only when this function returns true.
8260 /// \return True when Op is a kind of SET_CC operation.
8261 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
8262 // If this is a setcc, this is straightforward.
8263 if (Op.getOpcode() == ISD::SETCC) {
8264 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
8265 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
8266 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8267 SetCCInfo.IsAArch64 = false;
8268 return true;
8269 }
8270 // Otherwise, check if this is a matching csel instruction.
8271 // In other words:
8272 // - csel 1, 0, cc
8273 // - csel 0, 1, !cc
8274 if (Op.getOpcode() != AArch64ISD::CSEL)
8275 return false;
8276 // Set the information about the operands.
8277 // TODO: we want the operands of the Cmp not the csel
8278 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
8279 SetCCInfo.IsAArch64 = true;
8280 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
8281 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
8282
8283 // Check that the operands match the constraints:
8284 // (1) Both operands must be constants.
8285 // (2) One must be 1 and the other must be 0.
8286 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
8287 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
8288
8289 // Check (1).
8290 if (!TValue || !FValue)
8291 return false;
8292
8293 // Check (2).
8294 if (!TValue->isOne()) {
8295 // Update the comparison when we are interested in !cc.
8296 std::swap(TValue, FValue);
8297 SetCCInfo.Info.AArch64.CC =
8298 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
8299 }
8300 return TValue->isOne() && FValue->isNullValue();
8301 }
8302
8303 // Returns true if Op is setcc or zext of setcc.
8304 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
8305 if (isSetCC(Op, Info))
8306 return true;
8307 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
8308 isSetCC(Op->getOperand(0), Info));
8309 }
8310
8311 // The folding we want to perform is:
8312 // (add x, [zext] (setcc cc ...) )
8313 // -->
8314 // (csel x, (add x, 1), !cc ...)
8315 //
8316 // The latter will get matched to a CSINC instruction.
8317 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
8318 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
8319 SDValue LHS = Op->getOperand(0);
8320 SDValue RHS = Op->getOperand(1);
8321 SetCCInfoAndKind InfoAndKind;
8322
8323 // If neither operand is a SET_CC, give up.
8324 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
8325 std::swap(LHS, RHS);
8326 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
8327 return SDValue();
8328 }
8329
8330 // FIXME: This could be generalized to work for FP comparisons.
8331 EVT CmpVT = InfoAndKind.IsAArch64
8332 ?
InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() 8333 : InfoAndKind.Info.Generic.Opnd0->getValueType(); 8334 if (CmpVT != MVT::i32 && CmpVT != MVT::i64) 8335 return SDValue(); 8336 8337 SDValue CCVal; 8338 SDValue Cmp; 8339 SDLoc dl(Op); 8340 if (InfoAndKind.IsAArch64) { 8341 CCVal = DAG.getConstant( 8342 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, 8343 MVT::i32); 8344 Cmp = *InfoAndKind.Info.AArch64.Cmp; 8345 } else 8346 Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, 8347 *InfoAndKind.Info.Generic.Opnd1, 8348 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), 8349 CCVal, DAG, dl); 8350 8351 EVT VT = Op->getValueType(0); 8352 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); 8353 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); 8354 } 8355 8356 // The basic add/sub long vector instructions have variants with "2" on the end 8357 // which act on the high-half of their inputs. They are normally matched by 8358 // patterns like: 8359 // 8360 // (add (zeroext (extract_high LHS)), 8361 // (zeroext (extract_high RHS))) 8362 // -> uaddl2 vD, vN, vM 8363 // 8364 // However, if one of the extracts is something like a duplicate, this 8365 // instruction can still be used profitably. This function puts the DAG into a 8366 // more appropriate form for those patterns to trigger. 8367 static SDValue performAddSubLongCombine(SDNode *N, 8368 TargetLowering::DAGCombinerInfo &DCI, 8369 SelectionDAG &DAG) { 8370 if (DCI.isBeforeLegalizeOps()) 8371 return SDValue(); 8372 8373 MVT VT = N->getSimpleValueType(0); 8374 if (!VT.is128BitVector()) { 8375 if (N->getOpcode() == ISD::ADD) 8376 return performSetccAddFolding(N, DAG); 8377 return SDValue(); 8378 } 8379 8380 // Make sure both branches are extended in the same way. 8381 SDValue LHS = N->getOperand(0); 8382 SDValue RHS = N->getOperand(1); 8383 if ((LHS.getOpcode() != ISD::ZERO_EXTEND && 8384 LHS.getOpcode() != ISD::SIGN_EXTEND) || 8385 LHS.getOpcode() != RHS.getOpcode()) 8386 return SDValue(); 8387 8388 unsigned ExtType = LHS.getOpcode(); 8389 8390 // It's not worth doing if at least one of the inputs isn't already an 8391 // extract, but we don't know which it'll be so we have to try both. 8392 if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { 8393 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); 8394 if (!RHS.getNode()) 8395 return SDValue(); 8396 8397 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); 8398 } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { 8399 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); 8400 if (!LHS.getNode()) 8401 return SDValue(); 8402 8403 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); 8404 } 8405 8406 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); 8407 } 8408 8409 // Massage DAGs which we can use the high-half "long" operations on into 8410 // something isel will recognize better. E.g. 
8411 // 8412 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> 8413 // (aarch64_neon_umull (extract_high (v2i64 vec))) 8414 // (extract_high (v2i64 (dup128 scalar))))) 8415 // 8416 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, 8417 TargetLowering::DAGCombinerInfo &DCI, 8418 SelectionDAG &DAG) { 8419 if (DCI.isBeforeLegalizeOps()) 8420 return SDValue(); 8421 8422 SDValue LHS = N->getOperand(1); 8423 SDValue RHS = N->getOperand(2); 8424 assert(LHS.getValueType().is64BitVector() && 8425 RHS.getValueType().is64BitVector() && 8426 "unexpected shape for long operation"); 8427 8428 // Either node could be a DUP, but it's not worth doing both of them (you'd 8429 // just as well use the non-high version) so look for a corresponding extract 8430 // operation on the other "wing". 8431 if (isEssentiallyExtractSubvector(LHS)) { 8432 RHS = tryExtendDUPToExtractHigh(RHS, DAG); 8433 if (!RHS.getNode()) 8434 return SDValue(); 8435 } else if (isEssentiallyExtractSubvector(RHS)) { 8436 LHS = tryExtendDUPToExtractHigh(LHS, DAG); 8437 if (!LHS.getNode()) 8438 return SDValue(); 8439 } 8440 8441 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), 8442 N->getOperand(0), LHS, RHS); 8443 } 8444 8445 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { 8446 MVT ElemTy = N->getSimpleValueType(0).getScalarType(); 8447 unsigned ElemBits = ElemTy.getSizeInBits(); 8448 8449 int64_t ShiftAmount; 8450 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { 8451 APInt SplatValue, SplatUndef; 8452 unsigned SplatBitSize; 8453 bool HasAnyUndefs; 8454 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 8455 HasAnyUndefs, ElemBits) || 8456 SplatBitSize != ElemBits) 8457 return SDValue(); 8458 8459 ShiftAmount = SplatValue.getSExtValue(); 8460 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 8461 ShiftAmount = CVN->getSExtValue(); 8462 } else 8463 return SDValue(); 8464 8465 unsigned Opcode; 8466 bool IsRightShift; 8467 switch (IID) { 8468 default: 8469 llvm_unreachable("Unknown shift intrinsic"); 8470 case Intrinsic::aarch64_neon_sqshl: 8471 Opcode = AArch64ISD::SQSHL_I; 8472 IsRightShift = false; 8473 break; 8474 case Intrinsic::aarch64_neon_uqshl: 8475 Opcode = AArch64ISD::UQSHL_I; 8476 IsRightShift = false; 8477 break; 8478 case Intrinsic::aarch64_neon_srshl: 8479 Opcode = AArch64ISD::SRSHR_I; 8480 IsRightShift = true; 8481 break; 8482 case Intrinsic::aarch64_neon_urshl: 8483 Opcode = AArch64ISD::URSHR_I; 8484 IsRightShift = true; 8485 break; 8486 case Intrinsic::aarch64_neon_sqshlu: 8487 Opcode = AArch64ISD::SQSHLU_I; 8488 IsRightShift = false; 8489 break; 8490 } 8491 8492 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { 8493 SDLoc dl(N); 8494 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 8495 DAG.getConstant(-ShiftAmount, dl, MVT::i32)); 8496 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { 8497 SDLoc dl(N); 8498 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 8499 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 8500 } 8501 8502 return SDValue(); 8503 } 8504 8505 // The CRC32[BH] instructions ignore the high bits of their data operand. Since 8506 // the intrinsics must be legal and take an i32, this means there's almost 8507 // certainly going to be a zext in the DAG which we can eliminate. 
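// E.g. (int_aarch64_crc32b crc, (and x, 0xff)) --> (int_aarch64_crc32b crc, x),
// and likewise with a 0xffff mask for the halfword variants.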
8508 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { 8509 SDValue AndN = N->getOperand(2); 8510 if (AndN.getOpcode() != ISD::AND) 8511 return SDValue(); 8512 8513 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); 8514 if (!CMask || CMask->getZExtValue() != Mask) 8515 return SDValue(); 8516 8517 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, 8518 N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); 8519 } 8520 8521 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, 8522 SelectionDAG &DAG) { 8523 SDLoc dl(N); 8524 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), 8525 DAG.getNode(Opc, dl, 8526 N->getOperand(1).getSimpleValueType(), 8527 N->getOperand(1)), 8528 DAG.getConstant(0, dl, MVT::i64)); 8529 } 8530 8531 static SDValue performIntrinsicCombine(SDNode *N, 8532 TargetLowering::DAGCombinerInfo &DCI, 8533 const AArch64Subtarget *Subtarget) { 8534 SelectionDAG &DAG = DCI.DAG; 8535 unsigned IID = getIntrinsicID(N); 8536 switch (IID) { 8537 default: 8538 break; 8539 case Intrinsic::aarch64_neon_vcvtfxs2fp: 8540 case Intrinsic::aarch64_neon_vcvtfxu2fp: 8541 return tryCombineFixedPointConvert(N, DCI, DAG); 8542 case Intrinsic::aarch64_neon_saddv: 8543 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); 8544 case Intrinsic::aarch64_neon_uaddv: 8545 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); 8546 case Intrinsic::aarch64_neon_sminv: 8547 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); 8548 case Intrinsic::aarch64_neon_uminv: 8549 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); 8550 case Intrinsic::aarch64_neon_smaxv: 8551 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); 8552 case Intrinsic::aarch64_neon_umaxv: 8553 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); 8554 case Intrinsic::aarch64_neon_fmax: 8555 return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), 8556 N->getOperand(1), N->getOperand(2)); 8557 case Intrinsic::aarch64_neon_fmin: 8558 return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), 8559 N->getOperand(1), N->getOperand(2)); 8560 case Intrinsic::aarch64_neon_fmaxnm: 8561 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), 8562 N->getOperand(1), N->getOperand(2)); 8563 case Intrinsic::aarch64_neon_fminnm: 8564 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), 8565 N->getOperand(1), N->getOperand(2)); 8566 case Intrinsic::aarch64_neon_smull: 8567 case Intrinsic::aarch64_neon_umull: 8568 case Intrinsic::aarch64_neon_pmull: 8569 case Intrinsic::aarch64_neon_sqdmull: 8570 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 8571 case Intrinsic::aarch64_neon_sqshl: 8572 case Intrinsic::aarch64_neon_uqshl: 8573 case Intrinsic::aarch64_neon_sqshlu: 8574 case Intrinsic::aarch64_neon_srshl: 8575 case Intrinsic::aarch64_neon_urshl: 8576 return tryCombineShiftImm(IID, N, DAG); 8577 case Intrinsic::aarch64_crc32b: 8578 case Intrinsic::aarch64_crc32cb: 8579 return tryCombineCRC32(0xff, N, DAG); 8580 case Intrinsic::aarch64_crc32h: 8581 case Intrinsic::aarch64_crc32ch: 8582 return tryCombineCRC32(0xffff, N, DAG); 8583 } 8584 return SDValue(); 8585 } 8586 8587 static SDValue performExtendCombine(SDNode *N, 8588 TargetLowering::DAGCombinerInfo &DCI, 8589 SelectionDAG &DAG) { 8590 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 8591 // we can convert that DUP into another extract_high (of a bigger DUP), which 8592 // helps the backend to 
decide that an sabdl2 would be useful, saving a real 8593 // extract_high operation. 8594 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 8595 N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 8596 SDNode *ABDNode = N->getOperand(0).getNode(); 8597 unsigned IID = getIntrinsicID(ABDNode); 8598 if (IID == Intrinsic::aarch64_neon_sabd || 8599 IID == Intrinsic::aarch64_neon_uabd) { 8600 SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); 8601 if (!NewABD.getNode()) 8602 return SDValue(); 8603 8604 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), 8605 NewABD); 8606 } 8607 } 8608 8609 // This is effectively a custom type legalization for AArch64. 8610 // 8611 // Type legalization will split an extend of a small, legal, type to a larger 8612 // illegal type by first splitting the destination type, often creating 8613 // illegal source types, which then get legalized in isel-confusing ways, 8614 // leading to really terrible codegen. E.g., 8615 // %result = v8i32 sext v8i8 %value 8616 // becomes 8617 // %losrc = extract_subreg %value, ... 8618 // %hisrc = extract_subreg %value, ... 8619 // %lo = v4i32 sext v4i8 %losrc 8620 // %hi = v4i32 sext v4i8 %hisrc 8621 // Things go rapidly downhill from there. 8622 // 8623 // For AArch64, the [sz]ext vector instructions can only go up one element 8624 // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 8625 // take two instructions. 8626 // 8627 // This implies that the most efficient way to do the extend from v8i8 8628 // to two v4i32 values is to first extend the v8i8 to v8i16, then do 8629 // the normal splitting to happen for the v8i16->v8i32. 8630 8631 // This is pre-legalization to catch some cases where the default 8632 // type legalization will create ill-tempered code. 8633 if (!DCI.isBeforeLegalizeOps()) 8634 return SDValue(); 8635 8636 // We're only interested in cleaning things up for non-legal vector types 8637 // here. If both the source and destination are legal, things will just 8638 // work naturally without any fiddling. 8639 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8640 EVT ResVT = N->getValueType(0); 8641 if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) 8642 return SDValue(); 8643 // If the vector type isn't a simple VT, it's beyond the scope of what 8644 // we're worried about here. Let legalization do its thing and hope for 8645 // the best. 8646 SDValue Src = N->getOperand(0); 8647 EVT SrcVT = Src->getValueType(0); 8648 if (!ResVT.isSimple() || !SrcVT.isSimple()) 8649 return SDValue(); 8650 8651 // If the source VT is a 64-bit vector, we can play games and get the 8652 // better results we want. 8653 if (SrcVT.getSizeInBits() != 64) 8654 return SDValue(); 8655 8656 unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); 8657 unsigned ElementCount = SrcVT.getVectorNumElements(); 8658 SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); 8659 SDLoc DL(N); 8660 Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); 8661 8662 // Now split the rest of the operation into two halves, each with a 64 8663 // bit source. 
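// E.g. for (v8i32 zext (v8i8 x)): x has already been widened to v8i16 above;
// here that v8i16 is split into two v4i16 halves, each half is extended to
// v4i32, and the two halves are concatenated back together.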
8664 EVT LoVT, HiVT; 8665 SDValue Lo, Hi; 8666 unsigned NumElements = ResVT.getVectorNumElements(); 8667 assert(!(NumElements & 1) && "Splitting vector, but not in half!"); 8668 LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), 8669 ResVT.getVectorElementType(), NumElements / 2); 8670 8671 EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), 8672 LoVT.getVectorNumElements()); 8673 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 8674 DAG.getConstant(0, DL, MVT::i64)); 8675 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 8676 DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); 8677 Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); 8678 Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); 8679 8680 // Now combine the parts back together so we still have a single result 8681 // like the combiner expects. 8682 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); 8683 } 8684 8685 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar 8686 /// value. The load store optimizer pass will merge them to store pair stores. 8687 /// This has better performance than a splat of the scalar followed by a split 8688 /// vector store. Even if the stores are not merged it is four stores vs a dup, 8689 /// followed by an ext.b and two stores. 8690 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { 8691 SDValue StVal = St->getValue(); 8692 EVT VT = StVal.getValueType(); 8693 8694 // Don't replace floating point stores, they possibly won't be transformed to 8695 // stp because of the store pair suppress pass. 8696 if (VT.isFloatingPoint()) 8697 return SDValue(); 8698 8699 // Check for insert vector elements. 8700 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) 8701 return SDValue(); 8702 8703 // We can express a splat as store pair(s) for 2 or 4 elements. 8704 unsigned NumVecElts = VT.getVectorNumElements(); 8705 if (NumVecElts != 4 && NumVecElts != 2) 8706 return SDValue(); 8707 SDValue SplatVal = StVal.getOperand(1); 8708 unsigned RemainInsertElts = NumVecElts - 1; 8709 8710 // Check that this is a splat. 8711 while (--RemainInsertElts) { 8712 SDValue NextInsertElt = StVal.getOperand(0); 8713 if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) 8714 return SDValue(); 8715 if (NextInsertElt.getOperand(1) != SplatVal) 8716 return SDValue(); 8717 StVal = NextInsertElt; 8718 } 8719 unsigned OrigAlignment = St->getAlignment(); 8720 unsigned EltOffset = NumVecElts == 4 ? 4 : 8; 8721 unsigned Alignment = std::min(OrigAlignment, EltOffset); 8722 8723 // Create scalar stores. This is at least as good as the code sequence for a 8724 // split unaligned store which is a dup.s, ext.b, and two stores. 8725 // Most of the time the three stores should be replaced by store pair 8726 // instructions (stp). 
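// E.g. a splat store of a v4i32 becomes four 32-bit scalar stores at offsets
// 0, 4, 8 and 12, which the load/store optimizer can then merge into two stp
// instructions.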
8727 SDLoc DL(St); 8728 SDValue BasePtr = St->getBasePtr(); 8729 SDValue NewST1 = 8730 DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), 8731 St->isVolatile(), St->isNonTemporal(), St->getAlignment()); 8732 8733 unsigned Offset = EltOffset; 8734 while (--NumVecElts) { 8735 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 8736 DAG.getConstant(Offset, DL, MVT::i64)); 8737 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, 8738 St->getPointerInfo(), St->isVolatile(), 8739 St->isNonTemporal(), Alignment); 8740 Offset += EltOffset; 8741 } 8742 return NewST1; 8743 } 8744 8745 static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 8746 SelectionDAG &DAG, 8747 const AArch64Subtarget *Subtarget) { 8748 if (!DCI.isBeforeLegalize()) 8749 return SDValue(); 8750 8751 StoreSDNode *S = cast<StoreSDNode>(N); 8752 if (S->isVolatile()) 8753 return SDValue(); 8754 8755 // FIXME: The logic for deciding if an unaligned store should be split should 8756 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be 8757 // a call to that function here. 8758 8759 if (!Subtarget->isMisaligned128StoreSlow()) 8760 return SDValue(); 8761 8762 // Don't split at -Oz. 8763 if (DAG.getMachineFunction().getFunction()->optForMinSize()) 8764 return SDValue(); 8765 8766 SDValue StVal = S->getValue(); 8767 EVT VT = StVal.getValueType(); 8768 8769 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting 8770 // those up regresses performance on micro-benchmarks and olden/bh. 8771 if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) 8772 return SDValue(); 8773 8774 // Split unaligned 16B stores. They are terrible for performance. 8775 // Don't split stores with alignment of 1 or 2. Code that uses clang vector 8776 // extensions can use this to mark that it does not want splitting to happen 8777 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of 8778 // eliminating alignment hazards is only 1 in 8 for alignment of 2. 8779 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || 8780 S->getAlignment() <= 2) 8781 return SDValue(); 8782 8783 // If we get a splat of a scalar convert this vector store to a store of 8784 // scalars. They will be merged into store pairs thereby removing two 8785 // instructions. 8786 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) 8787 return ReplacedSplat; 8788 8789 SDLoc DL(S); 8790 unsigned NumElts = VT.getVectorNumElements() / 2; 8791 // Split VT into two. 8792 EVT HalfVT = 8793 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); 8794 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 8795 DAG.getConstant(0, DL, MVT::i64)); 8796 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 8797 DAG.getConstant(NumElts, DL, MVT::i64)); 8798 SDValue BasePtr = S->getBasePtr(); 8799 SDValue NewST1 = 8800 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), 8801 S->isVolatile(), S->isNonTemporal(), S->getAlignment()); 8802 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 8803 DAG.getConstant(8, DL, MVT::i64)); 8804 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, 8805 S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), 8806 S->getAlignment()); 8807 } 8808 8809 /// Target-specific DAG combine function for post-increment LD1 (lane) and 8810 /// post-increment LD1R. 
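/// For example, an insert_vector_elt of a scalar load whose address is also
/// incremented by the element size can be selected as a single post-increment
/// "ld1 { v0.s }[lane], [x0], #4" style instruction (and similarly "ld1r" for
/// the all-lanes DUP form).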
8811 static SDValue performPostLD1Combine(SDNode *N, 8812 TargetLowering::DAGCombinerInfo &DCI, 8813 bool IsLaneOp) { 8814 if (DCI.isBeforeLegalizeOps()) 8815 return SDValue(); 8816 8817 SelectionDAG &DAG = DCI.DAG; 8818 EVT VT = N->getValueType(0); 8819 8820 unsigned LoadIdx = IsLaneOp ? 1 : 0; 8821 SDNode *LD = N->getOperand(LoadIdx).getNode(); 8822 // If it is not LOAD, can not do such combine. 8823 if (LD->getOpcode() != ISD::LOAD) 8824 return SDValue(); 8825 8826 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); 8827 EVT MemVT = LoadSDN->getMemoryVT(); 8828 // Check if memory operand is the same type as the vector element. 8829 if (MemVT != VT.getVectorElementType()) 8830 return SDValue(); 8831 8832 // Check if there are other uses. If so, do not combine as it will introduce 8833 // an extra load. 8834 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; 8835 ++UI) { 8836 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. 8837 continue; 8838 if (*UI != N) 8839 return SDValue(); 8840 } 8841 8842 SDValue Addr = LD->getOperand(1); 8843 SDValue Vector = N->getOperand(0); 8844 // Search for a use of the address operand that is an increment. 8845 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = 8846 Addr.getNode()->use_end(); UI != UE; ++UI) { 8847 SDNode *User = *UI; 8848 if (User->getOpcode() != ISD::ADD 8849 || UI.getUse().getResNo() != Addr.getResNo()) 8850 continue; 8851 8852 // Check that the add is independent of the load. Otherwise, folding it 8853 // would create a cycle. 8854 if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) 8855 continue; 8856 // Also check that add is not used in the vector operand. This would also 8857 // create a cycle. 8858 if (User->isPredecessorOf(Vector.getNode())) 8859 continue; 8860 8861 // If the increment is a constant, it must match the memory ref size. 8862 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 8863 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 8864 uint32_t IncVal = CInc->getZExtValue(); 8865 unsigned NumBytes = VT.getScalarSizeInBits() / 8; 8866 if (IncVal != NumBytes) 8867 continue; 8868 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 8869 } 8870 8871 // Finally, check that the vector doesn't depend on the load. 8872 // Again, this would create a cycle. 8873 // The load depending on the vector is fine, as that's the case for the 8874 // LD1*post we'll eventually generate anyway. 8875 if (LoadSDN->isPredecessorOf(Vector.getNode())) 8876 continue; 8877 8878 SmallVector<SDValue, 8> Ops; 8879 Ops.push_back(LD->getOperand(0)); // Chain 8880 if (IsLaneOp) { 8881 Ops.push_back(Vector); // The vector to be inserted 8882 Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector 8883 } 8884 Ops.push_back(Addr); 8885 Ops.push_back(Inc); 8886 8887 EVT Tys[3] = { VT, MVT::i64, MVT::Other }; 8888 SDVTList SDTys = DAG.getVTList(Tys); 8889 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; 8890 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, 8891 MemVT, 8892 LoadSDN->getMemOperand()); 8893 8894 // Update the uses. 
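// The new post-increment node returns the vector value (result 0), the
// written-back address (result 1) and the chain (result 2). Below, N is
// rewired to the vector result, the address-increment ADD to the written-back
// address, and the old load's chain uses to the new chain.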
8895 SDValue NewResults[] = { 8896 SDValue(LD, 0), // The result of load 8897 SDValue(UpdN.getNode(), 2) // Chain 8898 }; 8899 DCI.CombineTo(LD, NewResults); 8900 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result 8901 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register 8902 8903 break; 8904 } 8905 return SDValue(); 8906 } 8907 8908 /// Simplify \Addr given that the top byte of it is ignored by HW during 8909 /// address translation. 8910 static bool performTBISimplification(SDValue Addr, 8911 TargetLowering::DAGCombinerInfo &DCI, 8912 SelectionDAG &DAG) { 8913 APInt DemandedMask = APInt::getLowBitsSet(64, 56); 8914 APInt KnownZero, KnownOne; 8915 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), 8916 DCI.isBeforeLegalizeOps()); 8917 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8918 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { 8919 DCI.CommitTargetLoweringOpt(TLO); 8920 return true; 8921 } 8922 return false; 8923 } 8924 8925 static SDValue performSTORECombine(SDNode *N, 8926 TargetLowering::DAGCombinerInfo &DCI, 8927 SelectionDAG &DAG, 8928 const AArch64Subtarget *Subtarget) { 8929 if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget)) 8930 return Split; 8931 8932 if (Subtarget->supportsAddressTopByteIgnored() && 8933 performTBISimplification(N->getOperand(2), DCI, DAG)) 8934 return SDValue(N, 0); 8935 8936 return SDValue(); 8937 } 8938 8939 /// This function handles the log2-shuffle pattern produced by the 8940 /// LoopVectorizer for the across vector reduction. It consists of 8941 /// log2(NumVectorElements) steps and, in each step, 2^(s) elements 8942 /// are reduced, where s is an induction variable from 0 to 8943 /// log2(NumVectorElements). 8944 static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, 8945 unsigned Op, 8946 SelectionDAG &DAG) { 8947 EVT VTy = OpV->getOperand(0).getValueType(); 8948 if (!VTy.isVector()) 8949 return SDValue(); 8950 8951 int NumVecElts = VTy.getVectorNumElements(); 8952 if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { 8953 if (NumVecElts != 4) 8954 return SDValue(); 8955 } else { 8956 if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) 8957 return SDValue(); 8958 } 8959 8960 int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); 8961 SDValue PreOp = OpV; 8962 // Iterate over each step of the across vector reduction. 8963 for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { 8964 SDValue CurOp = PreOp.getOperand(0); 8965 SDValue Shuffle = PreOp.getOperand(1); 8966 if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { 8967 // Try to swap the 1st and 2nd operand as add and min/max instructions 8968 // are commutative. 8969 CurOp = PreOp.getOperand(1); 8970 Shuffle = PreOp.getOperand(0); 8971 if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) 8972 return SDValue(); 8973 } 8974 8975 // Check if the input vector is fed by the operator we want to handle, 8976 // except the last step; the very first input vector is not necessarily 8977 // the same operator we are handling. 8978 if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) 8979 return SDValue(); 8980 8981 // Check if it forms one step of the across vector reduction. 
8982 // E.g., 8983 // %cur = add %1, %0 8984 // %shuffle = vector_shuffle %cur, <2, 3, u, u> 8985 // %pre = add %cur, %shuffle 8986 if (Shuffle.getOperand(0) != CurOp) 8987 return SDValue(); 8988 8989 int NumMaskElts = 1 << CurStep; 8990 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); 8991 // Check mask values in each step. 8992 // We expect the shuffle mask in each step follows a specific pattern 8993 // denoted here by the <M, U> form, where M is a sequence of integers 8994 // starting from NumMaskElts, increasing by 1, and the number integers 8995 // in M should be NumMaskElts. U is a sequence of UNDEFs and the number 8996 // of undef in U should be NumVecElts - NumMaskElts. 8997 // E.g., for <8 x i16>, mask values in each step should be : 8998 // step 0 : <1,u,u,u,u,u,u,u> 8999 // step 1 : <2,3,u,u,u,u,u,u> 9000 // step 2 : <4,5,6,7,u,u,u,u> 9001 for (int i = 0; i < NumVecElts; ++i) 9002 if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || 9003 (i >= NumMaskElts && !(Mask[i] < 0))) 9004 return SDValue(); 9005 9006 PreOp = CurOp; 9007 } 9008 unsigned Opcode; 9009 bool IsIntrinsic = false; 9010 9011 switch (Op) { 9012 default: 9013 llvm_unreachable("Unexpected operator for across vector reduction"); 9014 case ISD::ADD: 9015 Opcode = AArch64ISD::UADDV; 9016 break; 9017 case ISD::SMAX: 9018 Opcode = AArch64ISD::SMAXV; 9019 break; 9020 case ISD::UMAX: 9021 Opcode = AArch64ISD::UMAXV; 9022 break; 9023 case ISD::SMIN: 9024 Opcode = AArch64ISD::SMINV; 9025 break; 9026 case ISD::UMIN: 9027 Opcode = AArch64ISD::UMINV; 9028 break; 9029 case ISD::FMAXNUM: 9030 Opcode = Intrinsic::aarch64_neon_fmaxnmv; 9031 IsIntrinsic = true; 9032 break; 9033 case ISD::FMINNUM: 9034 Opcode = Intrinsic::aarch64_neon_fminnmv; 9035 IsIntrinsic = true; 9036 break; 9037 } 9038 SDLoc DL(N); 9039 9040 return IsIntrinsic 9041 ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), 9042 DAG.getConstant(Opcode, DL, MVT::i32), PreOp) 9043 : DAG.getNode( 9044 ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), 9045 DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), 9046 DAG.getConstant(0, DL, MVT::i64)); 9047 } 9048 9049 /// Target-specific DAG combine for the across vector min/max reductions. 9050 /// This function specifically handles the final clean-up step of the vector 9051 /// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle 9052 /// pattern, which narrows down and finds the final min/max value from all 9053 /// elements of the vector. 
9054 /// For example, for a <16 x i8> vector:
9055 /// %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
9056 /// %smax0 = smax %0, %svn0
9057 /// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
9058 /// %smax1 = smax %smax0, %svn1
9059 /// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
9060 /// %smax2 = smax %smax1, %svn2
9061 /// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
9062 /// %sc = setcc %smax2, %svn3, gt
9063 /// %n0 = extract_vector_elt %sc, #0
9064 /// %n1 = extract_vector_elt %smax2, #0
9065 /// %n2 = extract_vector_elt %smax2, #1
9066 /// %result = select %n0, %n1, %n2
9067 /// becomes:
9068 /// %1 = smaxv %0
9069 /// %result = extract_vector_elt %1, 0
9070 static SDValue
9071 performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
9072                                         const AArch64Subtarget *Subtarget) {
9073   if (!Subtarget->hasNEON())
9074     return SDValue();
9075
9076   SDValue N0 = N->getOperand(0);
9077   SDValue IfTrue = N->getOperand(1);
9078   SDValue IfFalse = N->getOperand(2);
9079
9080   // Check if the SELECT merges up the final result of the min/max
9081   // from a vector.
9082   if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9083       IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9084       IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9085     return SDValue();
9086
9087   // Expect N0 to be fed by a SETCC.
9088   SDValue SetCC = N0.getOperand(0);
9089   EVT SetCCVT = SetCC.getValueType();
9090   if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
9091       SetCCVT.getVectorElementType() != MVT::i1)
9092     return SDValue();
9093
9094   SDValue VectorOp = SetCC.getOperand(0);
9095   unsigned Op = VectorOp->getOpcode();
9096   // Check if the input vector is fed by the operator we want to handle.
9097   if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
9098       Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
9099     return SDValue();
9100
9101   EVT VTy = VectorOp.getValueType();
9102   if (!VTy.isVector())
9103     return SDValue();
9104
9105   if (VTy.getSizeInBits() < 64)
9106     return SDValue();
9107
9108   EVT EltTy = VTy.getVectorElementType();
9109   if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
9110     if (EltTy != MVT::f32)
9111       return SDValue();
9112   } else {
9113     if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
9114       return SDValue();
9115   }
9116
9117   // Check if extracting from the same vector.
9118   // For example,
9119   //   %sc = setcc %vector, %svn1, gt
9120   //   %n0 = extract_vector_elt %sc, #0
9121   //   %n1 = extract_vector_elt %vector, #0
9122   //   %n2 = extract_vector_elt %vector, #1
9123   if (!(VectorOp == IfTrue->getOperand(0) &&
9124         VectorOp == IfFalse->getOperand(0)))
9125     return SDValue();
9126
9127   // Check if the condition code is matched with the operator type.
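  // For example, an smax reduction is only recognized when the setcc uses a
  // signed greater-than or greater-equal predicate, while the fminnum form
  // accepts the ordered, unordered, and plain less-than/less-equal predicates.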
9128 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); 9129 if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || 9130 (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || 9131 (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || 9132 (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || 9133 (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && 9134 CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && 9135 CC != ISD::SETGE) || 9136 (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && 9137 CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && 9138 CC != ISD::SETLE)) 9139 return SDValue(); 9140 9141 // Expect to check only lane 0 from the vector SETCC. 9142 if (!isNullConstant(N0.getOperand(1))) 9143 return SDValue(); 9144 9145 // Expect to extract the true value from lane 0. 9146 if (!isNullConstant(IfTrue.getOperand(1))) 9147 return SDValue(); 9148 9149 // Expect to extract the false value from lane 1. 9150 if (!isOneConstant(IfFalse.getOperand(1))) 9151 return SDValue(); 9152 9153 return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); 9154 } 9155 9156 /// Target-specific DAG combine for the across vector add reduction. 9157 /// This function specifically handles the final clean-up step of the vector 9158 /// add reduction produced by the LoopVectorizer. It is the log2-shuffle 9159 /// pattern, which adds all elements of a vector together. 9160 /// For example, for a <4 x i32> vector : 9161 /// %1 = vector_shuffle %0, <2,3,u,u> 9162 /// %2 = add %0, %1 9163 /// %3 = vector_shuffle %2, <1,u,u,u> 9164 /// %4 = add %2, %3 9165 /// %result = extract_vector_elt %4, 0 9166 /// becomes : 9167 /// %0 = uaddv %0 9168 /// %result = extract_vector_elt %0, 0 9169 static SDValue 9170 performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, 9171 const AArch64Subtarget *Subtarget) { 9172 if (!Subtarget->hasNEON()) 9173 return SDValue(); 9174 SDValue N0 = N->getOperand(0); 9175 SDValue N1 = N->getOperand(1); 9176 9177 // Check if the input vector is fed by the ADD. 9178 if (N0->getOpcode() != ISD::ADD) 9179 return SDValue(); 9180 9181 // The vector extract idx must constant zero because we only expect the final 9182 // result of the reduction is placed in lane 0. 9183 if (!isNullConstant(N1)) 9184 return SDValue(); 9185 9186 EVT VTy = N0.getValueType(); 9187 if (!VTy.isVector()) 9188 return SDValue(); 9189 9190 EVT EltTy = VTy.getVectorElementType(); 9191 if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) 9192 return SDValue(); 9193 9194 if (VTy.getSizeInBits() < 64) 9195 return SDValue(); 9196 9197 return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); 9198 } 9199 9200 /// Target-specific DAG combine function for NEON load/store intrinsics 9201 /// to merge base address updates. 9202 static SDValue performNEONPostLDSTCombine(SDNode *N, 9203 TargetLowering::DAGCombinerInfo &DCI, 9204 SelectionDAG &DAG) { 9205 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9206 return SDValue(); 9207 9208 unsigned AddrOpIdx = N->getNumOperands() - 1; 9209 SDValue Addr = N->getOperand(AddrOpIdx); 9210 9211 // Search for a use of the address operand that is an increment. 
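  // An illustrative sketch of what gets merged (names simplified/made up):
  //   %ld2       = aarch64.neon.ld2(%addr)
  //   %addr.next = add %addr, #32        ; 2 x 16 bytes for a pair of Q regs
  // becomes one LD2post node producing both vectors, the updated address, and
  // the chain; a non-constant increment is kept as the offset register.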
9212 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 9213 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 9214 SDNode *User = *UI; 9215 if (User->getOpcode() != ISD::ADD || 9216 UI.getUse().getResNo() != Addr.getResNo()) 9217 continue; 9218 9219 // Check that the add is independent of the load/store. Otherwise, folding 9220 // it would create a cycle. 9221 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 9222 continue; 9223 9224 // Find the new opcode for the updating load/store. 9225 bool IsStore = false; 9226 bool IsLaneOp = false; 9227 bool IsDupOp = false; 9228 unsigned NewOpc = 0; 9229 unsigned NumVecs = 0; 9230 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 9231 switch (IntNo) { 9232 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 9233 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 9234 NumVecs = 2; break; 9235 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 9236 NumVecs = 3; break; 9237 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 9238 NumVecs = 4; break; 9239 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 9240 NumVecs = 2; IsStore = true; break; 9241 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 9242 NumVecs = 3; IsStore = true; break; 9243 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 9244 NumVecs = 4; IsStore = true; break; 9245 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 9246 NumVecs = 2; break; 9247 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 9248 NumVecs = 3; break; 9249 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 9250 NumVecs = 4; break; 9251 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 9252 NumVecs = 2; IsStore = true; break; 9253 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 9254 NumVecs = 3; IsStore = true; break; 9255 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 9256 NumVecs = 4; IsStore = true; break; 9257 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 9258 NumVecs = 2; IsDupOp = true; break; 9259 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 9260 NumVecs = 3; IsDupOp = true; break; 9261 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 9262 NumVecs = 4; IsDupOp = true; break; 9263 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 9264 NumVecs = 2; IsLaneOp = true; break; 9265 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 9266 NumVecs = 3; IsLaneOp = true; break; 9267 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 9268 NumVecs = 4; IsLaneOp = true; break; 9269 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 9270 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 9271 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 9272 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 9273 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 9274 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 9275 } 9276 9277 EVT VecTy; 9278 if (IsStore) 9279 VecTy = N->getOperand(2).getValueType(); 9280 else 9281 VecTy = N->getValueType(0); 9282 9283 // If the increment is a constant, it must match the memory ref size. 9284 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
1 : 0); 9285 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 9286 uint32_t IncVal = CInc->getZExtValue(); 9287 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 9288 if (IsLaneOp || IsDupOp) 9289 NumBytes /= VecTy.getVectorNumElements(); 9290 if (IncVal != NumBytes) 9291 continue; 9292 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 9293 } 9294 SmallVector<SDValue, 8> Ops; 9295 Ops.push_back(N->getOperand(0)); // Incoming chain 9296 // Load lane and store have vector list as input. 9297 if (IsLaneOp || IsStore) 9298 for (unsigned i = 2; i < AddrOpIdx; ++i) 9299 Ops.push_back(N->getOperand(i)); 9300 Ops.push_back(Addr); // Base register 9301 Ops.push_back(Inc); 9302 9303 // Return Types. 9304 EVT Tys[6]; 9305 unsigned NumResultVecs = (IsStore ? 0 : NumVecs); 9306 unsigned n; 9307 for (n = 0; n < NumResultVecs; ++n) 9308 Tys[n] = VecTy; 9309 Tys[n++] = MVT::i64; // Type of write back register 9310 Tys[n] = MVT::Other; // Type of the chain 9311 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 9312 9313 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 9314 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 9315 MemInt->getMemoryVT(), 9316 MemInt->getMemOperand()); 9317 9318 // Update the uses. 9319 std::vector<SDValue> NewResults; 9320 for (unsigned i = 0; i < NumResultVecs; ++i) { 9321 NewResults.push_back(SDValue(UpdN.getNode(), i)); 9322 } 9323 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 9324 DCI.CombineTo(N, NewResults); 9325 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 9326 9327 break; 9328 } 9329 return SDValue(); 9330 } 9331 9332 // Checks to see if the value is the prescribed width and returns information 9333 // about its extension mode. 9334 static 9335 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { 9336 ExtType = ISD::NON_EXTLOAD; 9337 switch(V.getNode()->getOpcode()) { 9338 default: 9339 return false; 9340 case ISD::LOAD: { 9341 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); 9342 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) 9343 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { 9344 ExtType = LoadNode->getExtensionType(); 9345 return true; 9346 } 9347 return false; 9348 } 9349 case ISD::AssertSext: { 9350 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 9351 if ((TypeNode->getVT() == MVT::i8 && width == 8) 9352 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 9353 ExtType = ISD::SEXTLOAD; 9354 return true; 9355 } 9356 return false; 9357 } 9358 case ISD::AssertZext: { 9359 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 9360 if ((TypeNode->getVT() == MVT::i8 && width == 8) 9361 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 9362 ExtType = ISD::ZEXTLOAD; 9363 return true; 9364 } 9365 return false; 9366 } 9367 case ISD::Constant: 9368 case ISD::TargetConstant: { 9369 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 9370 1LL << (width - 1); 9371 } 9372 } 9373 9374 return true; 9375 } 9376 9377 // This function does a whole lot of voodoo to determine if the tests are 9378 // equivalent without and with a mask. 
Essentially, what happens is that given a
9379 // DAG resembling:
9380 //
9381 //  +-------------+ +-------------+ +-------------+ +-------------+
9382 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
9383 //  +-------------+ +-------------+ +-------------+ +-------------+
9384 //         |               |              |              |
9385 //         V               V              |    +----------+
9386 //  +-------------+     +----+            |    |
9387 //  |     ADD     |     |0xff|            |    |
9388 //  +-------------+     +----+            |    |
9389 //         |               |              |    |
9390 //         V               V              |    |
9391 //  +-------------+                       |    |
9392 //  |     AND     |                       |    |
9393 //  +-------------+                       |    |
9394 //         |                              |    |
9395 //         +-----+                        |    |
9396 //               |                        |    |
9397 //               V                        V    V
9398 //         +-------------+
9399 //         |     CMP     |
9400 //         +-------------+
9401 //
9402 // The AND node may be safely removed for some combinations of inputs. In
9403 // particular we need to take into account the extension type of the Input,
9404 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
9405 // width of the input (this can work for inputs of any width; the graph above
9406 // is specific to 8 bits).
9407 //
9408 // The specific equations were worked out by generating output tables for each
9409 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
9410 // problem was simplified by working with 4 bit inputs, which means we only
9411 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
9412 // extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
9413 // patterns present in both extensions (0,7). For every distinct set of
9414 // AddConstant and CompConstant bit patterns we can consider the masked and
9415 // unmasked versions to be equivalent if the result of this function is true for
9416 // all 16 distinct bit patterns for the current extension type of Input (w0).
9417 //
9418 //   sub   w8, w0, w1
9419 //   and   w10, w8, #0x0f
9420 //   cmp   w8, w2
9421 //   cset  w9, AArch64CC
9422 //   cmp   w10, w2
9423 //   cset  w11, AArch64CC
9424 //   cmp   w9, w11
9425 //   cset  w0, eq
9426 //   ret
9427 //
9428 // Since the above function shows when the outputs are equivalent, it defines
9429 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
9430 // would be expensive to run during compiles. The equations below were written
9431 // in a test harness that confirmed they gave equivalent outputs to the above
9432 // function for all inputs, so they can be used to determine if the removal is
9433 // legal instead.
9434 //
9435 // isEquivalentMaskless() is the test for whether the AND can be removed,
9436 // factored out of the DAG recognition because the DAG can take several forms.
9437
9438 static bool isEquivalentMaskless(unsigned CC, unsigned width,
9439                                  ISD::LoadExtType ExtType, int AddConstant,
9440                                  int CompConstant) {
9441   // By being careful about our equations and only writing them in terms of
9442   // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
9443   // make them generally applicable to all bit widths.
9444   int MaxUInt = (1 << width);
9445
9446   // For the purposes of these comparisons sign extending the type is
9447   // equivalent to zero extending the add and displacing it by half the integer
9448   // width. Provided we are careful and make sure our equations are valid over
9449   // the whole range we can just adjust the input and avoid writing equations
9450   // for sign extended inputs.
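  // For example (illustrative, width == 8): MaxUInt is 256, and a sign
  // extended input with AddConstant == 5 is analyzed as the zero extended
  // case with AddConstant == 5 - 128 == -123.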
9451 if (ExtType == ISD::SEXTLOAD) 9452 AddConstant -= (1 << (width-1)); 9453 9454 switch(CC) { 9455 case AArch64CC::LE: 9456 case AArch64CC::GT: { 9457 if ((AddConstant == 0) || 9458 (CompConstant == MaxUInt - 1 && AddConstant < 0) || 9459 (AddConstant >= 0 && CompConstant < 0) || 9460 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) 9461 return true; 9462 } break; 9463 case AArch64CC::LT: 9464 case AArch64CC::GE: { 9465 if ((AddConstant == 0) || 9466 (AddConstant >= 0 && CompConstant <= 0) || 9467 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) 9468 return true; 9469 } break; 9470 case AArch64CC::HI: 9471 case AArch64CC::LS: { 9472 if ((AddConstant >= 0 && CompConstant < 0) || 9473 (AddConstant <= 0 && CompConstant >= -1 && 9474 CompConstant < AddConstant + MaxUInt)) 9475 return true; 9476 } break; 9477 case AArch64CC::PL: 9478 case AArch64CC::MI: { 9479 if ((AddConstant == 0) || 9480 (AddConstant > 0 && CompConstant <= 0) || 9481 (AddConstant < 0 && CompConstant <= AddConstant)) 9482 return true; 9483 } break; 9484 case AArch64CC::LO: 9485 case AArch64CC::HS: { 9486 if ((AddConstant >= 0 && CompConstant <= 0) || 9487 (AddConstant <= 0 && CompConstant >= 0 && 9488 CompConstant <= AddConstant + MaxUInt)) 9489 return true; 9490 } break; 9491 case AArch64CC::EQ: 9492 case AArch64CC::NE: { 9493 if ((AddConstant > 0 && CompConstant < 0) || 9494 (AddConstant < 0 && CompConstant >= 0 && 9495 CompConstant < AddConstant + MaxUInt) || 9496 (AddConstant >= 0 && CompConstant >= 0 && 9497 CompConstant >= AddConstant) || 9498 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) 9499 9500 return true; 9501 } break; 9502 case AArch64CC::VS: 9503 case AArch64CC::VC: 9504 case AArch64CC::AL: 9505 case AArch64CC::NV: 9506 return true; 9507 case AArch64CC::Invalid: 9508 break; 9509 } 9510 9511 return false; 9512 } 9513 9514 static 9515 SDValue performCONDCombine(SDNode *N, 9516 TargetLowering::DAGCombinerInfo &DCI, 9517 SelectionDAG &DAG, unsigned CCIndex, 9518 unsigned CmpIndex) { 9519 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); 9520 SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); 9521 unsigned CondOpcode = SubsNode->getOpcode(); 9522 9523 if (CondOpcode != AArch64ISD::SUBS) 9524 return SDValue(); 9525 9526 // There is a SUBS feeding this condition. Is it fed by a mask we can 9527 // use? 9528 9529 SDNode *AndNode = SubsNode->getOperand(0).getNode(); 9530 unsigned MaskBits = 0; 9531 9532 if (AndNode->getOpcode() != ISD::AND) 9533 return SDValue(); 9534 9535 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { 9536 uint32_t CNV = CN->getZExtValue(); 9537 if (CNV == 255) 9538 MaskBits = 8; 9539 else if (CNV == 65535) 9540 MaskBits = 16; 9541 } 9542 9543 if (!MaskBits) 9544 return SDValue(); 9545 9546 SDValue AddValue = AndNode->getOperand(0); 9547 9548 if (AddValue.getOpcode() != ISD::ADD) 9549 return SDValue(); 9550 9551 // The basic dag structure is correct, grab the inputs and validate them. 9552 9553 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); 9554 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); 9555 SDValue SubsInputValue = SubsNode->getOperand(1); 9556 9557 // The mask is present and the provenance of all the values is a smaller type, 9558 // lets see if the mask is superfluous. 
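  // Both the ADD's constant operand and the SUBS' right-hand operand must be
  // constants; their exact values (together with the mask width and the
  // extension type of the other ADD input) are what isEquivalentMaskless()
  // reasons about below.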
9559 9560 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || 9561 !isa<ConstantSDNode>(SubsInputValue.getNode())) 9562 return SDValue(); 9563 9564 ISD::LoadExtType ExtType; 9565 9566 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || 9567 !checkValueWidth(AddInputValue2, MaskBits, ExtType) || 9568 !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) 9569 return SDValue(); 9570 9571 if(!isEquivalentMaskless(CC, MaskBits, ExtType, 9572 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), 9573 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) 9574 return SDValue(); 9575 9576 // The AND is not necessary, remove it. 9577 9578 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), 9579 SubsNode->getValueType(1)); 9580 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; 9581 9582 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); 9583 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); 9584 9585 return SDValue(N, 0); 9586 } 9587 9588 // Optimize compare with zero and branch. 9589 static SDValue performBRCONDCombine(SDNode *N, 9590 TargetLowering::DAGCombinerInfo &DCI, 9591 SelectionDAG &DAG) { 9592 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) 9593 N = NV.getNode(); 9594 SDValue Chain = N->getOperand(0); 9595 SDValue Dest = N->getOperand(1); 9596 SDValue CCVal = N->getOperand(2); 9597 SDValue Cmp = N->getOperand(3); 9598 9599 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); 9600 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); 9601 if (CC != AArch64CC::EQ && CC != AArch64CC::NE) 9602 return SDValue(); 9603 9604 unsigned CmpOpc = Cmp.getOpcode(); 9605 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) 9606 return SDValue(); 9607 9608 // Only attempt folding if there is only one use of the flag and no use of the 9609 // value. 9610 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) 9611 return SDValue(); 9612 9613 SDValue LHS = Cmp.getOperand(0); 9614 SDValue RHS = Cmp.getOperand(1); 9615 9616 assert(LHS.getValueType() == RHS.getValueType() && 9617 "Expected the value type to be the same for both operands!"); 9618 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 9619 return SDValue(); 9620 9621 if (isNullConstant(LHS)) 9622 std::swap(LHS, RHS); 9623 9624 if (!isNullConstant(RHS)) 9625 return SDValue(); 9626 9627 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || 9628 LHS.getOpcode() == ISD::SRL) 9629 return SDValue(); 9630 9631 // Fold the compare into the branch instruction. 9632 SDValue BR; 9633 if (CC == AArch64CC::EQ) 9634 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 9635 else 9636 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 9637 9638 // Do not add new nodes to DAG combiner worklist. 9639 DCI.CombineTo(N, BR, false); 9640 9641 return SDValue(); 9642 } 9643 9644 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test 9645 // as well as whether the test should be inverted. This code is required to 9646 // catch these cases (as opposed to standard dag combines) because 9647 // AArch64ISD::TBZ is matched during legalization. 9648 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, 9649 SelectionDAG &DAG) { 9650 9651 if (!Op->hasOneUse()) 9652 return Op; 9653 9654 // We don't handle undef/constant-fold cases below, as they should have 9655 // already been taken care of (e.g. 
and of 0, test of undefined shifted bits, 9656 // etc.) 9657 9658 // (tbz (trunc x), b) -> (tbz x, b) 9659 // This case is just here to enable more of the below cases to be caught. 9660 if (Op->getOpcode() == ISD::TRUNCATE && 9661 Bit < Op->getValueType(0).getSizeInBits()) { 9662 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 9663 } 9664 9665 if (Op->getNumOperands() != 2) 9666 return Op; 9667 9668 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 9669 if (!C) 9670 return Op; 9671 9672 switch (Op->getOpcode()) { 9673 default: 9674 return Op; 9675 9676 // (tbz (and x, m), b) -> (tbz x, b) 9677 case ISD::AND: 9678 if ((C->getZExtValue() >> Bit) & 1) 9679 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 9680 return Op; 9681 9682 // (tbz (shl x, c), b) -> (tbz x, b-c) 9683 case ISD::SHL: 9684 if (C->getZExtValue() <= Bit && 9685 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 9686 Bit = Bit - C->getZExtValue(); 9687 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 9688 } 9689 return Op; 9690 9691 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x 9692 case ISD::SRA: 9693 Bit = Bit + C->getZExtValue(); 9694 if (Bit >= Op->getValueType(0).getSizeInBits()) 9695 Bit = Op->getValueType(0).getSizeInBits() - 1; 9696 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 9697 9698 // (tbz (srl x, c), b) -> (tbz x, b+c) 9699 case ISD::SRL: 9700 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 9701 Bit = Bit + C->getZExtValue(); 9702 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 9703 } 9704 return Op; 9705 9706 // (tbz (xor x, -1), b) -> (tbnz x, b) 9707 case ISD::XOR: 9708 if ((C->getZExtValue() >> Bit) & 1) 9709 Invert = !Invert; 9710 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 9711 } 9712 } 9713 9714 // Optimize test single bit zero/non-zero and branch. 9715 static SDValue performTBZCombine(SDNode *N, 9716 TargetLowering::DAGCombinerInfo &DCI, 9717 SelectionDAG &DAG) { 9718 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 9719 bool Invert = false; 9720 SDValue TestSrc = N->getOperand(1); 9721 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); 9722 9723 if (TestSrc == NewTestSrc) 9724 return SDValue(); 9725 9726 unsigned NewOpc = N->getOpcode(); 9727 if (Invert) { 9728 if (NewOpc == AArch64ISD::TBZ) 9729 NewOpc = AArch64ISD::TBNZ; 9730 else { 9731 assert(NewOpc == AArch64ISD::TBNZ); 9732 NewOpc = AArch64ISD::TBZ; 9733 } 9734 } 9735 9736 SDLoc DL(N); 9737 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, 9738 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); 9739 } 9740 9741 // vselect (v1i1 setcc) -> 9742 // vselect (v1iXX setcc) (XX is the size of the compared operand type) 9743 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as 9744 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine 9745 // such VSELECT. 9746 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { 9747 SDValue N0 = N->getOperand(0); 9748 EVT CCVT = N0.getValueType(); 9749 9750 if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || 9751 CCVT.getVectorElementType() != MVT::i1) 9752 return SDValue(); 9753 9754 EVT ResVT = N->getValueType(0); 9755 EVT CmpVT = N0.getOperand(0).getValueType(); 9756 // Only combine when the result type is of the same size as the compared 9757 // operands. 
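  // E.g., a v1i1 setcc of two v1i64 values feeding a vselect of v1f64 operands
  // is rewritten so the vselect condition is a v1i64 setcc instead.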
9758 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) 9759 return SDValue(); 9760 9761 SDValue IfTrue = N->getOperand(1); 9762 SDValue IfFalse = N->getOperand(2); 9763 SDValue SetCC = 9764 DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), 9765 N0.getOperand(0), N0.getOperand(1), 9766 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 9767 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, 9768 IfTrue, IfFalse); 9769 } 9770 9771 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with 9772 /// the compare-mask instructions rather than going via NZCV, even if LHS and 9773 /// RHS are really scalar. This replaces any scalar setcc in the above pattern 9774 /// with a vector one followed by a DUP shuffle on the result. 9775 static SDValue performSelectCombine(SDNode *N, 9776 TargetLowering::DAGCombinerInfo &DCI) { 9777 SelectionDAG &DAG = DCI.DAG; 9778 SDValue N0 = N->getOperand(0); 9779 EVT ResVT = N->getValueType(0); 9780 9781 if (N0.getOpcode() != ISD::SETCC) 9782 return SDValue(); 9783 9784 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered 9785 // scalar SetCCResultType. We also don't expect vectors, because we assume 9786 // that selects fed by vector SETCCs are canonicalized to VSELECT. 9787 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && 9788 "Scalar-SETCC feeding SELECT has unexpected result type!"); 9789 9790 // If NumMaskElts == 0, the comparison is larger than select result. The 9791 // largest real NEON comparison is 64-bits per lane, which means the result is 9792 // at most 32-bits and an illegal vector. Just bail out for now. 9793 EVT SrcVT = N0.getOperand(0).getValueType(); 9794 9795 // Don't try to do this optimization when the setcc itself has i1 operands. 9796 // There are no legal vectors of i1, so this would be pointless. 9797 if (SrcVT == MVT::i1) 9798 return SDValue(); 9799 9800 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); 9801 if (!ResVT.isVector() || NumMaskElts == 0) 9802 return SDValue(); 9803 9804 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); 9805 EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); 9806 9807 // Also bail out if the vector CCVT isn't the same size as ResVT. 9808 // This can happen if the SETCC operand size doesn't divide the ResVT size 9809 // (e.g., f64 vs v3f32). 9810 if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) 9811 return SDValue(); 9812 9813 // Make sure we didn't create illegal types, if we're not supposed to. 9814 assert(DCI.isBeforeLegalize() || 9815 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); 9816 9817 // First perform a vector comparison, where lane 0 is the one we're interested 9818 // in. 9819 SDLoc DL(N0); 9820 SDValue LHS = 9821 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); 9822 SDValue RHS = 9823 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); 9824 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); 9825 9826 // Now duplicate the comparison mask we want across all other lanes. 9827 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); 9828 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); 9829 Mask = DAG.getNode(ISD::BITCAST, DL, 9830 ResVT.changeVectorElementTypeToInteger(), Mask); 9831 9832 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); 9833 } 9834 9835 /// Get rid of unnecessary NVCASTs (that don't change the type). 
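/// E.g., a (v2i32 (AArch64ISD::NVCAST (v2i32 X))) node is simply replaced by X.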
9836 static SDValue performNVCASTCombine(SDNode *N) { 9837 if (N->getValueType(0) == N->getOperand(0).getValueType()) 9838 return N->getOperand(0); 9839 9840 return SDValue(); 9841 } 9842 9843 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, 9844 DAGCombinerInfo &DCI) const { 9845 SelectionDAG &DAG = DCI.DAG; 9846 switch (N->getOpcode()) { 9847 default: 9848 break; 9849 case ISD::ADD: 9850 case ISD::SUB: 9851 return performAddSubLongCombine(N, DCI, DAG); 9852 case ISD::XOR: 9853 return performXorCombine(N, DAG, DCI, Subtarget); 9854 case ISD::MUL: 9855 return performMulCombine(N, DAG, DCI, Subtarget); 9856 case ISD::SINT_TO_FP: 9857 case ISD::UINT_TO_FP: 9858 return performIntToFpCombine(N, DAG, Subtarget); 9859 case ISD::FP_TO_SINT: 9860 case ISD::FP_TO_UINT: 9861 return performFpToIntCombine(N, DAG, DCI, Subtarget); 9862 case ISD::FDIV: 9863 return performFDivCombine(N, DAG, Subtarget); 9864 case ISD::OR: 9865 return performORCombine(N, DCI, Subtarget); 9866 case ISD::SRL: 9867 return performSRLCombine(N, DCI); 9868 case ISD::INTRINSIC_WO_CHAIN: 9869 return performIntrinsicCombine(N, DCI, Subtarget); 9870 case ISD::ANY_EXTEND: 9871 case ISD::ZERO_EXTEND: 9872 case ISD::SIGN_EXTEND: 9873 return performExtendCombine(N, DCI, DAG); 9874 case ISD::BITCAST: 9875 return performBitcastCombine(N, DCI, DAG); 9876 case ISD::CONCAT_VECTORS: 9877 return performConcatVectorsCombine(N, DCI, DAG); 9878 case ISD::SELECT: { 9879 SDValue RV = performSelectCombine(N, DCI); 9880 if (!RV.getNode()) 9881 RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); 9882 return RV; 9883 } 9884 case ISD::VSELECT: 9885 return performVSelectCombine(N, DCI.DAG); 9886 case ISD::LOAD: 9887 if (performTBISimplification(N->getOperand(1), DCI, DAG)) 9888 return SDValue(N, 0); 9889 break; 9890 case ISD::STORE: 9891 return performSTORECombine(N, DCI, DAG, Subtarget); 9892 case AArch64ISD::BRCOND: 9893 return performBRCONDCombine(N, DCI, DAG); 9894 case AArch64ISD::TBNZ: 9895 case AArch64ISD::TBZ: 9896 return performTBZCombine(N, DCI, DAG); 9897 case AArch64ISD::CSEL: 9898 return performCONDCombine(N, DCI, DAG, 2, 3); 9899 case AArch64ISD::DUP: 9900 return performPostLD1Combine(N, DCI, false); 9901 case AArch64ISD::NVCAST: 9902 return performNVCASTCombine(N); 9903 case ISD::INSERT_VECTOR_ELT: 9904 return performPostLD1Combine(N, DCI, true); 9905 case ISD::EXTRACT_VECTOR_ELT: 9906 return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); 9907 case ISD::INTRINSIC_VOID: 9908 case ISD::INTRINSIC_W_CHAIN: 9909 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 9910 case Intrinsic::aarch64_neon_ld2: 9911 case Intrinsic::aarch64_neon_ld3: 9912 case Intrinsic::aarch64_neon_ld4: 9913 case Intrinsic::aarch64_neon_ld1x2: 9914 case Intrinsic::aarch64_neon_ld1x3: 9915 case Intrinsic::aarch64_neon_ld1x4: 9916 case Intrinsic::aarch64_neon_ld2lane: 9917 case Intrinsic::aarch64_neon_ld3lane: 9918 case Intrinsic::aarch64_neon_ld4lane: 9919 case Intrinsic::aarch64_neon_ld2r: 9920 case Intrinsic::aarch64_neon_ld3r: 9921 case Intrinsic::aarch64_neon_ld4r: 9922 case Intrinsic::aarch64_neon_st2: 9923 case Intrinsic::aarch64_neon_st3: 9924 case Intrinsic::aarch64_neon_st4: 9925 case Intrinsic::aarch64_neon_st1x2: 9926 case Intrinsic::aarch64_neon_st1x3: 9927 case Intrinsic::aarch64_neon_st1x4: 9928 case Intrinsic::aarch64_neon_st2lane: 9929 case Intrinsic::aarch64_neon_st3lane: 9930 case Intrinsic::aarch64_neon_st4lane: 9931 return performNEONPostLDSTCombine(N, DCI, DAG); 9932 default: 9933 break; 9934 } 9935 } 
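  // Nothing matched: return an empty SDValue so the generic DAG combiner
  // leaves the node as it is.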
9936 return SDValue(); 9937 } 9938 9939 // Check if the return value is used as only a return value, as otherwise 9940 // we can't perform a tail-call. In particular, we need to check for 9941 // target ISD nodes that are returns and any other "odd" constructs 9942 // that the generic analysis code won't necessarily catch. 9943 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, 9944 SDValue &Chain) const { 9945 if (N->getNumValues() != 1) 9946 return false; 9947 if (!N->hasNUsesOfValue(1, 0)) 9948 return false; 9949 9950 SDValue TCChain = Chain; 9951 SDNode *Copy = *N->use_begin(); 9952 if (Copy->getOpcode() == ISD::CopyToReg) { 9953 // If the copy has a glue operand, we conservatively assume it isn't safe to 9954 // perform a tail call. 9955 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == 9956 MVT::Glue) 9957 return false; 9958 TCChain = Copy->getOperand(0); 9959 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 9960 return false; 9961 9962 bool HasRet = false; 9963 for (SDNode *Node : Copy->uses()) { 9964 if (Node->getOpcode() != AArch64ISD::RET_FLAG) 9965 return false; 9966 HasRet = true; 9967 } 9968 9969 if (!HasRet) 9970 return false; 9971 9972 Chain = TCChain; 9973 return true; 9974 } 9975 9976 // Return whether the an instruction can potentially be optimized to a tail 9977 // call. This will cause the optimizers to attempt to move, or duplicate, 9978 // return instructions to help enable tail call optimizations for this 9979 // instruction. 9980 bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 9981 return CI->isTailCall(); 9982 } 9983 9984 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, 9985 SDValue &Offset, 9986 ISD::MemIndexedMode &AM, 9987 bool &IsInc, 9988 SelectionDAG &DAG) const { 9989 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) 9990 return false; 9991 9992 Base = Op->getOperand(0); 9993 // All of the indexed addressing mode instructions take a signed 9994 // 9 bit immediate offset. 9995 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { 9996 int64_t RHSC = (int64_t)RHS->getZExtValue(); 9997 if (RHSC >= 256 || RHSC <= -256) 9998 return false; 9999 IsInc = (Op->getOpcode() == ISD::ADD); 10000 Offset = Op->getOperand(1); 10001 return true; 10002 } 10003 return false; 10004 } 10005 10006 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 10007 SDValue &Offset, 10008 ISD::MemIndexedMode &AM, 10009 SelectionDAG &DAG) const { 10010 EVT VT; 10011 SDValue Ptr; 10012 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 10013 VT = LD->getMemoryVT(); 10014 Ptr = LD->getBasePtr(); 10015 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 10016 VT = ST->getMemoryVT(); 10017 Ptr = ST->getBasePtr(); 10018 } else 10019 return false; 10020 10021 bool IsInc; 10022 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) 10023 return false; 10024 AM = IsInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 10025 return true; 10026 } 10027 10028 bool AArch64TargetLowering::getPostIndexedAddressParts( 10029 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, 10030 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { 10031 EVT VT; 10032 SDValue Ptr; 10033 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 10034 VT = LD->getMemoryVT(); 10035 Ptr = LD->getBasePtr(); 10036 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 10037 VT = ST->getMemoryVT(); 10038 Ptr = ST->getBasePtr(); 10039 } else 10040 return false; 10041 10042 bool IsInc; 10043 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) 10044 return false; 10045 // Post-indexing updates the base, so it's not a valid transform 10046 // if that's not the same as the load's pointer. 10047 if (Ptr != Base) 10048 return false; 10049 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; 10050 return true; 10051 } 10052 10053 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, 10054 SelectionDAG &DAG) { 10055 SDLoc DL(N); 10056 SDValue Op = N->getOperand(0); 10057 10058 if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) 10059 return; 10060 10061 Op = SDValue( 10062 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 10063 DAG.getUNDEF(MVT::i32), Op, 10064 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 10065 0); 10066 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); 10067 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); 10068 } 10069 10070 static void ReplaceReductionResults(SDNode *N, 10071 SmallVectorImpl<SDValue> &Results, 10072 SelectionDAG &DAG, unsigned InterOp, 10073 unsigned AcrossOp) { 10074 EVT LoVT, HiVT; 10075 SDValue Lo, Hi; 10076 SDLoc dl(N); 10077 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); 10078 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 10079 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); 10080 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); 10081 Results.push_back(SplitVal); 10082 } 10083 10084 static void ReplaceCMP_SWAP_128Results(SDNode *N, 10085 SmallVectorImpl<SDValue> & Results, 10086 SelectionDAG &DAG) { 10087 assert(N->getValueType(0) == MVT::i128 && 10088 "AtomicCmpSwap on types less than 128 should be legal"); 10089 SDValue Ops[] = {N->getOperand(1), 10090 N->getOperand(2)->getOperand(0), 10091 N->getOperand(2)->getOperand(1), 10092 N->getOperand(3)->getOperand(0), 10093 N->getOperand(3)->getOperand(1), 10094 N->getOperand(0)}; 10095 SDNode *CmpSwap = DAG.getMachineNode( 10096 AArch64::CMP_SWAP_128, SDLoc(N), 10097 DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); 10098 10099 MachineFunction &MF = DAG.getMachineFunction(); 10100 MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); 10101 MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); 10102 cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); 10103 10104 Results.push_back(SDValue(CmpSwap, 0)); 10105 Results.push_back(SDValue(CmpSwap, 1)); 10106 Results.push_back(SDValue(CmpSwap, 3)); 10107 } 10108 10109 void AArch64TargetLowering::ReplaceNodeResults( 10110 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 10111 switch (N->getOpcode()) { 10112 default: 10113 llvm_unreachable("Don't know how to custom expand this"); 10114 case ISD::BITCAST: 10115 ReplaceBITCASTResults(N, Results, DAG); 10116 return; 10117 case AArch64ISD::SADDV: 10118 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); 10119 return; 10120 case AArch64ISD::UADDV: 10121 
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); 10122 return; 10123 case AArch64ISD::SMINV: 10124 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); 10125 return; 10126 case AArch64ISD::UMINV: 10127 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); 10128 return; 10129 case AArch64ISD::SMAXV: 10130 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); 10131 return; 10132 case AArch64ISD::UMAXV: 10133 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); 10134 return; 10135 case ISD::FP_TO_UINT: 10136 case ISD::FP_TO_SINT: 10137 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); 10138 // Let normal code take care of it by not adding anything to Results. 10139 return; 10140 case ISD::ATOMIC_CMP_SWAP: 10141 ReplaceCMP_SWAP_128Results(N, Results, DAG); 10142 return; 10143 } 10144 } 10145 10146 bool AArch64TargetLowering::useLoadStackGuardNode() const { 10147 if (!Subtarget->isTargetAndroid()) 10148 return true; 10149 return TargetLowering::useLoadStackGuardNode(); 10150 } 10151 10152 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { 10153 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 10154 // reciprocal if there are three or more FDIVs. 10155 return 3; 10156 } 10157 10158 TargetLoweringBase::LegalizeTypeAction 10159 AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { 10160 MVT SVT = VT.getSimpleVT(); 10161 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, 10162 // v4i16, v2i32 instead of to promote. 10163 if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 10164 || SVT == MVT::v1f32) 10165 return TypeWidenVector; 10166 10167 return TargetLoweringBase::getPreferredVectorAction(VT); 10168 } 10169 10170 // Loads and stores less than 128-bits are already atomic; ones above that 10171 // are doomed anyway, so defer to the default libcall and blame the OS when 10172 // things go wrong. 10173 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 10174 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 10175 return Size == 128; 10176 } 10177 10178 // Loads and stores less than 128-bits are already atomic; ones above that 10179 // are doomed anyway, so defer to the default libcall and blame the OS when 10180 // things go wrong. 10181 TargetLowering::AtomicExpansionKind 10182 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 10183 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 10184 return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; 10185 } 10186 10187 // For the real atomic operations, we have ldxr/stxr up to 128 bits, 10188 TargetLowering::AtomicExpansionKind 10189 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 10190 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 10191 return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; 10192 } 10193 10194 bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( 10195 AtomicCmpXchgInst *AI) const { 10196 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 10197 // implement cmpxchg without spilling. If the address being exchanged is also 10198 // on the stack and close enough to the spill slot, this can lead to a 10199 // situation where the monitor always gets cleared and the atomic operation 10200 // can never succeed. 
So at -O0 we need a late-expanded pseudo-inst instead. 10201 return getTargetMachine().getOptLevel() != 0; 10202 } 10203 10204 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 10205 AtomicOrdering Ord) const { 10206 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 10207 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 10208 bool IsAcquire = isAcquireOrStronger(Ord); 10209 10210 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd 10211 // intrinsic must return {i64, i64} and we have to recombine them into a 10212 // single i128 here. 10213 if (ValTy->getPrimitiveSizeInBits() == 128) { 10214 Intrinsic::ID Int = 10215 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; 10216 Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); 10217 10218 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 10219 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); 10220 10221 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 10222 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 10223 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 10224 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 10225 return Builder.CreateOr( 10226 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); 10227 } 10228 10229 Type *Tys[] = { Addr->getType() }; 10230 Intrinsic::ID Int = 10231 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; 10232 Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); 10233 10234 return Builder.CreateTruncOrBitCast( 10235 Builder.CreateCall(Ldxr, Addr), 10236 cast<PointerType>(Addr->getType())->getElementType()); 10237 } 10238 10239 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 10240 IRBuilder<> &Builder) const { 10241 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 10242 Builder.CreateCall( 10243 llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); 10244 } 10245 10246 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, 10247 Value *Val, Value *Addr, 10248 AtomicOrdering Ord) const { 10249 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 10250 bool IsRelease = isReleaseOrStronger(Ord); 10251 10252 // Since the intrinsics must have legal type, the i128 intrinsics take two 10253 // parameters: "i64, i64". We must marshal Val into the appropriate form 10254 // before the call. 10255 if (Val->getType()->getPrimitiveSizeInBits() == 128) { 10256 Intrinsic::ID Int = 10257 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; 10258 Function *Stxr = Intrinsic::getDeclaration(M, Int); 10259 Type *Int64Ty = Type::getInt64Ty(M->getContext()); 10260 10261 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); 10262 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); 10263 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 10264 return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); 10265 } 10266 10267 Intrinsic::ID Int = 10268 IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
10269   Type *Tys[] = { Addr->getType() };
10270   Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
10271
10272   return Builder.CreateCall(Stxr,
10273                             {Builder.CreateZExtOrBitCast(
10274                                  Val, Stxr->getFunctionType()->getParamType(0)),
10275                              Addr});
10276 }
10277
10278 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
10279     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
10280   return Ty->isArrayTy();
10281 }
10282
10283 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
10284                                                             EVT) const {
10285   return false;
10286 }
10287
10288 Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
10289   if (!Subtarget->isTargetAndroid())
10290     return TargetLowering::getIRStackGuard(IRB);
10291
10292   // Android provides a fixed TLS slot for the stack cookie. See the definition
10293   // of TLS_SLOT_STACK_GUARD in
10294   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
10295   const unsigned TlsOffset = 0x28;
10296   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
10297   Function *ThreadPointerFunc =
10298       Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
10299   return IRB.CreatePointerCast(
10300       IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
10301       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
10302 }
10303
10304 Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
10305   if (!Subtarget->isTargetAndroid())
10306     return TargetLowering::getSafeStackPointerLocation(IRB);
10307
10308   // Android provides a fixed TLS slot for the SafeStack pointer. See the
10309   // definition of TLS_SLOT_SAFESTACK in
10310   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
10311   const unsigned TlsOffset = 0x48;
10312   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
10313   Function *ThreadPointerFunc =
10314       Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
10315   return IRB.CreatePointerCast(
10316       IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
10317       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
10318 }
10319
10320 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
10321   // Update IsSplitCSR in AArch64FunctionInfo.
10322   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
10323   AFI->setIsSplitCSR(true);
10324 }
10325
10326 void AArch64TargetLowering::insertCopiesSplitCSR(
10327     MachineBasicBlock *Entry,
10328     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
10329   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10330   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
10331   if (!IStart)
10332     return;
10333
10334   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10335   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
10336   MachineBasicBlock::iterator MBBI = Entry->begin();
10337   for (const MCPhysReg *I = IStart; *I; ++I) {
10338     const TargetRegisterClass *RC = nullptr;
10339     if (AArch64::GPR64RegClass.contains(*I))
10340       RC = &AArch64::GPR64RegClass;
10341     else if (AArch64::FPR64RegClass.contains(*I))
10342       RC = &AArch64::FPR64RegClass;
10343     else
10344       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10345
10346     unsigned NewVR = MRI->createVirtualRegister(RC);
10347     // Create copy from CSR to a virtual register.
10348 // FIXME: this currently does not emit CFI pseudo-instructions, it works 10349 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 10350 // nounwind. If we want to generalize this later, we may need to emit 10351 // CFI pseudo-instructions. 10352 assert(Entry->getParent()->getFunction()->hasFnAttribute( 10353 Attribute::NoUnwind) && 10354 "Function should be nounwind in insertCopiesSplitCSR!"); 10355 Entry->addLiveIn(*I); 10356 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 10357 .addReg(*I); 10358 10359 // Insert the copy-back instructions right before the terminator. 10360 for (auto *Exit : Exits) 10361 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 10362 TII->get(TargetOpcode::COPY), *I) 10363 .addReg(NewVR); 10364 } 10365 } 10366 10367 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { 10368 // Integer division on AArch64 is expensive. However, when aggressively 10369 // optimizing for code size, we prefer to use a div instruction, as it is 10370 // usually smaller than the alternative sequence. 10371 // The exception to this is vector division. Since AArch64 doesn't have vector 10372 // integer division, leaving the division as-is is a loss even in terms of 10373 // size, because it will have to be scalarized, while the alternative code 10374 // sequence can be performed in vector form. 10375 bool OptSize = 10376 Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); 10377 return OptSize && !VT.isVector(); 10378 } 10379
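// An illustrative consequence of the hook above (assuming the generic
// combiner's usual divide-by-constant rewrite): with minsize set, a scalar
// division such as "x / 7" keeps its sdiv instruction, while a v4i32 division
// is still rewritten into the multiply-based sequence, which can remain in
// vector form.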