//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");

// Place holder until extr generation is tested fully.
static cl::opt<bool>
EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
                            cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
                            cl::init(true));

static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
                           cl::desc("Allow AArch64 SLI/SRI formation"),
                           cl::init(false));

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {

  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
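  // For illustration: with these settings a scalar i1 "true" is materialized
  // as the integer 1 (typically via CSET/CSINC), while vector comparisons such
  // as CMEQ set each passing lane to all-ones and each failing lane to zero,
  // which is what ZeroOrNegativeOneBooleanContent describes.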
  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
  }

  // Compute derived properties from the register classes.
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions.
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, Custom);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, Custom);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries.
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress.
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
  setOperationAction(ISD::ADDC, MVT::i32, Custom);
  setOperationAction(ISD::ADDE, MVT::i32, Custom);
  setOperationAction(ISD::SUBC, MVT::i32, Custom);
  setOperationAction(ISD::SUBE, MVT::i32, Custom);
  setOperationAction(ISD::ADDC, MVT::i64, Custom);
  setOperationAction(ISD::ADDE, MVT::i64, Custom);
  setOperationAction(ISD::SUBC, MVT::i64, Custom);
  setOperationAction(ISD::SUBE, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  // Expand the undefined-at-zero variants of cttz/ctlz to their defined-at-zero
  // counterparts, which AArch64 supports directly.
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

  // f16 is a storage-only type, always promote it to f32.
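  // For illustration: without native f16 arithmetic an f16 add is expected to
  // come out roughly as
  //   fcvt s0, h0
  //   fcvt s1, h1
  //   fadd s0, s0, s1
  //   fcvt h0, s0
  // i.e. widen to f32, operate, then narrow back to f16.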
  setOperationAction(ISD::SETCC, MVT::f16, Promote);
  setOperationAction(ISD::BR_CC, MVT::f16, Promote);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
  setOperationAction(ISD::SELECT, MVT::f16, Promote);
  setOperationAction(ISD::FADD, MVT::f16, Promote);
  setOperationAction(ISD::FSUB, MVT::f16, Promote);
  setOperationAction(ISD::FMUL, MVT::f16, Promote);
  setOperationAction(ISD::FDIV, MVT::f16, Promote);
  setOperationAction(ISD::FREM, MVT::f16, Promote);
  setOperationAction(ISD::FMA, MVT::f16, Promote);
  setOperationAction(ISD::FNEG, MVT::f16, Promote);
  setOperationAction(ISD::FABS, MVT::f16, Promote);
  setOperationAction(ISD::FCEIL, MVT::f16, Promote);
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
  setOperationAction(ISD::FCOS, MVT::f16, Promote);
  setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
  setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
  setOperationAction(ISD::FPOW, MVT::f16, Promote);
  setOperationAction(ISD::FPOWI, MVT::f16, Promote);
  setOperationAction(ISD::FRINT, MVT::f16, Promote);
  setOperationAction(ISD::FSIN, MVT::f16, Promote);
  setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
  setOperationAction(ISD::FSQRT, MVT::f16, Promote);
  setOperationAction(ISD::FEXP, MVT::f16, Promote);
  setOperationAction(ISD::FEXP2, MVT::f16, Promote);
  setOperationAction(ISD::FLOG, MVT::f16, Promote);
  setOperationAction(ISD::FLOG2, MVT::f16, Promote);
  setOperationAction(ISD::FLOG10, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
  setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
  setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
  setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
  setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);

  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
  // known to be safe.
  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
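  // For illustration: promoting e.g. a v4f16 FADD means the operands are
  // widened with fcvtl to v4f32, a single v4f32 fadd is used, and the result
  // is narrowed back with fcvtn, instead of scalarizing four f16 additions.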
  // Expand all other v4f16 operations.
  // FIXME: We could generate better code by promoting some operations to
  // a pair of v4f32s.
  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);

  // v8f16 is also a storage-only type, so expand it.
  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

  // AArch64 has implementations of a lot of rounding-like FP operations.
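  // Roughly, the rounding nodes below map onto the FRINT* family:
  // FFLOOR -> frintm, FCEIL -> frintp, FTRUNC -> frintz, FROUND -> frinta,
  // FRINT -> frintx and FNEARBYINT -> frinti; FMINNUM/FMAXNUM and
  // FMINNAN/FMAXNAN correspond to fminnm/fmaxnm and fmin/fmax respectively.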
  for (MVT Ty : {MVT::f32, MVT::f64}) {
    setOperationAction(ISD::FFLOOR, Ty, Legal);
    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    setOperationAction(ISD::FCEIL, Ty, Legal);
    setOperationAction(ISD::FRINT, Ty, Legal);
    setOperationAction(ISD::FTRUNC, Ty, Legal);
    setOperationAction(ISD::FROUND, Ty, Legal);
    setOperationAction(ISD::FMINNUM, Ty, Legal);
    setOperationAction(ISD::FMAXNUM, Ty, Legal);
    setOperationAction(ISD::FMINNAN, Ty, Legal);
    setOperationAction(ISD::FMAXNAN, Ty, Legal);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
  // This requires the Performance Monitors extension.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (Subtarget->isTargetMachO()) {
    // For iOS, we don't want the normal expansion of a libcall to
    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
    // traffic.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
  }
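  // For illustration, "indexed" above refers to the pre/post-indexed
  // addressing modes, e.g.
  //   ldr x0, [x1, #16]!   (pre-indexed:  x1 += 16, then load)
  //   str w0, [x1], #4     (post-indexed: store, then x1 += 4)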
  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);

  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::FP_TO_UINT);
  setTargetDAGCombine(ISD::FDIV);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::CONCAT_VECTORS);
  setTargetDAGCombine(ISD::STORE);
  if (Subtarget->supportsAddressTopByteIgnored())
    setTargetDAGCombine(ISD::LOAD);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::VSELECT);

  setTargetDAGCombine(ISD::INTRINSIC_VOID);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  // Enable TBZ/TBNZ.
  MaskAndBranchFoldingIsLegal = true;
  EnableExtLdPromotion = true;

  setMinFunctionAlignment(2);

  setHasExtractBitsInsn(true);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

    setOperationAction(ISD::MUL, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
    // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
    // -> v8f16 conversions.
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
    // Or, direct i32 -> f16 vector conversion. Set it to custom, so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
      setOperationAction(ISD::FRINT, Ty, Legal);
      setOperationAction(ISD::FTRUNC, Ty, Legal);
      setOperationAction(ISD::FROUND, Ty, Legal);
    }
  }

  // Prefer likely predicted branches to selects on out-of-order cores.
  if (Subtarget->isCortexA57())
    PredictableSelectIsExpensive = true;
}

void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);

    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);

    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
  }

  // Mark vector float intrinsics as expand.
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);

    // But we do support custom-lowering for FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom);
  }

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);

  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
  for (MVT InnerVT : MVT::all_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);

  // CNT supports only B element sizes.
  if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);

  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);

  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
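  // For element types of matching width (e.g. v4f32 -> v4i32, v2f64 -> v2i64)
  // these conversions correspond directly to fcvtzs/fcvtzu; the custom hook is
  // presumably mostly there for conversions whose source and result element
  // widths differ.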
  // [SU][MIN|MAX] are available for all NEON types apart from i64.
  if (!VT.isFloatingPoint() &&
      VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT.getSimpleVT(), Legal);

  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
  if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
                            ISD::FMINNUM, ISD::FMAXNUM})
      setOperationAction(Opcode, VT.getSimpleVT(), Legal);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
    }
  }
}

void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT, MVT::v2i32);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT, MVT::v4i32);
}

EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                              EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
    const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::CSEL: {
    APInt KnownZero2, KnownOne2;
    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
    KnownZero &= KnownZero2;
    KnownOne &= KnownOne2;
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = KnownOne.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarType().getSizeInBits();
      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = KnownZero.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        KnownZero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        KnownZero |= Mask;
      }
      break;
    } break;
    }
  }
  }
}

MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
                                                  EVT) const {
  return MVT::i64;
}

bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                           unsigned AddrSpace,
                                                           unsigned Align,
                                                           bool *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  // FIXME: This is mostly true for Cyclone, but not necessarily others.
  if (Fast) {
    // FIXME: Define an attribute for slow unaligned accesses instead of
    // relying on the CPU type as a proxy.
    // On Cyclone, unaligned 128-bit stores are slow.
    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
            // See comments in performSTORECombine() for more details about
            // these conditions.

            // Code that uses clang vector extensions can mark that it
            // wants unaligned accesses to be treated as fast by
            // underspecifying alignment to be 1 or 2.
            Align <= 2 ||

            // Disregard v2i64. Memcpy lowering produces those and splitting
            // them regresses performance on micro-benchmarks and olden/bh.
            VT == MVT::v2i64;
  }
  return true;
}

FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  return AArch64::createFastISel(funcInfo, libInfo);
}

const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((AArch64ISD::NodeType)Opcode) {
  case AArch64ISD::FIRST_NUMBER:      break;
  case AArch64ISD::CALL:              return "AArch64ISD::CALL";
  case AArch64ISD::ADRP:              return "AArch64ISD::ADRP";
  case AArch64ISD::ADDlow:            return "AArch64ISD::ADDlow";
  case AArch64ISD::LOADgot:           return "AArch64ISD::LOADgot";
  case AArch64ISD::RET_FLAG:          return "AArch64ISD::RET_FLAG";
  case AArch64ISD::BRCOND:            return "AArch64ISD::BRCOND";
  case AArch64ISD::CSEL:              return "AArch64ISD::CSEL";
  case AArch64ISD::FCSEL:             return "AArch64ISD::FCSEL";
  case AArch64ISD::CSINV:             return "AArch64ISD::CSINV";
  case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
  case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
  case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
  case AArch64ISD::TLSDESC_CALLSEQ:   return "AArch64ISD::TLSDESC_CALLSEQ";
  case AArch64ISD::ADC:               return "AArch64ISD::ADC";
  case AArch64ISD::SBC:               return "AArch64ISD::SBC";
  case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
  case AArch64ISD::SUBS:              return "AArch64ISD::SUBS";
  case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
  case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
  case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
  case AArch64ISD::CCMP:              return "AArch64ISD::CCMP";
  case AArch64ISD::CCMN:              return "AArch64ISD::CCMN";
  case AArch64ISD::FCCMP:             return "AArch64ISD::FCCMP";
  case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
  case AArch64ISD::DUP:               return "AArch64ISD::DUP";
  case AArch64ISD::DUPLANE8:          return "AArch64ISD::DUPLANE8";
  case AArch64ISD::DUPLANE16:         return "AArch64ISD::DUPLANE16";
  case AArch64ISD::DUPLANE32:         return "AArch64ISD::DUPLANE32";
  case AArch64ISD::DUPLANE64:         return "AArch64ISD::DUPLANE64";
  case AArch64ISD::MOVI:              return "AArch64ISD::MOVI";
  case AArch64ISD::MOVIshift:         return "AArch64ISD::MOVIshift";
  case AArch64ISD::MOVIedit:          return "AArch64ISD::MOVIedit";
  case AArch64ISD::MOVImsl:           return "AArch64ISD::MOVImsl";
  case AArch64ISD::FMOV:              return "AArch64ISD::FMOV";
  case AArch64ISD::MVNIshift:         return "AArch64ISD::MVNIshift";
  case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
  case AArch64ISD::BICi:              return "AArch64ISD::BICi";
  case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
  case AArch64ISD::BSL:               return "AArch64ISD::BSL";
  case AArch64ISD::NEG:               return "AArch64ISD::NEG";
  case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
  case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
  case AArch64ISD::ZIP2:              return "AArch64ISD::ZIP2";
  case AArch64ISD::UZP1:              return "AArch64ISD::UZP1";
  case AArch64ISD::UZP2:              return "AArch64ISD::UZP2";
  case AArch64ISD::TRN1:              return "AArch64ISD::TRN1";
  case AArch64ISD::TRN2:              return "AArch64ISD::TRN2";
  case AArch64ISD::REV16:             return "AArch64ISD::REV16";
  case AArch64ISD::REV32:             return "AArch64ISD::REV32";
  case AArch64ISD::REV64:             return "AArch64ISD::REV64";
  case AArch64ISD::EXT:               return "AArch64ISD::EXT";
  case AArch64ISD::VSHL:              return "AArch64ISD::VSHL";
  case AArch64ISD::VLSHR:             return "AArch64ISD::VLSHR";
  case AArch64ISD::VASHR:             return "AArch64ISD::VASHR";
  case AArch64ISD::CMEQ:              return "AArch64ISD::CMEQ";
  case AArch64ISD::CMGE:              return "AArch64ISD::CMGE";
  case AArch64ISD::CMGT:              return "AArch64ISD::CMGT";
  case AArch64ISD::CMHI:              return "AArch64ISD::CMHI";
  case AArch64ISD::CMHS:              return "AArch64ISD::CMHS";
  case AArch64ISD::FCMEQ:             return "AArch64ISD::FCMEQ";
  case AArch64ISD::FCMGE:             return "AArch64ISD::FCMGE";
  case AArch64ISD::FCMGT:             return "AArch64ISD::FCMGT";
  case AArch64ISD::CMEQz:             return "AArch64ISD::CMEQz";
  case AArch64ISD::CMGEz:             return "AArch64ISD::CMGEz";
  case AArch64ISD::CMGTz:             return "AArch64ISD::CMGTz";
  case AArch64ISD::CMLEz:             return "AArch64ISD::CMLEz";
  case AArch64ISD::CMLTz:             return "AArch64ISD::CMLTz";
  case AArch64ISD::FCMEQz:            return "AArch64ISD::FCMEQz";
  case AArch64ISD::FCMGEz:            return "AArch64ISD::FCMGEz";
  case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
  case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
  case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
  case AArch64ISD::SADDV:             return "AArch64ISD::SADDV";
  case AArch64ISD::UADDV:             return "AArch64ISD::UADDV";
  case AArch64ISD::SMINV:             return "AArch64ISD::SMINV";
  case AArch64ISD::UMINV:             return "AArch64ISD::UMINV";
  case AArch64ISD::SMAXV:             return "AArch64ISD::SMAXV";
  case AArch64ISD::UMAXV:             return "AArch64ISD::UMAXV";
  case AArch64ISD::NOT:               return "AArch64ISD::NOT";
  case AArch64ISD::BIT:               return "AArch64ISD::BIT";
  case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
  case AArch64ISD::CBNZ:              return "AArch64ISD::CBNZ";
  case AArch64ISD::TBZ:               return "AArch64ISD::TBZ";
  case AArch64ISD::TBNZ:              return "AArch64ISD::TBNZ";
  case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
  case AArch64ISD::PREFETCH:          return "AArch64ISD::PREFETCH";
  case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
  case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
  case AArch64ISD::NVCAST:            return "AArch64ISD::NVCAST";
  case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
  case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
  case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
  case AArch64ISD::URSHR_I:           return "AArch64ISD::URSHR_I";
  case AArch64ISD::SQSHLU_I:          return "AArch64ISD::SQSHLU_I";
  case AArch64ISD::WrapperLarge:      return "AArch64ISD::WrapperLarge";
  case AArch64ISD::LD2post:           return "AArch64ISD::LD2post";
  case AArch64ISD::LD3post:           return "AArch64ISD::LD3post";
  case AArch64ISD::LD4post:           return "AArch64ISD::LD4post";
  case AArch64ISD::ST2post:           return "AArch64ISD::ST2post";
  case AArch64ISD::ST3post:           return "AArch64ISD::ST3post";
  case AArch64ISD::ST4post:           return "AArch64ISD::ST4post";
  case AArch64ISD::LD1x2post:         return "AArch64ISD::LD1x2post";
  case AArch64ISD::LD1x3post:         return "AArch64ISD::LD1x3post";
  case AArch64ISD::LD1x4post:         return "AArch64ISD::LD1x4post";
  case AArch64ISD::ST1x2post:         return "AArch64ISD::ST1x2post";
  case AArch64ISD::ST1x3post:         return "AArch64ISD::ST1x3post";
  case AArch64ISD::ST1x4post:         return "AArch64ISD::ST1x4post";
  case AArch64ISD::LD1DUPpost:        return "AArch64ISD::LD1DUPpost";
  case AArch64ISD::LD2DUPpost:        return "AArch64ISD::LD2DUPpost";
  case AArch64ISD::LD3DUPpost:        return "AArch64ISD::LD3DUPpost";
  case AArch64ISD::LD4DUPpost:        return "AArch64ISD::LD4DUPpost";
  case AArch64ISD::LD1LANEpost:       return "AArch64ISD::LD1LANEpost";
  case AArch64ISD::LD2LANEpost:       return "AArch64ISD::LD2LANEpost";
  case AArch64ISD::LD3LANEpost:       return "AArch64ISD::LD3LANEpost";
  case AArch64ISD::LD4LANEpost:       return "AArch64ISD::LD4LANEpost";
  case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
  case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
  case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
  case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
  case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
  }
  return nullptr;
}

MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction as some control flow and a
  // phi node:
  //
  // OrigBB:
  //     [... previous instrs leading to comparison ...]
  //     b.ne TrueBB
  //     b EndBB
  // TrueBB:
  //     ; Fallthrough
  // EndBB:
  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction::iterator It = ++MBB->getIterator();

  unsigned DestReg = MI->getOperand(0).getReg();
  unsigned IfTrueReg = MI->getOperand(1).getReg();
  unsigned IfFalseReg = MI->getOperand(2).getReg();
  unsigned CondCode = MI->getOperand(3).getImm();
  bool NZCVKilled = MI->getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB.
  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // TrueBB falls through to the end.
  TrueBB->addSuccessor(EndBB);

  if (!NZCVKilled) {
    TrueBB->addLiveIn(AArch64::NZCV);
    EndBB->addLiveIn(AArch64::NZCV);
  }

  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
      .addReg(IfTrueReg)
      .addMBB(TrueBB)
      .addReg(IfFalseReg)
      .addMBB(MBB);

  MI->eraseFromParent();
  return EndBB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                   MachineBasicBlock *BB) const {
  switch (MI->getOpcode()) {
  default:
#ifndef NDEBUG
    MI->dump();
#endif
    llvm_unreachable("Unexpected instruction for custom inserter!");

  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, BB);

  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC.
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  default:
    llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:
    return AArch64CC::NE;
  case ISD::SETEQ:
    return AArch64CC::EQ;
  case ISD::SETGT:
    return AArch64CC::GT;
  case ISD::SETGE:
    return AArch64CC::GE;
  case ISD::SETLT:
    return AArch64CC::LT;
  case ISD::SETLE:
    return AArch64CC::LE;
  case ISD::SETUGT:
    return AArch64CC::HI;
  case ISD::SETUGE:
    return AArch64CC::HS;
  case ISD::SETULT:
    return AArch64CC::LO;
  case ISD::SETULE:
    return AArch64CC::LS;
  }
}
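// For illustration: with this mapping an unsigned "a < b" (ISD::SETULT) ends up
// as a SUBS followed by a test of the LO condition, e.g.
//   cmp  w0, w1
//   cset w2, lo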
/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
                                  AArch64CC::CondCode &CondCode,
                                  AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = AArch64CC::EQ;
    break;
  case ISD::SETGT:
  case ISD::SETOGT:
    CondCode = AArch64CC::GT;
    break;
  case ISD::SETGE:
  case ISD::SETOGE:
    CondCode = AArch64CC::GE;
    break;
  case ISD::SETOLT:
    CondCode = AArch64CC::MI;
    break;
  case ISD::SETOLE:
    CondCode = AArch64CC::LS;
    break;
  case ISD::SETONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case ISD::SETO:
    CondCode = AArch64CC::VC;
    break;
  case ISD::SETUO:
    CondCode = AArch64CC::VS;
    break;
  case ISD::SETUEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case ISD::SETUGT:
    CondCode = AArch64CC::HI;
    break;
  case ISD::SETUGE:
    CondCode = AArch64CC::PL;
    break;
  case ISD::SETLT:
  case ISD::SETULT:
    CondCode = AArch64CC::LT;
    break;
  case ISD::SETLE:
  case ISD::SETULE:
    CondCode = AArch64CC::LE;
    break;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                        AArch64CC::CondCode &CondCode,
                                        AArch64CC::CondCode &CondCode2,
                                        bool &Invert) {
  Invert = false;
  switch (CC) {
  default:
    // Mostly the scalar mappings work fine.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    break;
  case ISD::SETUO:
    Invert = true; // Fallthrough
  case ISD::SETO:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GE;
    break;
  case ISD::SETUEQ:
  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // All of the compare-mask comparisons are ordered, but we can switch
    // between the two by a double inversion. E.g. ULE == !OGT.
    Invert = true;
    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
    break;
  }
}

static bool isLegalArithImmed(uint64_t C) {
  // Matches AArch64DAGToDAGISel::SelectArithImmed().
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}
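// For illustration: this accepts the ADD/SUB immediate encoding of a 12-bit
// value optionally shifted left by 12 bits, so 0xfff and 0x123000 are legal
// arithmetic immediates while 0x1001 is not.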
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDLoc dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();

  if (VT.isFloatingPoint())
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
  unsigned Opcode = AArch64ISD::SUBS;

  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
    // can be set differently by this operation. It comes down to whether
    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
    // everything is fine. If not then the optimization is wrong. Thus general
    // comparisons are only valid if op2 != 0.

    // So, finally, the only LLVM-native comparisons that don't mention C and V
    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
    // the absence of information about op2.
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
             !isUnsignedIntSetCC(CC)) {
    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
    // of the signed comparisons.
    Opcode = AArch64ISD::ANDS;
    RHS = LHS.getOperand(1);
    LHS = LHS.getOperand(0);
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
      .getValue(1);
}

/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows expressing arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
///   cmp A
///   ccmp B, inv(CB), CA
///   check for CB flags
///
/// In general we can create code for arbitrary "... (and (and A B) C)"
/// sequences. We can also implement some "or" expressions, because "(or A B)"
/// is equivalent to "not (and (not A) (not B))" and we can implement some
/// negation operations:
/// We can negate the results of a single comparison by inverting the flags
/// used when the predicate fails and inverting the flags tested in the next
/// instruction; We can also negate the results of the whole previous
/// conditional compare sequence by inverting the flags tested in the next
/// instruction. However there is no way to negate the result of a partial
/// sequence.
///
/// Therefore on encountering an "or" expression we can negate the subtree on
/// one side and have to be able to push the negate to the leaves of the subtree
/// on the other side (see also the comments in code). As complete example:
///   "or (or (setCA (cmp A)) (setCB (cmp B)))
///       (and (setCC (cmp C)) (setCD (cmp D)))"
/// is transformed to
///   "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
///             (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
/// and implemented as:
///   cmp C
///   ccmp D, inv(CD), CC
///   ccmp A, CA, inv(CD)
///   ccmp B, CB, inv(CA)
///   check for CB flags
/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
/// by conditional compare sequences.
/// @{
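// For illustration, a conjunction such as "a == 0 && b == 5" can be emitted as
//   cmp  w0, #0
//   ccmp w1, #5, #0, eq   // if eq: compare b with 5, else force NZCV to 0000
//   b.eq taken
// where the forced flags make the final test fail whenever "a == 0" did.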
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                         ISD::CondCode CC, SDValue CCOp,
                                         SDValue Condition, unsigned NZCV,
                                         SDLoc DL, SelectionDAG &DAG) {
  unsigned Opcode = 0;
  if (LHS.getValueType().isFloatingPoint())
    Opcode = AArch64ISD::FCCMP;
  else if (RHS.getOpcode() == ISD::SUB) {
    SDValue SubOp0 = RHS.getOperand(0);
    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      // See emitComparison() on why we can only do this for SETEQ and SETNE.
      Opcode = AArch64ISD::CCMN;
      RHS = RHS.getOperand(1);
    }
  }
  if (Opcode == 0)
    Opcode = AArch64ISD::CCMP;

  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}

/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
/// CanPushNegate is set to true if we can push a negate operation through
/// the tree in a way that we are left with AND operations and negate operations
/// at the leaves only, i.e. "not (or (or x y) z)" can be changed to
/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
/// brought into such a form.
static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
                                         unsigned Depth = 0) {
  if (!Val.hasOneUse())
    return false;
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    CanPushNegate = true;
    return true;
  }
  // Protect against stack overflow.
  if (Depth > 15)
    return false;
  if (Opcode == ISD::AND || Opcode == ISD::OR) {
    SDValue O0 = Val->getOperand(0);
    SDValue O1 = Val->getOperand(1);
    bool CanPushNegateL;
    if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
      return false;
    bool CanPushNegateR;
    if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
      return false;
    // We cannot push a negate through an AND operation (it would become an OR),
    // we can however change a (not (or x y)) to (and (not x) (not y)) if we can
    // push the negate through the x/y subtrees.
    CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
    return true;
  }
  return false;
}

/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// the transformation was not possible.
/// On recursive invocations @p PushNegate may be set to true to have negation
/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
/// for the comparisons in the current subtree; @p Depth limits the search
/// depth to avoid stack overflow.
static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
    AArch64CC::CondCode &OutCC, bool PushNegate = false,
    SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
    unsigned Depth = 0) {
  // We're at a tree leaf, produce a conditional comparison operation.
1332 unsigned Opcode = Val->getOpcode(); 1333 if (Opcode == ISD::SETCC) { 1334 SDValue LHS = Val->getOperand(0); 1335 SDValue RHS = Val->getOperand(1); 1336 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); 1337 bool isInteger = LHS.getValueType().isInteger(); 1338 if (PushNegate) 1339 CC = getSetCCInverse(CC, isInteger); 1340 SDLoc DL(Val); 1341 // Determine OutCC and handle FP special case. 1342 if (isInteger) { 1343 OutCC = changeIntCCToAArch64CC(CC); 1344 } else { 1345 assert(LHS.getValueType().isFloatingPoint()); 1346 AArch64CC::CondCode ExtraCC; 1347 changeFPCCToAArch64CC(CC, OutCC, ExtraCC); 1348 // Surprisingly some floating point conditions can't be tested with a 1349 // single condition code. Construct an additional comparison in this case. 1350 // See comment below on how we deal with OR conditions. 1351 if (ExtraCC != AArch64CC::AL) { 1352 SDValue ExtraCmp; 1353 if (!CCOp.getNode()) 1354 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); 1355 else { 1356 SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); 1357 // Note that we want the inverse of ExtraCC, so NZCV is not inverted. 1358 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); 1359 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, 1360 NZCV, DL, DAG); 1361 } 1362 CCOp = ExtraCmp; 1363 Predicate = AArch64CC::getInvertedCondCode(ExtraCC); 1364 OutCC = AArch64CC::getInvertedCondCode(OutCC); 1365 } 1366 } 1367 1368 // Produce a normal comparison if we are first in the chain. 1369 if (!CCOp.getNode()) 1370 return emitComparison(LHS, RHS, CC, DL, DAG); 1371 // Otherwise produce a ccmp. 1372 SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); 1373 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); 1374 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); 1375 return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, 1376 DAG); 1377 } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) 1378 return SDValue(); 1379 1380 assert((Opcode == ISD::OR || !PushNegate) 1381 && "Can only push negate through OR operation"); 1382 1383 // Check if both sides can be transformed. 1384 SDValue LHS = Val->getOperand(0); 1385 SDValue RHS = Val->getOperand(1); 1386 bool CanPushNegateL; 1387 if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) 1388 return SDValue(); 1389 bool CanPushNegateR; 1390 if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) 1391 return SDValue(); 1392 1393 // Do we need to negate our operands? 1394 bool NegateOperands = Opcode == ISD::OR; 1395 // We can negate the results of all previous operations by inverting the 1396 // predicate flags, giving us a free negation for one side. For the other side 1397 // we need to be able to push the negation to the leaves of the tree. 1398 if (NegateOperands) { 1399 if (!CanPushNegateL && !CanPushNegateR) 1400 return SDValue(); 1401 // Order the side where we can push the negate through to LHS. 1402 if (!CanPushNegateL && CanPushNegateR) 1403 std::swap(LHS, RHS); 1404 } else { 1405 bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; 1406 bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; 1407 if (NeedsNegOutL && NeedsNegOutR) 1408 return SDValue(); 1409 // Order the side where we need to negate the output flags to RHS so it 1410 // gets emitted first. 1411 if (NeedsNegOutL) 1412 std::swap(LHS, RHS); 1413 } 1414 1415 // Emit RHS.
If we want to negate the tree we only need to push a negate 1416 // through if we are already in a PushNegate case, otherwise we can negate 1417 // the "flags to test" afterwards. 1418 AArch64CC::CondCode RHSCC; 1419 SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, 1420 CCOp, Predicate, Depth+1); 1421 if (NegateOperands && !PushNegate) 1422 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 1423 // Emit LHS. We must push the negate through if we need to negate it. 1424 SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, 1425 CmpR, RHSCC, Depth+1); 1426 // If we transformed an OR to and AND then we have to negate the result 1427 // (or absorb a PushNegate resulting in a double negation). 1428 if (Opcode == ISD::OR && !PushNegate) 1429 OutCC = AArch64CC::getInvertedCondCode(OutCC); 1430 return CmpL; 1431 } 1432 1433 /// @} 1434 1435 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1436 SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { 1437 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1438 EVT VT = RHS.getValueType(); 1439 uint64_t C = RHSC->getZExtValue(); 1440 if (!isLegalArithImmed(C)) { 1441 // Constant does not fit, try adjusting it by one? 1442 switch (CC) { 1443 default: 1444 break; 1445 case ISD::SETLT: 1446 case ISD::SETGE: 1447 if ((VT == MVT::i32 && C != 0x80000000 && 1448 isLegalArithImmed((uint32_t)(C - 1))) || 1449 (VT == MVT::i64 && C != 0x80000000ULL && 1450 isLegalArithImmed(C - 1ULL))) { 1451 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1452 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1453 RHS = DAG.getConstant(C, dl, VT); 1454 } 1455 break; 1456 case ISD::SETULT: 1457 case ISD::SETUGE: 1458 if ((VT == MVT::i32 && C != 0 && 1459 isLegalArithImmed((uint32_t)(C - 1))) || 1460 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 1461 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1462 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1463 RHS = DAG.getConstant(C, dl, VT); 1464 } 1465 break; 1466 case ISD::SETLE: 1467 case ISD::SETGT: 1468 if ((VT == MVT::i32 && C != INT32_MAX && 1469 isLegalArithImmed((uint32_t)(C + 1))) || 1470 (VT == MVT::i64 && C != INT64_MAX && 1471 isLegalArithImmed(C + 1ULL))) { 1472 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1473 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1474 RHS = DAG.getConstant(C, dl, VT); 1475 } 1476 break; 1477 case ISD::SETULE: 1478 case ISD::SETUGT: 1479 if ((VT == MVT::i32 && C != UINT32_MAX && 1480 isLegalArithImmed((uint32_t)(C + 1))) || 1481 (VT == MVT::i64 && C != UINT64_MAX && 1482 isLegalArithImmed(C + 1ULL))) { 1483 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1484 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1485 RHS = DAG.getConstant(C, dl, VT); 1486 } 1487 break; 1488 } 1489 } 1490 } 1491 SDValue Cmp; 1492 AArch64CC::CondCode AArch64CC; 1493 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 1494 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); 1495 1496 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 1497 // For the i8 operand, the largest immediate is 255, so this can be easily 1498 // encoded in the compare instruction. For the i16 operand, however, the 1499 // largest immediate cannot be encoded in the compare. 1500 // Therefore, use a sign extending load and cmn to avoid materializing the 1501 // -1 constant. 
For example, 1502 // movz w1, #65535 1503 // ldrh w0, [x0, #0] 1504 // cmp w0, w1 1505 // > 1506 // ldrsh w0, [x0, #0] 1507 // cmn w0, #1 1508 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 1509 // if and only if (sext LHS) == (sext RHS). The checks are in place to 1510 // ensure both the LHS and RHS are truly zero extended and to make sure the 1511 // transformation is profitable. 1512 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 1513 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 1514 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 1515 LHS.getNode()->hasNUsesOfValue(1, 0)) { 1516 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 1517 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 1518 SDValue SExt = 1519 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 1520 DAG.getValueType(MVT::i16)); 1521 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 1522 RHS.getValueType()), 1523 CC, dl, DAG); 1524 AArch64CC = changeIntCCToAArch64CC(CC); 1525 } 1526 } 1527 1528 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { 1529 if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { 1530 if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) 1531 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 1532 } 1533 } 1534 } 1535 1536 if (!Cmp) { 1537 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 1538 AArch64CC = changeIntCCToAArch64CC(CC); 1539 } 1540 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 1541 return Cmp; 1542 } 1543 1544 static std::pair<SDValue, SDValue> 1545 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 1546 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 1547 "Unsupported value type"); 1548 SDValue Value, Overflow; 1549 SDLoc DL(Op); 1550 SDValue LHS = Op.getOperand(0); 1551 SDValue RHS = Op.getOperand(1); 1552 unsigned Opc = 0; 1553 switch (Op.getOpcode()) { 1554 default: 1555 llvm_unreachable("Unknown overflow instruction!"); 1556 case ISD::SADDO: 1557 Opc = AArch64ISD::ADDS; 1558 CC = AArch64CC::VS; 1559 break; 1560 case ISD::UADDO: 1561 Opc = AArch64ISD::ADDS; 1562 CC = AArch64CC::HS; 1563 break; 1564 case ISD::SSUBO: 1565 Opc = AArch64ISD::SUBS; 1566 CC = AArch64CC::VS; 1567 break; 1568 case ISD::USUBO: 1569 Opc = AArch64ISD::SUBS; 1570 CC = AArch64CC::LO; 1571 break; 1572 // Multiply needs a little bit extra work. 1573 case ISD::SMULO: 1574 case ISD::UMULO: { 1575 CC = AArch64CC::NE; 1576 bool IsSigned = Op.getOpcode() == ISD::SMULO; 1577 if (Op.getValueType() == MVT::i32) { 1578 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1579 // For a 32 bit multiply with overflow check we want the instruction 1580 // selector to generate a widening multiply (SMADDL/UMADDL). For that we 1581 // need to generate the following pattern: 1582 // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) 1583 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 1584 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 1585 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1586 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, 1587 DAG.getConstant(0, DL, MVT::i64)); 1588 // On AArch64 the upper 32 bits are always zero extended for a 32 bit 1589 // operation. We need to clear out the upper 32 bits, because we used a 1590 // widening multiply that wrote all 64 bits. In the end this should be a 1591 // noop. 
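// A sketch of the DAG built below for the signed i32 case (illustrative):
//   %wide  = add i64 (mul i64 (sext %a), (sext %b)), 0   // selected as SMADDL
//   %value = trunc i64 %wide to i32
//   %hi    = trunc i64 (srl %wide, 32) to i32
//   %sign  = sra i32 %value, 31
//   overflow = NZCV of (SUBS %hi, %sign), tested with NE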
1592 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); 1593 if (IsSigned) { 1594 // The signed overflow check requires more than just a simple check for 1595 // any bit set in the upper 32 bits of the result. These bits could be 1596 // just the sign bits of a negative number. To perform the overflow 1597 // check we have to arithmetic shift right the 32nd bit of the result by 1598 // 31 bits. Then we compare the result to the upper 32 bits. 1599 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, 1600 DAG.getConstant(32, DL, MVT::i64)); 1601 UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); 1602 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, 1603 DAG.getConstant(31, DL, MVT::i64)); 1604 // It is important that LowerBits is last, otherwise the arithmetic 1605 // shift will not be folded into the compare (SUBS). 1606 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); 1607 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1608 .getValue(1); 1609 } else { 1610 // The overflow check for unsigned multiply is easy. We only need to 1611 // check if any of the upper 32 bits are set. This can be done with a 1612 // CMP (shifted register). For that we need to generate the following 1613 // pattern: 1614 // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) 1615 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, 1616 DAG.getConstant(32, DL, MVT::i64)); 1617 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1618 Overflow = 1619 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 1620 DAG.getConstant(0, DL, MVT::i64), 1621 UpperBits).getValue(1); 1622 } 1623 break; 1624 } 1625 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 1626 // For the 64 bit multiply 1627 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1628 if (IsSigned) { 1629 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 1630 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 1631 DAG.getConstant(63, DL, MVT::i64)); 1632 // It is important that LowerBits is last, otherwise the arithmetic 1633 // shift will not be folded into the compare (SUBS). 1634 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1635 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1636 .getValue(1); 1637 } else { 1638 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 1639 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1640 Overflow = 1641 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 1642 DAG.getConstant(0, DL, MVT::i64), 1643 UpperBits).getValue(1); 1644 } 1645 break; 1646 } 1647 } // switch (...) 1648 1649 if (Opc) { 1650 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 1651 1652 // Emit the AArch64 operation with overflow check. 1653 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 1654 Overflow = Value.getValue(1); 1655 } 1656 return std::make_pair(Value, Overflow); 1657 } 1658 1659 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, 1660 RTLIB::Libcall Call) const { 1661 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 1662 return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; 1663 } 1664 1665 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { 1666 SDValue Sel = Op.getOperand(0); 1667 SDValue Other = Op.getOperand(1); 1668 1669 // If neither operand is a SELECT_CC, give up. 
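// (XOR is commutative, so first normalize the SELECT_CC operand, if any, into Sel.)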
1670 if (Sel.getOpcode() != ISD::SELECT_CC) 1671 std::swap(Sel, Other); 1672 if (Sel.getOpcode() != ISD::SELECT_CC) 1673 return Op; 1674 1675 // The folding we want to perform is: 1676 // (xor x, (select_cc a, b, cc, 0, -1) ) 1677 // --> 1678 // (csel x, (xor x, -1), cc ...) 1679 // 1680 // The latter will get matched to a CSINV instruction. 1681 1682 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 1683 SDValue LHS = Sel.getOperand(0); 1684 SDValue RHS = Sel.getOperand(1); 1685 SDValue TVal = Sel.getOperand(2); 1686 SDValue FVal = Sel.getOperand(3); 1687 SDLoc dl(Sel); 1688 1689 // FIXME: This could be generalized to non-integer comparisons. 1690 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 1691 return Op; 1692 1693 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 1694 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 1695 1696 // The values aren't constants, this isn't the pattern we're looking for. 1697 if (!CFVal || !CTVal) 1698 return Op; 1699 1700 // We can commute the SELECT_CC by inverting the condition. This 1701 // might be needed to make this fit into a CSINV pattern. 1702 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { 1703 std::swap(TVal, FVal); 1704 std::swap(CTVal, CFVal); 1705 CC = ISD::getSetCCInverse(CC, true); 1706 } 1707 1708 // If the constants line up, perform the transform! 1709 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { 1710 SDValue CCVal; 1711 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 1712 1713 FVal = Other; 1714 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 1715 DAG.getConstant(-1ULL, dl, Other.getValueType())); 1716 1717 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 1718 CCVal, Cmp); 1719 } 1720 1721 return Op; 1722 } 1723 1724 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 1725 EVT VT = Op.getValueType(); 1726 1727 // Let legalize expand this if it isn't a legal type yet. 1728 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 1729 return SDValue(); 1730 1731 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 1732 1733 unsigned Opc; 1734 bool ExtraOp = false; 1735 switch (Op.getOpcode()) { 1736 default: 1737 llvm_unreachable("Invalid code"); 1738 case ISD::ADDC: 1739 Opc = AArch64ISD::ADDS; 1740 break; 1741 case ISD::SUBC: 1742 Opc = AArch64ISD::SUBS; 1743 break; 1744 case ISD::ADDE: 1745 Opc = AArch64ISD::ADCS; 1746 ExtraOp = true; 1747 break; 1748 case ISD::SUBE: 1749 Opc = AArch64ISD::SBCS; 1750 ExtraOp = true; 1751 break; 1752 } 1753 1754 if (!ExtraOp) 1755 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); 1756 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), 1757 Op.getOperand(2)); 1758 } 1759 1760 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 1761 // Let legalize expand this if it isn't a legal type yet. 1762 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 1763 return SDValue(); 1764 1765 SDLoc dl(Op); 1766 AArch64CC::CondCode CC; 1767 // The actual operation that sets the overflow or carry flag. 1768 SDValue Value, Overflow; 1769 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 1770 1771 // We use 0 and 1 as false and true values. 1772 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 1773 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 1774 1775 // We use an inverted condition, because the conditional select is inverted 1776 // too. 
This will allow it to be selected to a single instruction: 1777 // CSINC Wd, WZR, WZR, invert(cond). 1778 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 1779 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 1780 CCVal, Overflow); 1781 1782 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 1783 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 1784 } 1785 1786 // Prefetch operands are: 1787 // 1: Address to prefetch 1788 // 2: bool isWrite 1789 // 3: int locality (0 = no locality ... 3 = extreme locality) 1790 // 4: bool isDataCache 1791 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 1792 SDLoc DL(Op); 1793 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 1794 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 1795 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 1796 1797 bool IsStream = !Locality; 1798 // When the locality number is set 1799 if (Locality) { 1800 // The front-end should have filtered out the out-of-range values 1801 assert(Locality <= 3 && "Prefetch locality out-of-range"); 1802 // The locality degree is the opposite of the cache speed. 1803 // Put the number the other way around. 1804 // The encoding starts at 0 for level 1 1805 Locality = 3 - Locality; 1806 } 1807 1808 // built the mask value encoding the expected behavior. 1809 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 1810 (!IsData << 3) | // IsDataCache bit 1811 (Locality << 1) | // Cache level bits 1812 (unsigned)IsStream; // Stream bit 1813 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 1814 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); 1815 } 1816 1817 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 1818 SelectionDAG &DAG) const { 1819 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1820 1821 RTLIB::Libcall LC; 1822 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1823 1824 return LowerF128Call(Op, DAG, LC); 1825 } 1826 1827 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 1828 SelectionDAG &DAG) const { 1829 if (Op.getOperand(0).getValueType() != MVT::f128) { 1830 // It's legal except when f128 is involved 1831 return Op; 1832 } 1833 1834 RTLIB::Libcall LC; 1835 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1836 1837 // FP_ROUND node has a second operand indicating whether it is known to be 1838 // precise. That doesn't take part in the LibCall so we can't directly use 1839 // LowerF128Call. 1840 SDValue SrcVal = Op.getOperand(0); 1841 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 1842 SDLoc(Op)).first; 1843 } 1844 1845 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 1846 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1847 // Any additional optimization in this function should be recorded 1848 // in the cost tables. 1849 EVT InVT = Op.getOperand(0).getValueType(); 1850 EVT VT = Op.getValueType(); 1851 unsigned NumElts = InVT.getVectorNumElements(); 1852 1853 // f16 vectors are promoted to f32 before a conversion. 
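// e.g. (v4i16 (fp_to_sint (v4f16 %x)))
//        ==> (v4i16 (fp_to_sint (v4f32 (fp_extend %x)))) (illustrative)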
1854 if (InVT.getVectorElementType() == MVT::f16) { 1855 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); 1856 SDLoc dl(Op); 1857 return DAG.getNode( 1858 Op.getOpcode(), dl, Op.getValueType(), 1859 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); 1860 } 1861 1862 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1863 SDLoc dl(Op); 1864 SDValue Cv = 1865 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 1866 Op.getOperand(0)); 1867 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 1868 } 1869 1870 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1871 SDLoc dl(Op); 1872 MVT ExtVT = 1873 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 1874 VT.getVectorNumElements()); 1875 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 1876 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 1877 } 1878 1879 // Type changing conversions are illegal. 1880 return Op; 1881 } 1882 1883 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 1884 SelectionDAG &DAG) const { 1885 if (Op.getOperand(0).getValueType().isVector()) 1886 return LowerVectorFP_TO_INT(Op, DAG); 1887 1888 // f16 conversions are promoted to f32. 1889 if (Op.getOperand(0).getValueType() == MVT::f16) { 1890 SDLoc dl(Op); 1891 return DAG.getNode( 1892 Op.getOpcode(), dl, Op.getValueType(), 1893 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); 1894 } 1895 1896 if (Op.getOperand(0).getValueType() != MVT::f128) { 1897 // It's legal except when f128 is involved 1898 return Op; 1899 } 1900 1901 RTLIB::Libcall LC; 1902 if (Op.getOpcode() == ISD::FP_TO_SINT) 1903 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1904 else 1905 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1906 1907 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 1908 return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; 1909 } 1910 1911 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 1912 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1913 // Any additional optimization in this function should be recorded 1914 // in the cost tables. 1915 EVT VT = Op.getValueType(); 1916 SDLoc dl(Op); 1917 SDValue In = Op.getOperand(0); 1918 EVT InVT = In.getValueType(); 1919 1920 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1921 MVT CastVT = 1922 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 1923 InVT.getVectorNumElements()); 1924 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); 1925 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); 1926 } 1927 1928 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1929 unsigned CastOpc = 1930 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1931 EVT CastVT = VT.changeVectorElementTypeToInteger(); 1932 In = DAG.getNode(CastOpc, dl, CastVT, In); 1933 return DAG.getNode(Op.getOpcode(), dl, VT, In); 1934 } 1935 1936 return Op; 1937 } 1938 1939 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 1940 SelectionDAG &DAG) const { 1941 if (Op.getValueType().isVector()) 1942 return LowerVectorINT_TO_FP(Op, DAG); 1943 1944 // f16 conversions are promoted to f32. 1945 if (Op.getValueType() == MVT::f16) { 1946 SDLoc dl(Op); 1947 return DAG.getNode( 1948 ISD::FP_ROUND, dl, MVT::f16, 1949 DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), 1950 DAG.getIntPtrConstant(0, dl)); 1951 } 1952 1953 // i128 conversions are libcalls. 
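// e.g. (f64 (sint_to_fp i128 %x)) ends up as a call to a runtime routine such
// as __floattidf rather than being lowered here (illustrative).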
1954 if (Op.getOperand(0).getValueType() == MVT::i128) 1955 return SDValue(); 1956 1957 // Other conversions are legal, unless it's to the completely software-based 1958 // fp128. 1959 if (Op.getValueType() != MVT::f128) 1960 return Op; 1961 1962 RTLIB::Libcall LC; 1963 if (Op.getOpcode() == ISD::SINT_TO_FP) 1964 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 1965 else 1966 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 1967 1968 return LowerF128Call(Op, DAG, LC); 1969 } 1970 1971 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 1972 SelectionDAG &DAG) const { 1973 // For iOS, we want to call an alternative entry point: __sincos_stret, 1974 // which returns the values in two S / D registers. 1975 SDLoc dl(Op); 1976 SDValue Arg = Op.getOperand(0); 1977 EVT ArgVT = Arg.getValueType(); 1978 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 1979 1980 ArgListTy Args; 1981 ArgListEntry Entry; 1982 1983 Entry.Node = Arg; 1984 Entry.Ty = ArgTy; 1985 Entry.isSExt = false; 1986 Entry.isZExt = false; 1987 Args.push_back(Entry); 1988 1989 const char *LibcallName = 1990 (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; 1991 SDValue Callee = 1992 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); 1993 1994 StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 1995 TargetLowering::CallLoweringInfo CLI(DAG); 1996 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 1997 .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); 1998 1999 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2000 return CallResult.first; 2001 } 2002 2003 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 2004 if (Op.getValueType() != MVT::f16) 2005 return SDValue(); 2006 2007 assert(Op.getOperand(0).getValueType() == MVT::i16); 2008 SDLoc DL(Op); 2009 2010 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 2011 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 2012 return SDValue( 2013 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, 2014 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 2015 0); 2016 } 2017 2018 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 2019 if (OrigVT.getSizeInBits() >= 64) 2020 return OrigVT; 2021 2022 assert(OrigVT.isSimple() && "Expecting a simple value type"); 2023 2024 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 2025 switch (OrigSimpleTy) { 2026 default: llvm_unreachable("Unexpected Vector Type"); 2027 case MVT::v2i8: 2028 case MVT::v2i16: 2029 return MVT::v2i32; 2030 case MVT::v4i8: 2031 return MVT::v4i16; 2032 } 2033 } 2034 2035 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 2036 const EVT &OrigTy, 2037 const EVT &ExtTy, 2038 unsigned ExtOpcode) { 2039 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 2040 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 2041 // 64-bits we need to insert a new extension so that it will be 64-bits. 2042 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 2043 if (OrigTy.getSizeInBits() >= 64) 2044 return N; 2045 2046 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
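// e.g. a v4i8 value that was originally sign-extended to v4i32 is re-extended
// to v4i16 here, giving the [SU]MULL pattern a 64-bit source vector
// (illustrative).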
2047 EVT NewVT = getExtensionTo64Bits(OrigTy); 2048 2049 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 2050 } 2051 2052 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 2053 bool isSigned) { 2054 EVT VT = N->getValueType(0); 2055 2056 if (N->getOpcode() != ISD::BUILD_VECTOR) 2057 return false; 2058 2059 for (const SDValue &Elt : N->op_values()) { 2060 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 2061 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 2062 unsigned HalfSize = EltSize / 2; 2063 if (isSigned) { 2064 if (!isIntN(HalfSize, C->getSExtValue())) 2065 return false; 2066 } else { 2067 if (!isUIntN(HalfSize, C->getZExtValue())) 2068 return false; 2069 } 2070 continue; 2071 } 2072 return false; 2073 } 2074 2075 return true; 2076 } 2077 2078 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 2079 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 2080 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 2081 N->getOperand(0)->getValueType(0), 2082 N->getValueType(0), 2083 N->getOpcode()); 2084 2085 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 2086 EVT VT = N->getValueType(0); 2087 SDLoc dl(N); 2088 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 2089 unsigned NumElts = VT.getVectorNumElements(); 2090 MVT TruncVT = MVT::getIntegerVT(EltSize); 2091 SmallVector<SDValue, 8> Ops; 2092 for (unsigned i = 0; i != NumElts; ++i) { 2093 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 2094 const APInt &CInt = C->getAPIntValue(); 2095 // Element types smaller than 32 bits are not legal, so use i32 elements. 2096 // The values are implicitly truncated so sext vs. zext doesn't matter. 2097 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 2098 } 2099 return DAG.getNode(ISD::BUILD_VECTOR, dl, 2100 MVT::getVectorVT(TruncVT, NumElts), Ops); 2101 } 2102 2103 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 2104 if (N->getOpcode() == ISD::SIGN_EXTEND) 2105 return true; 2106 if (isExtendedBUILD_VECTOR(N, DAG, true)) 2107 return true; 2108 return false; 2109 } 2110 2111 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 2112 if (N->getOpcode() == ISD::ZERO_EXTEND) 2113 return true; 2114 if (isExtendedBUILD_VECTOR(N, DAG, false)) 2115 return true; 2116 return false; 2117 } 2118 2119 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 2120 unsigned Opcode = N->getOpcode(); 2121 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2122 SDNode *N0 = N->getOperand(0).getNode(); 2123 SDNode *N1 = N->getOperand(1).getNode(); 2124 return N0->hasOneUse() && N1->hasOneUse() && 2125 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 2126 } 2127 return false; 2128 } 2129 2130 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 2131 unsigned Opcode = N->getOpcode(); 2132 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2133 SDNode *N0 = N->getOperand(0).getNode(); 2134 SDNode *N1 = N->getOperand(1).getNode(); 2135 return N0->hasOneUse() && N1->hasOneUse() && 2136 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 2137 } 2138 return false; 2139 } 2140 2141 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 2142 // Multiplications are only custom-lowered for 128-bit vectors so that 2143 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
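// e.g. (v2i64 (mul (sext v2i32 %a), (sext v2i32 %b))) is matched below to a
// single SMULL node instead of being expanded (illustrative).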
2144 EVT VT = Op.getValueType(); 2145 assert(VT.is128BitVector() && VT.isInteger() && 2146 "unexpected type for custom-lowering ISD::MUL"); 2147 SDNode *N0 = Op.getOperand(0).getNode(); 2148 SDNode *N1 = Op.getOperand(1).getNode(); 2149 unsigned NewOpc = 0; 2150 bool isMLA = false; 2151 bool isN0SExt = isSignExtended(N0, DAG); 2152 bool isN1SExt = isSignExtended(N1, DAG); 2153 if (isN0SExt && isN1SExt) 2154 NewOpc = AArch64ISD::SMULL; 2155 else { 2156 bool isN0ZExt = isZeroExtended(N0, DAG); 2157 bool isN1ZExt = isZeroExtended(N1, DAG); 2158 if (isN0ZExt && isN1ZExt) 2159 NewOpc = AArch64ISD::UMULL; 2160 else if (isN1SExt || isN1ZExt) { 2161 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 2162 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 2163 if (isN1SExt && isAddSubSExt(N0, DAG)) { 2164 NewOpc = AArch64ISD::SMULL; 2165 isMLA = true; 2166 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 2167 NewOpc = AArch64ISD::UMULL; 2168 isMLA = true; 2169 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 2170 std::swap(N0, N1); 2171 NewOpc = AArch64ISD::UMULL; 2172 isMLA = true; 2173 } 2174 } 2175 2176 if (!NewOpc) { 2177 if (VT == MVT::v2i64) 2178 // Fall through to expand this. It is not legal. 2179 return SDValue(); 2180 else 2181 // Other vector multiplications are legal. 2182 return Op; 2183 } 2184 } 2185 2186 // Legalize to a S/UMULL instruction 2187 SDLoc DL(Op); 2188 SDValue Op0; 2189 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 2190 if (!isMLA) { 2191 Op0 = skipExtensionForVectorMULL(N0, DAG); 2192 assert(Op0.getValueType().is64BitVector() && 2193 Op1.getValueType().is64BitVector() && 2194 "unexpected types for extended operands to VMULL"); 2195 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 2196 } 2197 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 2198 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 2199 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 2200 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 2201 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 2202 EVT Op1VT = Op1.getValueType(); 2203 return DAG.getNode(N0->getOpcode(), DL, VT, 2204 DAG.getNode(NewOpc, DL, VT, 2205 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 2206 DAG.getNode(NewOpc, DL, VT, 2207 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 2208 } 2209 2210 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 2211 SelectionDAG &DAG) const { 2212 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2213 SDLoc dl(Op); 2214 switch (IntNo) { 2215 default: return SDValue(); // Don't custom lower most intrinsics. 
2216 case Intrinsic::aarch64_thread_pointer: { 2217 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2218 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); 2219 } 2220 case Intrinsic::aarch64_neon_smax: 2221 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), 2222 Op.getOperand(1), Op.getOperand(2)); 2223 case Intrinsic::aarch64_neon_umax: 2224 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), 2225 Op.getOperand(1), Op.getOperand(2)); 2226 case Intrinsic::aarch64_neon_smin: 2227 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), 2228 Op.getOperand(1), Op.getOperand(2)); 2229 case Intrinsic::aarch64_neon_umin: 2230 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), 2231 Op.getOperand(1), Op.getOperand(2)); 2232 } 2233 } 2234 2235 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 2236 SelectionDAG &DAG) const { 2237 switch (Op.getOpcode()) { 2238 default: 2239 llvm_unreachable("unimplemented operand"); 2240 return SDValue(); 2241 case ISD::BITCAST: 2242 return LowerBITCAST(Op, DAG); 2243 case ISD::GlobalAddress: 2244 return LowerGlobalAddress(Op, DAG); 2245 case ISD::GlobalTLSAddress: 2246 return LowerGlobalTLSAddress(Op, DAG); 2247 case ISD::SETCC: 2248 return LowerSETCC(Op, DAG); 2249 case ISD::BR_CC: 2250 return LowerBR_CC(Op, DAG); 2251 case ISD::SELECT: 2252 return LowerSELECT(Op, DAG); 2253 case ISD::SELECT_CC: 2254 return LowerSELECT_CC(Op, DAG); 2255 case ISD::JumpTable: 2256 return LowerJumpTable(Op, DAG); 2257 case ISD::ConstantPool: 2258 return LowerConstantPool(Op, DAG); 2259 case ISD::BlockAddress: 2260 return LowerBlockAddress(Op, DAG); 2261 case ISD::VASTART: 2262 return LowerVASTART(Op, DAG); 2263 case ISD::VACOPY: 2264 return LowerVACOPY(Op, DAG); 2265 case ISD::VAARG: 2266 return LowerVAARG(Op, DAG); 2267 case ISD::ADDC: 2268 case ISD::ADDE: 2269 case ISD::SUBC: 2270 case ISD::SUBE: 2271 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 2272 case ISD::SADDO: 2273 case ISD::UADDO: 2274 case ISD::SSUBO: 2275 case ISD::USUBO: 2276 case ISD::SMULO: 2277 case ISD::UMULO: 2278 return LowerXALUO(Op, DAG); 2279 case ISD::FADD: 2280 return LowerF128Call(Op, DAG, RTLIB::ADD_F128); 2281 case ISD::FSUB: 2282 return LowerF128Call(Op, DAG, RTLIB::SUB_F128); 2283 case ISD::FMUL: 2284 return LowerF128Call(Op, DAG, RTLIB::MUL_F128); 2285 case ISD::FDIV: 2286 return LowerF128Call(Op, DAG, RTLIB::DIV_F128); 2287 case ISD::FP_ROUND: 2288 return LowerFP_ROUND(Op, DAG); 2289 case ISD::FP_EXTEND: 2290 return LowerFP_EXTEND(Op, DAG); 2291 case ISD::FRAMEADDR: 2292 return LowerFRAMEADDR(Op, DAG); 2293 case ISD::RETURNADDR: 2294 return LowerRETURNADDR(Op, DAG); 2295 case ISD::INSERT_VECTOR_ELT: 2296 return LowerINSERT_VECTOR_ELT(Op, DAG); 2297 case ISD::EXTRACT_VECTOR_ELT: 2298 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2299 case ISD::BUILD_VECTOR: 2300 return LowerBUILD_VECTOR(Op, DAG); 2301 case ISD::VECTOR_SHUFFLE: 2302 return LowerVECTOR_SHUFFLE(Op, DAG); 2303 case ISD::EXTRACT_SUBVECTOR: 2304 return LowerEXTRACT_SUBVECTOR(Op, DAG); 2305 case ISD::SRA: 2306 case ISD::SRL: 2307 case ISD::SHL: 2308 return LowerVectorSRA_SRL_SHL(Op, DAG); 2309 case ISD::SHL_PARTS: 2310 return LowerShiftLeftParts(Op, DAG); 2311 case ISD::SRL_PARTS: 2312 case ISD::SRA_PARTS: 2313 return LowerShiftRightParts(Op, DAG); 2314 case ISD::CTPOP: 2315 return LowerCTPOP(Op, DAG); 2316 case ISD::FCOPYSIGN: 2317 return LowerFCOPYSIGN(Op, DAG); 2318 case ISD::AND: 2319 return LowerVectorAND(Op, DAG); 2320 case ISD::OR: 2321 return LowerVectorOR(Op, DAG); 2322 case ISD::XOR: 2323 return LowerXOR(Op, DAG); 2324 case 
ISD::PREFETCH: 2325 return LowerPREFETCH(Op, DAG); 2326 case ISD::SINT_TO_FP: 2327 case ISD::UINT_TO_FP: 2328 return LowerINT_TO_FP(Op, DAG); 2329 case ISD::FP_TO_SINT: 2330 case ISD::FP_TO_UINT: 2331 return LowerFP_TO_INT(Op, DAG); 2332 case ISD::FSINCOS: 2333 return LowerFSINCOS(Op, DAG); 2334 case ISD::MUL: 2335 return LowerMUL(Op, DAG); 2336 case ISD::INTRINSIC_WO_CHAIN: 2337 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 2338 } 2339 } 2340 2341 //===----------------------------------------------------------------------===// 2342 // Calling Convention Implementation 2343 //===----------------------------------------------------------------------===// 2344 2345 #include "AArch64GenCallingConv.inc" 2346 2347 /// Selects the correct CCAssignFn for a given CallingConvention value. 2348 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 2349 bool IsVarArg) const { 2350 switch (CC) { 2351 default: 2352 llvm_unreachable("Unsupported calling convention."); 2353 case CallingConv::WebKit_JS: 2354 return CC_AArch64_WebKit_JS; 2355 case CallingConv::GHC: 2356 return CC_AArch64_GHC; 2357 case CallingConv::C: 2358 case CallingConv::Fast: 2359 if (!Subtarget->isTargetDarwin()) 2360 return CC_AArch64_AAPCS; 2361 return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; 2362 } 2363 } 2364 2365 SDValue AArch64TargetLowering::LowerFormalArguments( 2366 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2367 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 2368 SmallVectorImpl<SDValue> &InVals) const { 2369 MachineFunction &MF = DAG.getMachineFunction(); 2370 MachineFrameInfo *MFI = MF.getFrameInfo(); 2371 2372 // Assign locations to all of the incoming arguments. 2373 SmallVector<CCValAssign, 16> ArgLocs; 2374 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2375 *DAG.getContext()); 2376 2377 // At this point, Ins[].VT may already be promoted to i32. To correctly 2378 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2379 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2380 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 2381 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 2382 // LocVT. 2383 unsigned NumArgs = Ins.size(); 2384 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 2385 unsigned CurArgIdx = 0; 2386 for (unsigned i = 0; i != NumArgs; ++i) { 2387 MVT ValVT = Ins[i].VT; 2388 if (Ins[i].isOrigArg()) { 2389 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); 2390 CurArgIdx = Ins[i].getOrigArgIndex(); 2391 2392 // Get type of the original argument. 2393 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), 2394 /*AllowUnknown*/ true); 2395 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; 2396 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
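// e.g. an i8 argument that reaches here promoted to i32 in Ins[i].VT is still
// given an i8 ValVT, so (per the comment above) it is passed as an i8 rather
// than an i32 on the stack.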
2397 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 2398 ValVT = MVT::i8; 2399 else if (ActualMVT == MVT::i16) 2400 ValVT = MVT::i16; 2401 } 2402 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2403 bool Res = 2404 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 2405 assert(!Res && "Call operand has unhandled type"); 2406 (void)Res; 2407 } 2408 assert(ArgLocs.size() == Ins.size()); 2409 SmallVector<SDValue, 16> ArgValues; 2410 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2411 CCValAssign &VA = ArgLocs[i]; 2412 2413 if (Ins[i].Flags.isByVal()) { 2414 // Byval is used for HFAs in the PCS, but the system should work in a 2415 // non-compliant manner for larger structs. 2416 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2417 int Size = Ins[i].Flags.getByValSize(); 2418 unsigned NumRegs = (Size + 7) / 8; 2419 2420 // FIXME: This works on big-endian for composite byvals, which are the common 2421 // case. It should also work for fundamental types too. 2422 unsigned FrameIdx = 2423 MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 2424 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); 2425 InVals.push_back(FrameIdxN); 2426 2427 continue; 2428 } 2429 2430 if (VA.isRegLoc()) { 2431 // Arguments stored in registers. 2432 EVT RegVT = VA.getLocVT(); 2433 2434 SDValue ArgValue; 2435 const TargetRegisterClass *RC; 2436 2437 if (RegVT == MVT::i32) 2438 RC = &AArch64::GPR32RegClass; 2439 else if (RegVT == MVT::i64) 2440 RC = &AArch64::GPR64RegClass; 2441 else if (RegVT == MVT::f16) 2442 RC = &AArch64::FPR16RegClass; 2443 else if (RegVT == MVT::f32) 2444 RC = &AArch64::FPR32RegClass; 2445 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 2446 RC = &AArch64::FPR64RegClass; 2447 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 2448 RC = &AArch64::FPR128RegClass; 2449 else 2450 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 2451 2452 // Transform the arguments in physical registers into virtual ones. 2453 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2454 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 2455 2456 // If this is an 8, 16 or 32-bit value, it is really passed promoted 2457 // to 64 bits. Insert an assert[sz]ext to capture this, then 2458 // truncate to the right size. 2459 switch (VA.getLocInfo()) { 2460 default: 2461 llvm_unreachable("Unknown loc info!"); 2462 case CCValAssign::Full: 2463 break; 2464 case CCValAssign::BCvt: 2465 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 2466 break; 2467 case CCValAssign::AExt: 2468 case CCValAssign::SExt: 2469 case CCValAssign::ZExt: 2470 // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt 2471 // nodes after our lowering. 2472 assert(RegVT == Ins[i].VT && "incorrect register location selected"); 2473 break; 2474 } 2475 2476 InVals.push_back(ArgValue); 2477 2478 } else { // VA.isRegLoc() 2479 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 2480 unsigned ArgOffset = VA.getLocMemOffset(); 2481 unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; 2482 2483 uint32_t BEAlign = 0; 2484 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 2485 !Ins[i].Flags.isInConsecutiveRegs()) 2486 BEAlign = 8 - ArgSize; 2487 2488 int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 2489 2490 // Create load nodes to retrieve arguments from the stack. 
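// e.g. a sign-extended i8 stack argument is loaded below as
// (i32 (sextload<i8> FI)) rather than with a plain non-extending load
// (illustrative).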
2491 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 2492 SDValue ArgValue; 2493 2494 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 2495 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 2496 MVT MemVT = VA.getValVT(); 2497 2498 switch (VA.getLocInfo()) { 2499 default: 2500 break; 2501 case CCValAssign::BCvt: 2502 MemVT = VA.getLocVT(); 2503 break; 2504 case CCValAssign::SExt: 2505 ExtType = ISD::SEXTLOAD; 2506 break; 2507 case CCValAssign::ZExt: 2508 ExtType = ISD::ZEXTLOAD; 2509 break; 2510 case CCValAssign::AExt: 2511 ExtType = ISD::EXTLOAD; 2512 break; 2513 } 2514 2515 ArgValue = DAG.getExtLoad( 2516 ExtType, DL, VA.getLocVT(), Chain, FIN, 2517 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 2518 MemVT, false, false, false, 0); 2519 2520 InVals.push_back(ArgValue); 2521 } 2522 } 2523 2524 // varargs 2525 if (isVarArg) { 2526 if (!Subtarget->isTargetDarwin()) { 2527 // The AAPCS variadic function ABI is identical to the non-variadic 2528 // one. As a result there may be more arguments in registers and we should 2529 // save them for future reference. 2530 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 2531 } 2532 2533 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 2534 // This will point to the next argument passed via stack. 2535 unsigned StackOffset = CCInfo.getNextStackOffset(); 2536 // We currently pass all varargs at 8-byte alignment. 2537 StackOffset = ((StackOffset + 7) & ~7); 2538 AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); 2539 } 2540 2541 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2542 unsigned StackArgSize = CCInfo.getNextStackOffset(); 2543 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2544 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 2545 // This is a non-standard ABI so by fiat I say we're allowed to make full 2546 // use of the stack area to be popped, which must be aligned to 16 bytes in 2547 // any case: 2548 StackArgSize = RoundUpToAlignment(StackArgSize, 16); 2549 2550 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 2551 // a multiple of 16. 2552 FuncInfo->setArgumentStackToRestore(StackArgSize); 2553 2554 // This realignment carries over to the available bytes below. Our own 2555 // callers will guarantee the space is free by giving an aligned value to 2556 // CALLSEQ_START. 2557 } 2558 // Even if we're not expected to free up the space, it's useful to know how 2559 // much is there while considering tail calls (because we can reuse it). 
2560 FuncInfo->setBytesInStackArgArea(StackArgSize); 2561 2562 return Chain; 2563 } 2564 2565 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 2566 SelectionDAG &DAG, SDLoc DL, 2567 SDValue &Chain) const { 2568 MachineFunction &MF = DAG.getMachineFunction(); 2569 MachineFrameInfo *MFI = MF.getFrameInfo(); 2570 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2571 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2572 2573 SmallVector<SDValue, 8> MemOps; 2574 2575 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 2576 AArch64::X3, AArch64::X4, AArch64::X5, 2577 AArch64::X6, AArch64::X7 }; 2578 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); 2579 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); 2580 2581 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 2582 int GPRIdx = 0; 2583 if (GPRSaveSize != 0) { 2584 GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); 2585 2586 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); 2587 2588 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 2589 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 2590 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 2591 SDValue Store = DAG.getStore( 2592 Val.getValue(1), DL, Val, FIN, 2593 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, 2594 false, 0); 2595 MemOps.push_back(Store); 2596 FIN = 2597 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); 2598 } 2599 } 2600 FuncInfo->setVarArgsGPRIndex(GPRIdx); 2601 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 2602 2603 if (Subtarget->hasFPARMv8()) { 2604 static const MCPhysReg FPRArgRegs[] = { 2605 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 2606 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 2607 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); 2608 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); 2609 2610 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 2611 int FPRIdx = 0; 2612 if (FPRSaveSize != 0) { 2613 FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); 2614 2615 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); 2616 2617 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 2618 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 2619 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 2620 2621 SDValue Store = DAG.getStore( 2622 Val.getValue(1), DL, Val, FIN, 2623 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), 2624 false, false, 0); 2625 MemOps.push_back(Store); 2626 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, 2627 DAG.getConstant(16, DL, PtrVT)); 2628 } 2629 } 2630 FuncInfo->setVarArgsFPRIndex(FPRIdx); 2631 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 2632 } 2633 2634 if (!MemOps.empty()) { 2635 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 2636 } 2637 } 2638 2639 /// LowerCallResult - Lower the result values of a call into the 2640 /// appropriate copies out of appropriate physical registers. 2641 SDValue AArch64TargetLowering::LowerCallResult( 2642 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2643 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 2644 SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2645 SDValue ThisVal) const { 2646 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 2647 ? 
RetCC_AArch64_WebKit_JS 2648 : RetCC_AArch64_AAPCS; 2649 // Assign locations to each value returned by this call. 2650 SmallVector<CCValAssign, 16> RVLocs; 2651 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2652 *DAG.getContext()); 2653 CCInfo.AnalyzeCallResult(Ins, RetCC); 2654 2655 // Copy all of the result registers out of their specified physreg. 2656 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2657 CCValAssign VA = RVLocs[i]; 2658 2659 // Pass 'this' value directly from the argument to return value, to avoid 2660 // reg unit interference 2661 if (i == 0 && isThisReturn) { 2662 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 2663 "unexpected return calling convention register assignment"); 2664 InVals.push_back(ThisVal); 2665 continue; 2666 } 2667 2668 SDValue Val = 2669 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 2670 Chain = Val.getValue(1); 2671 InFlag = Val.getValue(2); 2672 2673 switch (VA.getLocInfo()) { 2674 default: 2675 llvm_unreachable("Unknown loc info!"); 2676 case CCValAssign::Full: 2677 break; 2678 case CCValAssign::BCvt: 2679 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 2680 break; 2681 } 2682 2683 InVals.push_back(Val); 2684 } 2685 2686 return Chain; 2687 } 2688 2689 bool AArch64TargetLowering::isEligibleForTailCallOptimization( 2690 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2691 bool isCalleeStructRet, bool isCallerStructRet, 2692 const SmallVectorImpl<ISD::OutputArg> &Outs, 2693 const SmallVectorImpl<SDValue> &OutVals, 2694 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 2695 // For CallingConv::C this function knows whether the ABI needs 2696 // changing. That's not true for other conventions so they will have to opt in 2697 // manually. 2698 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 2699 return false; 2700 2701 const MachineFunction &MF = DAG.getMachineFunction(); 2702 const Function *CallerF = MF.getFunction(); 2703 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2704 bool CCMatch = CallerCC == CalleeCC; 2705 2706 // Byval parameters hand the function a pointer directly into the stack area 2707 // we want to reuse during a tail call. Working around this *is* possible (see 2708 // X86) but less efficient and uglier in LowerCall. 2709 for (Function::const_arg_iterator i = CallerF->arg_begin(), 2710 e = CallerF->arg_end(); 2711 i != e; ++i) 2712 if (i->hasByValAttr()) 2713 return false; 2714 2715 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2716 if (IsTailCallConvention(CalleeCC) && CCMatch) 2717 return true; 2718 return false; 2719 } 2720 2721 // Externally-defined functions with weak linkage should not be 2722 // tail-called on AArch64 when the OS does not support dynamic 2723 // pre-emption of symbols, as the AAELF spec requires normal calls 2724 // to undefined weak functions to be replaced with a NOP or jump to the 2725 // next instruction. The behaviour of branch instructions in this 2726 // situation (as used for tail calls) is implementation-defined, so we 2727 // cannot rely on the linker replacing the tail call with a return. 
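// e.g. a call to an __attribute__((weak)) function that is undefined at link
// time must stay a normal "bl": the linker may turn it into a NOP, and
// execution must then fall through to the code after the call rather than off
// the end of the function.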
2728 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2729 const GlobalValue *GV = G->getGlobal(); 2730 const Triple &TT = getTargetMachine().getTargetTriple(); 2731 if (GV->hasExternalWeakLinkage() && 2732 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2733 return false; 2734 } 2735 2736 // Now we search for cases where we can use a tail call without changing the 2737 // ABI. Sibcall is used in some places (particularly gcc) to refer to this 2738 // concept. 2739 2740 // I want anyone implementing a new calling convention to think long and hard 2741 // about this assert. 2742 assert((!isVarArg || CalleeCC == CallingConv::C) && 2743 "Unexpected variadic calling convention"); 2744 2745 if (isVarArg && !Outs.empty()) { 2746 // At least two cases here: if caller is fastcc then we can't have any 2747 // memory arguments (we'd be expected to clean up the stack afterwards). If 2748 // caller is C then we could potentially use its argument area. 2749 2750 // FIXME: for now we take the most conservative of these in both cases: 2751 // disallow all variadic memory operands. 2752 SmallVector<CCValAssign, 16> ArgLocs; 2753 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 2754 *DAG.getContext()); 2755 2756 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); 2757 for (const CCValAssign &ArgLoc : ArgLocs) 2758 if (!ArgLoc.isRegLoc()) 2759 return false; 2760 } 2761 2762 // If the calling conventions do not match, then we'd better make sure the 2763 // results are returned in the same way as what the caller expects. 2764 if (!CCMatch) { 2765 SmallVector<CCValAssign, 16> RVLocs1; 2766 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, 2767 *DAG.getContext()); 2768 CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); 2769 2770 SmallVector<CCValAssign, 16> RVLocs2; 2771 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, 2772 *DAG.getContext()); 2773 CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); 2774 2775 if (RVLocs1.size() != RVLocs2.size()) 2776 return false; 2777 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2778 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2779 return false; 2780 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2781 return false; 2782 if (RVLocs1[i].isRegLoc()) { 2783 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2784 return false; 2785 } else { 2786 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2787 return false; 2788 } 2789 } 2790 } 2791 2792 // Nothing more to check if the callee is taking no arguments 2793 if (Outs.empty()) 2794 return true; 2795 2796 SmallVector<CCValAssign, 16> ArgLocs; 2797 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 2798 *DAG.getContext()); 2799 2800 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2801 2802 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2803 2804 // If the stack arguments for this call would fit into our own save area then 2805 // the call can be made tail. 
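// e.g. a callee needing 16 bytes of outgoing stack arguments can be
// tail-called from a function whose own incoming argument area is 32 bytes
// (illustrative numbers).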
2806 return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); 2807 } 2808 2809 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 2810 SelectionDAG &DAG, 2811 MachineFrameInfo *MFI, 2812 int ClobberedFI) const { 2813 SmallVector<SDValue, 8> ArgChains; 2814 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); 2815 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; 2816 2817 // Include the original chain at the beginning of the list. When this is 2818 // used by target LowerCall hooks, this helps legalize find the 2819 // CALLSEQ_BEGIN node. 2820 ArgChains.push_back(Chain); 2821 2822 // Add a chain value for each stack argument corresponding 2823 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 2824 UE = DAG.getEntryNode().getNode()->use_end(); 2825 U != UE; ++U) 2826 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 2827 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 2828 if (FI->getIndex() < 0) { 2829 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); 2830 int64_t InLastByte = InFirstByte; 2831 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; 2832 2833 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 2834 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 2835 ArgChains.push_back(SDValue(L, 1)); 2836 } 2837 2838 // Build a tokenfactor for all the chains. 2839 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 2840 } 2841 2842 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 2843 bool TailCallOpt) const { 2844 return CallCC == CallingConv::Fast && TailCallOpt; 2845 } 2846 2847 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { 2848 return CallCC == CallingConv::Fast; 2849 } 2850 2851 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, 2852 /// and add input and output parameter nodes. 2853 SDValue 2854 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 2855 SmallVectorImpl<SDValue> &InVals) const { 2856 SelectionDAG &DAG = CLI.DAG; 2857 SDLoc &DL = CLI.DL; 2858 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 2859 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 2860 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 2861 SDValue Chain = CLI.Chain; 2862 SDValue Callee = CLI.Callee; 2863 bool &IsTailCall = CLI.IsTailCall; 2864 CallingConv::ID CallConv = CLI.CallConv; 2865 bool IsVarArg = CLI.IsVarArg; 2866 2867 MachineFunction &MF = DAG.getMachineFunction(); 2868 bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 2869 bool IsThisReturn = false; 2870 2871 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2872 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2873 bool IsSibCall = false; 2874 2875 if (IsTailCall) { 2876 // Check if it's really possible to do a tail call. 
2877 IsTailCall = isEligibleForTailCallOptimization( 2878 Callee, CallConv, IsVarArg, IsStructRet, 2879 MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); 2880 if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) 2881 report_fatal_error("failed to perform tail call elimination on a call " 2882 "site marked musttail"); 2883 2884 // A sibling call is one where we're under the usual C ABI and not planning 2885 // to change that but can still do a tail call: 2886 if (!TailCallOpt && IsTailCall) 2887 IsSibCall = true; 2888 2889 if (IsTailCall) 2890 ++NumTailCalls; 2891 } 2892 2893 // Analyze operands of the call, assigning locations to each operand. 2894 SmallVector<CCValAssign, 16> ArgLocs; 2895 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, 2896 *DAG.getContext()); 2897 2898 if (IsVarArg) { 2899 // Handle fixed and variable vector arguments differently. 2900 // Variable vector arguments always go into memory. 2901 unsigned NumArgs = Outs.size(); 2902 2903 for (unsigned i = 0; i != NumArgs; ++i) { 2904 MVT ArgVT = Outs[i].VT; 2905 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 2906 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, 2907 /*IsVarArg=*/ !Outs[i].IsFixed); 2908 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); 2909 assert(!Res && "Call operand has unhandled type"); 2910 (void)Res; 2911 } 2912 } else { 2913 // At this point, Outs[].VT may already be promoted to i32. To correctly 2914 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2915 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2916 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here 2917 // we use a special version of AnalyzeCallOperands to pass in ValVT and 2918 // LocVT. 2919 unsigned NumArgs = Outs.size(); 2920 for (unsigned i = 0; i != NumArgs; ++i) { 2921 MVT ValVT = Outs[i].VT; 2922 // Get type of the original argument. 2923 EVT ActualVT = getValueType(DAG.getDataLayout(), 2924 CLI.getArgs()[Outs[i].OrigArgIndex].Ty, 2925 /*AllowUnknown*/ true); 2926 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; 2927 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 2928 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 2929 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 2930 ValVT = MVT::i8; 2931 else if (ActualMVT == MVT::i16) 2932 ValVT = MVT::i16; 2933 2934 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2935 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); 2936 assert(!Res && "Call operand has unhandled type"); 2937 (void)Res; 2938 } 2939 } 2940 2941 // Get a count of how many bytes are to be pushed on the stack. 2942 unsigned NumBytes = CCInfo.getNextStackOffset(); 2943 2944 if (IsSibCall) { 2945 // Since we're not changing the ABI to make this a tail call, the memory 2946 // operands are already available in the caller's incoming argument space. 2947 NumBytes = 0; 2948 } 2949 2950 // FPDiff is the byte offset of the call's argument area from the callee's. 2951 // Stores to callee stack arguments will be placed in FixedStackSlots offset 2952 // by this amount for a tail call. In a sibling call it must be 0 because the 2953 // caller will deallocate the entire stack and the callee still expects its 2954 // arguments to begin at SP+0. Completely unused for non-tail calls. 
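  // Worked example (numbers are illustrative): if the caller was given 16 bytes
  // of stack argument space but this tail call needs 32 bytes once rounded up to
  // a 16-byte multiple, FPDiff will be 16 - 32 = -16, i.e. the callee's argument
  // area starts 16 bytes below where the caller's own incoming arguments began.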
2955 int FPDiff = 0; 2956 2957 if (IsTailCall && !IsSibCall) { 2958 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 2959 2960 // Since callee will pop argument stack as a tail call, we must keep the 2961 // popped size 16-byte aligned. 2962 NumBytes = RoundUpToAlignment(NumBytes, 16); 2963 2964 // FPDiff will be negative if this tail call requires more space than we 2965 // would automatically have in our incoming argument space. Positive if we 2966 // can actually shrink the stack. 2967 FPDiff = NumReusableBytes - NumBytes; 2968 2969 // The stack pointer must be 16-byte aligned at all times it's used for a 2970 // memory operation, which in practice means at *all* times and in 2971 // particular across call boundaries. Therefore our own arguments started at 2972 // a 16-byte aligned SP and the delta applied for the tail call should 2973 // satisfy the same constraint. 2974 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 2975 } 2976 2977 // Adjust the stack pointer for the new arguments... 2978 // These operations are automatically eliminated by the prolog/epilog pass 2979 if (!IsSibCall) 2980 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, 2981 true), 2982 DL); 2983 2984 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 2985 getPointerTy(DAG.getDataLayout())); 2986 2987 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2988 SmallVector<SDValue, 8> MemOpChains; 2989 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2990 2991 // Walk the register/memloc assignments, inserting copies/loads. 2992 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; 2993 ++i, ++realArgIdx) { 2994 CCValAssign &VA = ArgLocs[i]; 2995 SDValue Arg = OutVals[realArgIdx]; 2996 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2997 2998 // Promote the value if needed. 2999 switch (VA.getLocInfo()) { 3000 default: 3001 llvm_unreachable("Unknown loc info!"); 3002 case CCValAssign::Full: 3003 break; 3004 case CCValAssign::SExt: 3005 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3006 break; 3007 case CCValAssign::ZExt: 3008 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3009 break; 3010 case CCValAssign::AExt: 3011 if (Outs[realArgIdx].ArgVT == MVT::i1) { 3012 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 3013 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 3014 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 3015 } 3016 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3017 break; 3018 case CCValAssign::BCvt: 3019 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3020 break; 3021 case CCValAssign::FPExt: 3022 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 3023 break; 3024 } 3025 3026 if (VA.isRegLoc()) { 3027 if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { 3028 assert(VA.getLocVT() == MVT::i64 && 3029 "unexpected calling convention register assignment"); 3030 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 3031 "unexpected use of 'returned'"); 3032 IsThisReturn = true; 3033 } 3034 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 3035 } else { 3036 assert(VA.isMemLoc()); 3037 3038 SDValue DstAddr; 3039 MachinePointerInfo DstInfo; 3040 3041 // FIXME: This works on big-endian for composite byvals, which are the 3042 // common case. It should also work for fundamental types too. 3043 uint32_t BEAlign = 0; 3044 unsigned OpSize = Flags.isByVal() ? 
Flags.getByValSize() * 8 3045 : VA.getValVT().getSizeInBits(); 3046 OpSize = (OpSize + 7) / 8; 3047 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 3048 !Flags.isInConsecutiveRegs()) { 3049 if (OpSize < 8) 3050 BEAlign = 8 - OpSize; 3051 } 3052 unsigned LocMemOffset = VA.getLocMemOffset(); 3053 int32_t Offset = LocMemOffset + BEAlign; 3054 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3055 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3056 3057 if (IsTailCall) { 3058 Offset = Offset + FPDiff; 3059 int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 3060 3061 DstAddr = DAG.getFrameIndex(FI, PtrVT); 3062 DstInfo = 3063 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 3064 3065 // Make sure any stack arguments overlapping with where we're storing 3066 // are loaded before this eventual operation. Otherwise they'll be 3067 // clobbered. 3068 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 3069 } else { 3070 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3071 3072 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3073 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), 3074 LocMemOffset); 3075 } 3076 3077 if (Outs[i].Flags.isByVal()) { 3078 SDValue SizeNode = 3079 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 3080 SDValue Cpy = DAG.getMemcpy( 3081 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), 3082 /*isVol = */ false, /*AlwaysInline = */ false, 3083 /*isTailCall = */ false, 3084 DstInfo, MachinePointerInfo()); 3085 3086 MemOpChains.push_back(Cpy); 3087 } else { 3088 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 3089 // promoted to a legal register type i32, we should truncate Arg back to 3090 // i1/i8/i16. 3091 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 3092 VA.getValVT() == MVT::i16) 3093 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 3094 3095 SDValue Store = 3096 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); 3097 MemOpChains.push_back(Store); 3098 } 3099 } 3100 } 3101 3102 if (!MemOpChains.empty()) 3103 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 3104 3105 // Build a sequence of copy-to-reg nodes chained together with token chain 3106 // and flag operands which copy the outgoing args into the appropriate regs. 3107 SDValue InFlag; 3108 for (auto &RegToPass : RegsToPass) { 3109 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 3110 RegToPass.second, InFlag); 3111 InFlag = Chain.getValue(1); 3112 } 3113 3114 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 3115 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 3116 // node so that legalize doesn't hack it. 
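  // Roughly speaking, the cases below produce either a direct call or, for the
  // large code model on MachO with a non-internal symbol, a GOT-indirect call.
  // Sketch only; the register choice and relocation spelling are illustrative:
  //   bl   _callee                          ; direct
  //   adrp x8, _callee@GOTPAGE              ; GOT-indirect
  //   ldr  x8, [x8, _callee@GOTPAGEOFF]
  //   blr  x8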
3117 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3118 Subtarget->isTargetMachO()) {
3119 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3120 const GlobalValue *GV = G->getGlobal();
3121 bool InternalLinkage = GV->hasInternalLinkage();
3122 if (InternalLinkage)
3123 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3124 else {
3125 Callee =
3126 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3127 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3128 }
3129 } else if (ExternalSymbolSDNode *S =
3130 dyn_cast<ExternalSymbolSDNode>(Callee)) {
3131 const char *Sym = S->getSymbol();
3132 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3133 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3134 }
3135 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3136 const GlobalValue *GV = G->getGlobal();
3137 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3138 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3139 const char *Sym = S->getSymbol();
3140 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3141 }
3142
3143 // We don't usually want to end the call-sequence here because we would tidy
3144 // the frame up *after* the call, however in the ABI-changing tail-call case
3145 // we've carefully laid out the parameters so that when sp is reset they'll be
3146 // in the correct location.
3147 if (IsTailCall && !IsSibCall) {
3148 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3149 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3150 InFlag = Chain.getValue(1);
3151 }
3152
3153 std::vector<SDValue> Ops;
3154 Ops.push_back(Chain);
3155 Ops.push_back(Callee);
3156
3157 if (IsTailCall) {
3158 // Each tail call may have to adjust the stack by a different amount, so
3159 // this information must travel along with the operation for eventual
3160 // consumption by emitEpilogue.
3161 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3162 }
3163
3164 // Add argument registers to the end of the list so that they are known live
3165 // into the call.
3166 for (auto &RegToPass : RegsToPass)
3167 Ops.push_back(DAG.getRegister(RegToPass.first,
3168 RegToPass.second.getValueType()));
3169
3170 // Add a register mask operand representing the call-preserved registers.
3171 const uint32_t *Mask;
3172 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3173 if (IsThisReturn) {
3174 // For 'this' returns, use the X0-preserving mask if applicable
3175 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3176 if (!Mask) {
3177 IsThisReturn = false;
3178 Mask = TRI->getCallPreservedMask(MF, CallConv);
3179 }
3180 } else
3181 Mask = TRI->getCallPreservedMask(MF, CallConv);
3182
3183 assert(Mask && "Missing call preserved mask for calling convention");
3184 Ops.push_back(DAG.getRegisterMask(Mask));
3185
3186 if (InFlag.getNode())
3187 Ops.push_back(InFlag);
3188
3189 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3190
3191 // If we're doing a tail call, use a TC_RETURN here rather than an
3192 // actual call instruction.
3193 if (IsTailCall) {
3194 MF.getFrameInfo()->setHasTailCall();
3195 return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3196 }
3197
3198 // Returns a chain and a flag for retval copy to use.
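  // (For comparison with the tail-call path above, and only as a sketch of
  // intent: the TC_RETURN node is eventually emitted as a branch, b/br, in
  // place of a return, whereas the plain AArch64ISD::CALL below becomes a
  // bl/blr followed by the usual CALLSEQ_END bookkeeping.)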
3199 Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); 3200 InFlag = Chain.getValue(1); 3201 3202 uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) 3203 ? RoundUpToAlignment(NumBytes, 16) 3204 : 0; 3205 3206 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), 3207 DAG.getIntPtrConstant(CalleePopBytes, DL, true), 3208 InFlag, DL); 3209 if (!Ins.empty()) 3210 InFlag = Chain.getValue(1); 3211 3212 // Handle result values, copying them out of physregs into vregs that we 3213 // return. 3214 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, 3215 InVals, IsThisReturn, 3216 IsThisReturn ? OutVals[0] : SDValue()); 3217 } 3218 3219 bool AArch64TargetLowering::CanLowerReturn( 3220 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, 3221 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { 3222 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 3223 ? RetCC_AArch64_WebKit_JS 3224 : RetCC_AArch64_AAPCS; 3225 SmallVector<CCValAssign, 16> RVLocs; 3226 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 3227 return CCInfo.CheckReturn(Outs, RetCC); 3228 } 3229 3230 SDValue 3231 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3232 bool isVarArg, 3233 const SmallVectorImpl<ISD::OutputArg> &Outs, 3234 const SmallVectorImpl<SDValue> &OutVals, 3235 SDLoc DL, SelectionDAG &DAG) const { 3236 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 3237 ? RetCC_AArch64_WebKit_JS 3238 : RetCC_AArch64_AAPCS; 3239 SmallVector<CCValAssign, 16> RVLocs; 3240 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3241 *DAG.getContext()); 3242 CCInfo.AnalyzeReturn(Outs, RetCC); 3243 3244 // Copy the result values into the output registers. 3245 SDValue Flag; 3246 SmallVector<SDValue, 4> RetOps(1, Chain); 3247 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); 3248 ++i, ++realRVLocIdx) { 3249 CCValAssign &VA = RVLocs[i]; 3250 assert(VA.isRegLoc() && "Can only return in registers!"); 3251 SDValue Arg = OutVals[realRVLocIdx]; 3252 3253 switch (VA.getLocInfo()) { 3254 default: 3255 llvm_unreachable("Unknown loc info!"); 3256 case CCValAssign::Full: 3257 if (Outs[i].ArgVT == MVT::i1) { 3258 // AAPCS requires i1 to be zero-extended to i8 by the producer of the 3259 // value. This is strictly redundant on Darwin (which uses "zeroext 3260 // i1"), but will be optimised out before ISel. 3261 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 3262 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3263 } 3264 break; 3265 case CCValAssign::BCvt: 3266 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3267 break; 3268 } 3269 3270 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); 3271 Flag = Chain.getValue(1); 3272 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3273 } 3274 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 3275 const MCPhysReg *I = 3276 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3277 if (I) { 3278 for (; *I; ++I) { 3279 if (AArch64::GPR64RegClass.contains(*I)) 3280 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 3281 else if (AArch64::FPR64RegClass.contains(*I)) 3282 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 3283 else 3284 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3285 } 3286 } 3287 3288 RetOps[0] = Chain; // Update chain. 3289 3290 // Add the flag if we have it. 
3291 if (Flag.getNode())
3292 RetOps.push_back(Flag);
3293
3294 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3295 }
3296
3297 //===----------------------------------------------------------------------===//
3298 // Other Lowering Code
3299 //===----------------------------------------------------------------------===//
3300
3301 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
3302 SelectionDAG &DAG) const {
3303 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3304 SDLoc DL(Op);
3305 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
3306 const GlobalValue *GV = GN->getGlobal();
3307 unsigned char OpFlags =
3308 Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
3309
3310 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
3311 "unexpected offset in global node");
3312
3313 // This also catches the large code model case for Darwin.
3314 if ((OpFlags & AArch64II::MO_GOT) != 0) {
3315 SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
3316 // FIXME: Once remat is capable of dealing with instructions with register
3317 // operands, expand this into two nodes instead of using a wrapper node.
3318 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
3319 }
3320
3321 if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
3322 assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
3323 "use of MO_CONSTPOOL only supported on small model");
3324 SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
3325 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
3326 unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
3327 SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
3328 SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
3329 SDValue GlobalAddr = DAG.getLoad(
3330 PtrVT, DL, DAG.getEntryNode(), PoolAddr,
3331 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
3332 /*isVolatile=*/false,
3333 /*isNonTemporal=*/true,
3334 /*isInvariant=*/true, 8);
3335 if (GN->getOffset() != 0)
3336 return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
3337 DAG.getConstant(GN->getOffset(), DL, PtrVT));
3338 return GlobalAddr;
3339 }
3340
3341 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
3342 const unsigned char MO_NC = AArch64II::MO_NC;
3343 return DAG.getNode(
3344 AArch64ISD::WrapperLarge, DL, PtrVT,
3345 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
3346 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
3347 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
3348 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
3349 } else {
3350 // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
3351 // the only correct model on Darwin.
3352 SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
3353 OpFlags | AArch64II::MO_PAGE);
3354 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
3355 SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
3356
3357 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
3358 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
3359 }
3360 }
3361
3362 /// \brief Convert a TLS address reference into the correct sequence of loads
3363 /// and calls to compute the variable's address (for Darwin, currently) and
3364 /// return an SDValue containing the final node.
3365 3366 /// Darwin only has one TLS scheme which must be capable of dealing with the 3367 /// fully general situation, in the worst case. This means: 3368 /// + "extern __thread" declaration. 3369 /// + Defined in a possibly unknown dynamic library. 3370 /// 3371 /// The general system is that each __thread variable has a [3 x i64] descriptor 3372 /// which contains information used by the runtime to calculate the address. The 3373 /// only part of this the compiler needs to know about is the first xword, which 3374 /// contains a function pointer that must be called with the address of the 3375 /// entire descriptor in "x0". 3376 /// 3377 /// Since this descriptor may be in a different unit, in general even the 3378 /// descriptor must be accessed via an indirect load. The "ideal" code sequence 3379 /// is: 3380 /// adrp x0, _var@TLVPPAGE 3381 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor 3382 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, 3383 /// ; the function pointer 3384 /// blr x1 ; Uses descriptor address in x0 3385 /// ; Address of _var is now in x0. 3386 /// 3387 /// If the address of _var's descriptor *is* known to the linker, then it can 3388 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 3389 /// a slight efficiency gain. 3390 SDValue 3391 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 3392 SelectionDAG &DAG) const { 3393 assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); 3394 3395 SDLoc DL(Op); 3396 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 3397 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3398 3399 SDValue TLVPAddr = 3400 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3401 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 3402 3403 // The first entry in the descriptor is a function pointer that we must call 3404 // to obtain the address of the variable. 3405 SDValue Chain = DAG.getEntryNode(); 3406 SDValue FuncTLVGet = 3407 DAG.getLoad(MVT::i64, DL, Chain, DescAddr, 3408 MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, 3409 true, true, 8); 3410 Chain = FuncTLVGet.getValue(1); 3411 3412 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3413 MFI->setAdjustsStack(true); 3414 3415 // TLS calls preserve all registers except those that absolutely must be 3416 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3417 // silly). 3418 const uint32_t *Mask = 3419 Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); 3420 3421 // Finally, we can make the call. This is just a degenerate version of a 3422 // normal AArch64 call node: x0 takes the address of the descriptor, and 3423 // returns the address of the variable in this thread. 3424 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 3425 Chain = 3426 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3427 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 3428 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3429 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 3430 } 3431 3432 /// When accessing thread-local variables under either the general-dynamic or 3433 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will 3434 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 3435 /// is a function pointer to carry out the resolution. 
3436 /// 3437 /// The sequence is: 3438 /// adrp x0, :tlsdesc:var 3439 /// ldr x1, [x0, #:tlsdesc_lo12:var] 3440 /// add x0, x0, #:tlsdesc_lo12:var 3441 /// .tlsdesccall var 3442 /// blr x1 3443 /// (TPIDR_EL0 offset now in x0) 3444 /// 3445 /// The above sequence must be produced unscheduled, to enable the linker to 3446 /// optimize/relax this sequence. 3447 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 3448 /// above sequence, and expanded really late in the compilation flow, to ensure 3449 /// the sequence is produced as per above. 3450 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, 3451 SelectionDAG &DAG) const { 3452 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3453 3454 SDValue Chain = DAG.getEntryNode(); 3455 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3456 3457 SmallVector<SDValue, 2> Ops; 3458 Ops.push_back(Chain); 3459 Ops.push_back(SymAddr); 3460 3461 Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); 3462 SDValue Glue = Chain.getValue(1); 3463 3464 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 3465 } 3466 3467 SDValue 3468 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, 3469 SelectionDAG &DAG) const { 3470 assert(Subtarget->isTargetELF() && "This function expects an ELF target"); 3471 assert(getTargetMachine().getCodeModel() == CodeModel::Small && 3472 "ELF TLS only supported in small memory model"); 3473 // Different choices can be made for the maximum size of the TLS area for a 3474 // module. For the small address model, the default TLS size is 16MiB and the 3475 // maximum TLS size is 4GiB. 3476 // FIXME: add -mtls-size command line option and make it control the 16MiB 3477 // vs. 4GiB code sequence generation. 3478 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3479 3480 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 3481 3482 if (DAG.getTarget().Options.EmulatedTLS) 3483 return LowerToTLSEmulatedModel(GA, DAG); 3484 3485 if (!EnableAArch64ELFLocalDynamicTLSGeneration) { 3486 if (Model == TLSModel::LocalDynamic) 3487 Model = TLSModel::GeneralDynamic; 3488 } 3489 3490 SDValue TPOff; 3491 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3492 SDLoc DL(Op); 3493 const GlobalValue *GV = GA->getGlobal(); 3494 3495 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 3496 3497 if (Model == TLSModel::LocalExec) { 3498 SDValue HiVar = DAG.getTargetGlobalAddress( 3499 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 3500 SDValue LoVar = DAG.getTargetGlobalAddress( 3501 GV, DL, PtrVT, 0, 3502 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3503 3504 SDValue TPWithOff_lo = 3505 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 3506 HiVar, 3507 DAG.getTargetConstant(0, DL, MVT::i32)), 3508 0); 3509 SDValue TPWithOff = 3510 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, 3511 LoVar, 3512 DAG.getTargetConstant(0, DL, MVT::i32)), 3513 0); 3514 return TPWithOff; 3515 } else if (Model == TLSModel::InitialExec) { 3516 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3517 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); 3518 } else if (Model == TLSModel::LocalDynamic) { 3519 // Local-dynamic accesses proceed in two phases. 
A general-dynamic TLS 3520 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 3521 // the beginning of the module's TLS region, followed by a DTPREL offset 3522 // calculation. 3523 3524 // These accesses will need deduplicating if there's more than one. 3525 AArch64FunctionInfo *MFI = 3526 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 3527 MFI->incNumLocalDynamicTLSAccesses(); 3528 3529 // The call needs a relocation too for linker relaxation. It doesn't make 3530 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 3531 // the address. 3532 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 3533 AArch64II::MO_TLS); 3534 3535 // Now we can calculate the offset from TPIDR_EL0 to this module's 3536 // thread-local area. 3537 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 3538 3539 // Now use :dtprel_whatever: operations to calculate this variable's offset 3540 // in its thread-storage area. 3541 SDValue HiVar = DAG.getTargetGlobalAddress( 3542 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 3543 SDValue LoVar = DAG.getTargetGlobalAddress( 3544 GV, DL, MVT::i64, 0, 3545 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3546 3547 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, 3548 DAG.getTargetConstant(0, DL, MVT::i32)), 3549 0); 3550 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, 3551 DAG.getTargetConstant(0, DL, MVT::i32)), 3552 0); 3553 } else if (Model == TLSModel::GeneralDynamic) { 3554 // The call needs a relocation too for linker relaxation. It doesn't make 3555 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 3556 // the address. 3557 SDValue SymAddr = 3558 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3559 3560 // Finally we can make a call to calculate the offset from tpidr_el0. 3561 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 3562 } else 3563 llvm_unreachable("Unsupported ELF TLS access model"); 3564 3565 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 3566 } 3567 3568 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 3569 SelectionDAG &DAG) const { 3570 if (Subtarget->isTargetDarwin()) 3571 return LowerDarwinGlobalTLSAddress(Op, DAG); 3572 else if (Subtarget->isTargetELF()) 3573 return LowerELFGlobalTLSAddress(Op, DAG); 3574 3575 llvm_unreachable("Unexpected platform trying to use TLS"); 3576 } 3577 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 3578 SDValue Chain = Op.getOperand(0); 3579 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 3580 SDValue LHS = Op.getOperand(2); 3581 SDValue RHS = Op.getOperand(3); 3582 SDValue Dest = Op.getOperand(4); 3583 SDLoc dl(Op); 3584 3585 // Handle f128 first, since lowering it will result in comparing the return 3586 // value of a libcall against zero, which is just what the rest of LowerBR_CC 3587 // is expecting to deal with. 3588 if (LHS.getValueType() == MVT::f128) { 3589 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3590 3591 // If softenSetCCOperands returned a scalar, we need to compare the result 3592 // against zero to select between true and false values. 3593 if (!RHS.getNode()) { 3594 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 3595 CC = ISD::SETNE; 3596 } 3597 } 3598 3599 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 3600 // instruction. 
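  // For instance (hypothetical source, sketch of the resulting code): branching
  // on the i1 overflow result of llvm.sadd.with.overflow, as produced for
  // __builtin_add_overflow, can be emitted as roughly
  //   adds w8, w0, w1
  //   b.vs overflow_handler
  // instead of first materializing the i1 flag in a register and testing it.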
3601 unsigned Opc = LHS.getOpcode(); 3602 if (LHS.getResNo() == 1 && isOneConstant(RHS) && 3603 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 3604 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 3605 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && 3606 "Unexpected condition code."); 3607 // Only lower legal XALUO ops. 3608 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 3609 return SDValue(); 3610 3611 // The actual operation with overflow check. 3612 AArch64CC::CondCode OFCC; 3613 SDValue Value, Overflow; 3614 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); 3615 3616 if (CC == ISD::SETNE) 3617 OFCC = getInvertedCondCode(OFCC); 3618 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); 3619 3620 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 3621 Overflow); 3622 } 3623 3624 if (LHS.getValueType().isInteger()) { 3625 assert((LHS.getValueType() == RHS.getValueType()) && 3626 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 3627 3628 // If the RHS of the comparison is zero, we can potentially fold this 3629 // to a specialized branch. 3630 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); 3631 if (RHSC && RHSC->getZExtValue() == 0) { 3632 if (CC == ISD::SETEQ) { 3633 // See if we can use a TBZ to fold in an AND as well. 3634 // TBZ has a smaller branch displacement than CBZ. If the offset is 3635 // out of bounds, a late MI-layer pass rewrites branches. 3636 // 403.gcc is an example that hits this case. 3637 if (LHS.getOpcode() == ISD::AND && 3638 isa<ConstantSDNode>(LHS.getOperand(1)) && 3639 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 3640 SDValue Test = LHS.getOperand(0); 3641 uint64_t Mask = LHS.getConstantOperandVal(1); 3642 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, 3643 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 3644 Dest); 3645 } 3646 3647 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); 3648 } else if (CC == ISD::SETNE) { 3649 // See if we can use a TBZ to fold in an AND as well. 3650 // TBZ has a smaller branch displacement than CBZ. If the offset is 3651 // out of bounds, a late MI-layer pass rewrites branches. 3652 // 403.gcc is an example that hits this case. 3653 if (LHS.getOpcode() == ISD::AND && 3654 isa<ConstantSDNode>(LHS.getOperand(1)) && 3655 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 3656 SDValue Test = LHS.getOperand(0); 3657 uint64_t Mask = LHS.getConstantOperandVal(1); 3658 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, 3659 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 3660 Dest); 3661 } 3662 3663 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); 3664 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { 3665 // Don't combine AND since emitComparison converts the AND to an ANDS 3666 // (a.k.a. TST) and the test in the test bit and branch instruction 3667 // becomes redundant. This would also increase register pressure. 3668 uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; 3669 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, 3670 DAG.getConstant(Mask, dl, MVT::i64), Dest); 3671 } 3672 } 3673 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && 3674 LHS.getOpcode() != ISD::AND) { 3675 // Don't combine AND since emitComparison converts the AND to an ANDS 3676 // (a.k.a. TST) and the test in the test bit and branch instruction 3677 // becomes redundant. This would also increase register pressure. 
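      // (x > -1) is simply "sign bit clear", so e.g. an i64 "x >= 0" branch can
      // be emitted as roughly "tbz x0, #63, dest"; the SETLT-against-zero case
      // above is the mirror image and becomes a TBNZ on the sign bit. Register
      // names here are purely illustrative.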
3678 uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; 3679 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, 3680 DAG.getConstant(Mask, dl, MVT::i64), Dest); 3681 } 3682 3683 SDValue CCVal; 3684 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 3685 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 3686 Cmp); 3687 } 3688 3689 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 3690 3691 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 3692 // clean. Some of them require two branches to implement. 3693 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 3694 AArch64CC::CondCode CC1, CC2; 3695 changeFPCCToAArch64CC(CC, CC1, CC2); 3696 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 3697 SDValue BR1 = 3698 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); 3699 if (CC2 != AArch64CC::AL) { 3700 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 3701 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, 3702 Cmp); 3703 } 3704 3705 return BR1; 3706 } 3707 3708 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, 3709 SelectionDAG &DAG) const { 3710 EVT VT = Op.getValueType(); 3711 SDLoc DL(Op); 3712 3713 SDValue In1 = Op.getOperand(0); 3714 SDValue In2 = Op.getOperand(1); 3715 EVT SrcVT = In2.getValueType(); 3716 3717 if (SrcVT.bitsLT(VT)) 3718 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); 3719 else if (SrcVT.bitsGT(VT)) 3720 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); 3721 3722 EVT VecVT; 3723 EVT EltVT; 3724 uint64_t EltMask; 3725 SDValue VecVal1, VecVal2; 3726 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { 3727 EltVT = MVT::i32; 3728 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); 3729 EltMask = 0x80000000ULL; 3730 3731 if (!VT.isVector()) { 3732 VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, 3733 DAG.getUNDEF(VecVT), In1); 3734 VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, 3735 DAG.getUNDEF(VecVT), In2); 3736 } else { 3737 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 3738 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 3739 } 3740 } else if (VT == MVT::f64 || VT == MVT::v2f64) { 3741 EltVT = MVT::i64; 3742 VecVT = MVT::v2i64; 3743 3744 // We want to materialize a mask with the high bit set, but the AdvSIMD 3745 // immediate moves cannot materialize that in a single instruction for 3746 // 64-bit elements. Instead, materialize zero and then negate it. 3747 EltMask = 0; 3748 3749 if (!VT.isVector()) { 3750 VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, 3751 DAG.getUNDEF(VecVT), In1); 3752 VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, 3753 DAG.getUNDEF(VecVT), In2); 3754 } else { 3755 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 3756 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 3757 } 3758 } else { 3759 llvm_unreachable("Invalid type for copysign!"); 3760 } 3761 3762 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); 3763 3764 // If we couldn't materialize the mask above, then the mask vector will be 3765 // the zero vector, and we need to negate it here. 
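  // Negating the zero vector as f64 works because FNEG merely flips the sign
  // bit: -0.0 has the bit pattern 0x8000000000000000, which is exactly the
  // per-lane sign-bit mask the BIT (bitwise insert if true) node below needs.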
3766 if (VT == MVT::f64 || VT == MVT::v2f64) { 3767 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); 3768 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); 3769 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); 3770 } 3771 3772 SDValue Sel = 3773 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); 3774 3775 if (VT == MVT::f32) 3776 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); 3777 else if (VT == MVT::f64) 3778 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); 3779 else 3780 return DAG.getNode(ISD::BITCAST, DL, VT, Sel); 3781 } 3782 3783 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { 3784 if (DAG.getMachineFunction().getFunction()->hasFnAttribute( 3785 Attribute::NoImplicitFloat)) 3786 return SDValue(); 3787 3788 if (!Subtarget->hasNEON()) 3789 return SDValue(); 3790 3791 // While there is no integer popcount instruction, it can 3792 // be more efficiently lowered to the following sequence that uses 3793 // AdvSIMD registers/instructions as long as the copies to/from 3794 // the AdvSIMD registers are cheap. 3795 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 3796 // CNT V0.8B, V0.8B // 8xbyte pop-counts 3797 // ADDV B0, V0.8B // sum 8xbyte pop-counts 3798 // UMOV X0, V0.B[0] // copy byte result back to integer reg 3799 SDValue Val = Op.getOperand(0); 3800 SDLoc DL(Op); 3801 EVT VT = Op.getValueType(); 3802 3803 if (VT == MVT::i32) 3804 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 3805 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 3806 3807 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 3808 SDValue UaddLV = DAG.getNode( 3809 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 3810 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 3811 3812 if (VT == MVT::i64) 3813 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 3814 return UaddLV; 3815 } 3816 3817 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 3818 3819 if (Op.getValueType().isVector()) 3820 return LowerVSETCC(Op, DAG); 3821 3822 SDValue LHS = Op.getOperand(0); 3823 SDValue RHS = Op.getOperand(1); 3824 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 3825 SDLoc dl(Op); 3826 3827 // We chose ZeroOrOneBooleanContents, so use zero and one. 3828 EVT VT = Op.getValueType(); 3829 SDValue TVal = DAG.getConstant(1, dl, VT); 3830 SDValue FVal = DAG.getConstant(0, dl, VT); 3831 3832 // Handle f128 first, since one possible outcome is a normal integer 3833 // comparison which gets picked up by the next if statement. 3834 if (LHS.getValueType() == MVT::f128) { 3835 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3836 3837 // If softenSetCCOperands returned a scalar, use it. 3838 if (!RHS.getNode()) { 3839 assert(LHS.getValueType() == Op.getValueType() && 3840 "Unexpected setcc expansion!"); 3841 return LHS; 3842 } 3843 } 3844 3845 if (LHS.getValueType().isInteger()) { 3846 SDValue CCVal; 3847 SDValue Cmp = 3848 getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); 3849 3850 // Note that we inverted the condition above, so we reverse the order of 3851 // the true and false operands here. This will allow the setcc to be 3852 // matched to a single CSINC instruction. 3853 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); 3854 } 3855 3856 // Now we know we're dealing with FP values. 
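  // Most FP conditions map onto a single AArch64 condition after the FCMP, but
  // a few have no direct equivalent; e.g. an unordered-or-equal (SETUEQ) test
  // is lowered, roughly, as EQ combined with VS ("equal, or the comparison was
  // unordered"), which is why a second conditional select may be needed below.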
3857 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3858
3859 // We'll need to perform an FCMP + CSEL sequence. Go ahead
3860 // and do the comparison.
3861 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3862
3863 AArch64CC::CondCode CC1, CC2;
3864 changeFPCCToAArch64CC(CC, CC1, CC2);
3865 if (CC2 == AArch64CC::AL) {
3866 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
3867 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
3868
3869 // Note that we inverted the condition above, so we reverse the order of
3870 // the true and false operands here. This will allow the setcc to be
3871 // matched to a single CSINC instruction.
3872 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
3873 } else {
3874 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
3875 // totally clean. Some of them require two CSELs to implement. In that
3876 // case, we emit the first CSEL and then emit a second using the output
3877 // of the first as the RHS. We're effectively OR'ing the two CC's together.
3878
3879 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
3880 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
3881 SDValue CS1 =
3882 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
3883
3884 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
3885 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
3886 }
3887 }
3888
3889 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
3890 SDValue RHS, SDValue TVal,
3891 SDValue FVal, SDLoc dl,
3892 SelectionDAG &DAG) const {
3893 // Handle f128 first, because it will result in a comparison of some RTLIB
3894 // call result against zero.
3895 if (LHS.getValueType() == MVT::f128) {
3896 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
3897
3898 // If softenSetCCOperands returned a scalar, we need to compare the result
3899 // against zero to select between true and false values.
3900 if (!RHS.getNode()) {
3901 RHS = DAG.getConstant(0, dl, LHS.getValueType());
3902 CC = ISD::SETNE;
3903 }
3904 }
3905
3906 // Also handle f16, for which we need to do an f32 comparison.
3907 if (LHS.getValueType() == MVT::f16) {
3908 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3909 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3910 }
3911
3912 // Next, handle integers.
3913 if (LHS.getValueType().isInteger()) {
3914 assert((LHS.getValueType() == RHS.getValueType()) &&
3915 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
3916
3917 unsigned Opcode = AArch64ISD::CSEL;
3918
3919 // If both the TVal and the FVal are constants, see if we can swap them in
3920 // order to form a CSINV or CSINC out of them.
3921 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3922 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3923
3924 if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
3925 std::swap(TVal, FVal);
3926 std::swap(CTVal, CFVal);
3927 CC = ISD::getSetCCInverse(CC, true);
3928 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
3929 std::swap(TVal, FVal);
3930 std::swap(CTVal, CFVal);
3931 CC = ISD::getSetCCInverse(CC, true);
3932 } else if (TVal.getOpcode() == ISD::XOR) {
3933 // If TVal is a NOT we want to swap TVal and FVal so that we can match
3934 // with a CSINV rather than a CSEL.
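      // For reference, the conditional-select family computes, in sketch form:
      //   CSEL  d, a, b, cc  ->  d = cc ? a : b
      //   CSINC d, a, b, cc  ->  d = cc ? a : b + 1
      //   CSINV d, a, b, cc  ->  d = cc ? a : ~b
      //   CSNEG d, a, b, cc  ->  d = cc ? a : -b
      // so a select of the form "cc ? ~x : y" can be rotated into roughly
      // "csinv d, y, x, !cc" by the swap performed here.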
3935 if (isAllOnesConstant(TVal.getOperand(1))) { 3936 std::swap(TVal, FVal); 3937 std::swap(CTVal, CFVal); 3938 CC = ISD::getSetCCInverse(CC, true); 3939 } 3940 } else if (TVal.getOpcode() == ISD::SUB) { 3941 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so 3942 // that we can match with a CSNEG rather than a CSEL. 3943 if (isNullConstant(TVal.getOperand(0))) { 3944 std::swap(TVal, FVal); 3945 std::swap(CTVal, CFVal); 3946 CC = ISD::getSetCCInverse(CC, true); 3947 } 3948 } else if (CTVal && CFVal) { 3949 const int64_t TrueVal = CTVal->getSExtValue(); 3950 const int64_t FalseVal = CFVal->getSExtValue(); 3951 bool Swap = false; 3952 3953 // If both TVal and FVal are constants, see if FVal is the 3954 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC 3955 // instead of a CSEL in that case. 3956 if (TrueVal == ~FalseVal) { 3957 Opcode = AArch64ISD::CSINV; 3958 } else if (TrueVal == -FalseVal) { 3959 Opcode = AArch64ISD::CSNEG; 3960 } else if (TVal.getValueType() == MVT::i32) { 3961 // If our operands are only 32-bit wide, make sure we use 32-bit 3962 // arithmetic for the check whether we can use CSINC. This ensures that 3963 // the addition in the check will wrap around properly in case there is 3964 // an overflow (which would not be the case if we do the check with 3965 // 64-bit arithmetic). 3966 const uint32_t TrueVal32 = CTVal->getZExtValue(); 3967 const uint32_t FalseVal32 = CFVal->getZExtValue(); 3968 3969 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { 3970 Opcode = AArch64ISD::CSINC; 3971 3972 if (TrueVal32 > FalseVal32) { 3973 Swap = true; 3974 } 3975 } 3976 // 64-bit check whether we can use CSINC. 3977 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { 3978 Opcode = AArch64ISD::CSINC; 3979 3980 if (TrueVal > FalseVal) { 3981 Swap = true; 3982 } 3983 } 3984 3985 // Swap TVal and FVal if necessary. 3986 if (Swap) { 3987 std::swap(TVal, FVal); 3988 std::swap(CTVal, CFVal); 3989 CC = ISD::getSetCCInverse(CC, true); 3990 } 3991 3992 if (Opcode != AArch64ISD::CSEL) { 3993 // Drop FVal since we can get its value by simply inverting/negating 3994 // TVal. 3995 FVal = TVal; 3996 } 3997 } 3998 3999 SDValue CCVal; 4000 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 4001 4002 EVT VT = TVal.getValueType(); 4003 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); 4004 } 4005 4006 // Now we know we're dealing with FP values. 4007 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 4008 assert(LHS.getValueType() == RHS.getValueType()); 4009 EVT VT = TVal.getValueType(); 4010 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 4011 4012 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 4013 // clean. Some of them require two CSELs to implement. 4014 AArch64CC::CondCode CC1, CC2; 4015 changeFPCCToAArch64CC(CC, CC1, CC2); 4016 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 4017 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 4018 4019 // If we need a second CSEL, emit it, using the output of the first as the 4020 // RHS. We're effectively OR'ing the two CC's together. 4021 if (CC2 != AArch64CC::AL) { 4022 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 4023 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 4024 } 4025 4026 // Otherwise, return the output of the first CSEL. 
4027 return CS1; 4028 } 4029 4030 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, 4031 SelectionDAG &DAG) const { 4032 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4033 SDValue LHS = Op.getOperand(0); 4034 SDValue RHS = Op.getOperand(1); 4035 SDValue TVal = Op.getOperand(2); 4036 SDValue FVal = Op.getOperand(3); 4037 SDLoc DL(Op); 4038 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4039 } 4040 4041 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, 4042 SelectionDAG &DAG) const { 4043 SDValue CCVal = Op->getOperand(0); 4044 SDValue TVal = Op->getOperand(1); 4045 SDValue FVal = Op->getOperand(2); 4046 SDLoc DL(Op); 4047 4048 unsigned Opc = CCVal.getOpcode(); 4049 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select 4050 // instruction. 4051 if (CCVal.getResNo() == 1 && 4052 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4053 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 4054 // Only lower legal XALUO ops. 4055 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) 4056 return SDValue(); 4057 4058 AArch64CC::CondCode OFCC; 4059 SDValue Value, Overflow; 4060 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); 4061 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); 4062 4063 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, 4064 CCVal, Overflow); 4065 } 4066 4067 // Lower it the same way as we would lower a SELECT_CC node. 4068 ISD::CondCode CC; 4069 SDValue LHS, RHS; 4070 if (CCVal.getOpcode() == ISD::SETCC) { 4071 LHS = CCVal.getOperand(0); 4072 RHS = CCVal.getOperand(1); 4073 CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get(); 4074 } else { 4075 LHS = CCVal; 4076 RHS = DAG.getConstant(0, DL, CCVal.getValueType()); 4077 CC = ISD::SETNE; 4078 } 4079 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4080 } 4081 4082 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, 4083 SelectionDAG &DAG) const { 4084 // Jump table entries as PC relative offsets. No additional tweaking 4085 // is necessary here. Just get the address of the jump table. 4086 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4087 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4088 SDLoc DL(Op); 4089 4090 if (getTargetMachine().getCodeModel() == CodeModel::Large && 4091 !Subtarget->isTargetMachO()) { 4092 const unsigned char MO_NC = AArch64II::MO_NC; 4093 return DAG.getNode( 4094 AArch64ISD::WrapperLarge, DL, PtrVT, 4095 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), 4096 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), 4097 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), 4098 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 4099 AArch64II::MO_G0 | MO_NC)); 4100 } 4101 4102 SDValue Hi = 4103 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); 4104 SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 4105 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4106 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4107 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4108 } 4109 4110 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, 4111 SelectionDAG &DAG) const { 4112 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4113 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4114 SDLoc DL(Op); 4115 4116 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 4117 // Use the GOT for the large code model on iOS. 
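    // Going through the GOT keeps the materialization at two instructions
    // (ADRP of the GOT slot plus an LDR), whereas the WrapperLarge path below
    // builds a full 64-bit absolute address, roughly (operand syntax
    // illustrative):
    //   movz x0, #:abs_g3:sym
    //   movk x0, #:abs_g2_nc:sym
    //   movk x0, #:abs_g1_nc:sym
    //   movk x0, #:abs_g0_nc:sym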
4118 if (Subtarget->isTargetMachO()) { 4119 SDValue GotAddr = DAG.getTargetConstantPool( 4120 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 4121 AArch64II::MO_GOT); 4122 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); 4123 } 4124 4125 const unsigned char MO_NC = AArch64II::MO_NC; 4126 return DAG.getNode( 4127 AArch64ISD::WrapperLarge, DL, PtrVT, 4128 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4129 CP->getOffset(), AArch64II::MO_G3), 4130 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4131 CP->getOffset(), AArch64II::MO_G2 | MO_NC), 4132 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4133 CP->getOffset(), AArch64II::MO_G1 | MO_NC), 4134 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4135 CP->getOffset(), AArch64II::MO_G0 | MO_NC)); 4136 } else { 4137 // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on 4138 // ELF, the only valid one on Darwin. 4139 SDValue Hi = 4140 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4141 CP->getOffset(), AArch64II::MO_PAGE); 4142 SDValue Lo = DAG.getTargetConstantPool( 4143 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 4144 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4145 4146 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4147 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4148 } 4149 } 4150 4151 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 4152 SelectionDAG &DAG) const { 4153 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 4154 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4155 SDLoc DL(Op); 4156 if (getTargetMachine().getCodeModel() == CodeModel::Large && 4157 !Subtarget->isTargetMachO()) { 4158 const unsigned char MO_NC = AArch64II::MO_NC; 4159 return DAG.getNode( 4160 AArch64ISD::WrapperLarge, DL, PtrVT, 4161 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), 4162 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), 4163 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), 4164 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); 4165 } else { 4166 SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); 4167 SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | 4168 AArch64II::MO_NC); 4169 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4170 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4171 } 4172 } 4173 4174 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 4175 SelectionDAG &DAG) const { 4176 AArch64FunctionInfo *FuncInfo = 4177 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 4178 4179 SDLoc DL(Op); 4180 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), 4181 getPointerTy(DAG.getDataLayout())); 4182 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4183 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 4184 MachinePointerInfo(SV), false, false, 0); 4185 } 4186 4187 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 4188 SelectionDAG &DAG) const { 4189 // The layout of the va_list struct is specified in the AArch64 Procedure Call 4190 // Standard, section B.3. 
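  // In C terms the expected layout is (offsets matching the stores below):
  //   struct va_list {
  //     void *__stack;   // offset 0
  //     void *__gr_top;  // offset 8
  //     void *__vr_top;  // offset 16
  //     int   __gr_offs; // offset 24
  //     int   __vr_offs; // offset 28
  //   };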
4191 MachineFunction &MF = DAG.getMachineFunction(); 4192 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 4193 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4194 SDLoc DL(Op); 4195 4196 SDValue Chain = Op.getOperand(0); 4197 SDValue VAList = Op.getOperand(1); 4198 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4199 SmallVector<SDValue, 4> MemOps; 4200 4201 // void *__stack at offset 0 4202 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 4203 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 4204 MachinePointerInfo(SV), false, false, 8)); 4205 4206 // void *__gr_top at offset 8 4207 int GPRSize = FuncInfo->getVarArgsGPRSize(); 4208 if (GPRSize > 0) { 4209 SDValue GRTop, GRTopAddr; 4210 4211 GRTopAddr = 4212 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); 4213 4214 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 4215 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 4216 DAG.getConstant(GPRSize, DL, PtrVT)); 4217 4218 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 4219 MachinePointerInfo(SV, 8), false, false, 8)); 4220 } 4221 4222 // void *__vr_top at offset 16 4223 int FPRSize = FuncInfo->getVarArgsFPRSize(); 4224 if (FPRSize > 0) { 4225 SDValue VRTop, VRTopAddr; 4226 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4227 DAG.getConstant(16, DL, PtrVT)); 4228 4229 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 4230 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 4231 DAG.getConstant(FPRSize, DL, PtrVT)); 4232 4233 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 4234 MachinePointerInfo(SV, 16), false, false, 8)); 4235 } 4236 4237 // int __gr_offs at offset 24 4238 SDValue GROffsAddr = 4239 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); 4240 MemOps.push_back(DAG.getStore(Chain, DL, 4241 DAG.getConstant(-GPRSize, DL, MVT::i32), 4242 GROffsAddr, MachinePointerInfo(SV, 24), false, 4243 false, 4)); 4244 4245 // int __vr_offs at offset 28 4246 SDValue VROffsAddr = 4247 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); 4248 MemOps.push_back(DAG.getStore(Chain, DL, 4249 DAG.getConstant(-FPRSize, DL, MVT::i32), 4250 VROffsAddr, MachinePointerInfo(SV, 28), false, 4251 false, 4)); 4252 4253 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 4254 } 4255 4256 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 4257 SelectionDAG &DAG) const { 4258 return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) 4259 : LowerAAPCS_VASTART(Op, DAG); 4260 } 4261 4262 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 4263 SelectionDAG &DAG) const { 4264 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 4265 // pointer. 4266 SDLoc DL(Op); 4267 unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; 4268 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 4269 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 4270 4271 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), 4272 Op.getOperand(2), 4273 DAG.getConstant(VaListSize, DL, MVT::i32), 4274 8, false, false, false, MachinePointerInfo(DestSV), 4275 MachinePointerInfo(SrcSV)); 4276 } 4277 4278 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 4279 assert(Subtarget->isTargetDarwin() && 4280 "automatic va_arg instruction only works on Darwin"); 4281 4282 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4283 EVT VT = Op.getValueType(); 4284 SDLoc DL(Op); 4285 SDValue Chain = Op.getOperand(0); 4286 SDValue Addr = Op.getOperand(1); 4287 unsigned Align = Op.getConstantOperandVal(3); 4288 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4289 4290 SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), 4291 false, false, false, 0); 4292 Chain = VAList.getValue(1); 4293 4294 if (Align > 8) { 4295 assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); 4296 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4297 DAG.getConstant(Align - 1, DL, PtrVT)); 4298 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 4299 DAG.getConstant(-(int64_t)Align, DL, PtrVT)); 4300 } 4301 4302 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 4303 uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 4304 4305 // Scalar integer and FP values smaller than 64 bits are implicitly extended 4306 // up to 64 bits. At the very least, we have to increase the striding of the 4307 // vaargs list to match this, and for FP values we need to introduce 4308 // FP_ROUND nodes as well. 4309 if (VT.isInteger() && !VT.isVector()) 4310 ArgSize = 8; 4311 bool NeedFPTrunc = false; 4312 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 4313 ArgSize = 8; 4314 NeedFPTrunc = true; 4315 } 4316 4317 // Increment the pointer, VAList, to the next vaarg 4318 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4319 DAG.getConstant(ArgSize, DL, PtrVT)); 4320 // Store the incremented VAList to the legalized pointer 4321 SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), 4322 false, false, 0); 4323 4324 // Load the actual argument out of the pointer VAList 4325 if (NeedFPTrunc) { 4326 // Load the value as an f64. 4327 SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, 4328 MachinePointerInfo(), false, false, false, 0); 4329 // Round the value down to an f32. 4330 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 4331 DAG.getIntPtrConstant(1, DL)); 4332 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 4333 // Merge the rounded value with the chain output of the load. 
4334 return DAG.getMergeValues(Ops, DL); 4335 } 4336 4337 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, 4338 false, false, 0); 4339 } 4340 4341 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 4342 SelectionDAG &DAG) const { 4343 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4344 MFI->setFrameAddressIsTaken(true); 4345 4346 EVT VT = Op.getValueType(); 4347 SDLoc DL(Op); 4348 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4349 SDValue FrameAddr = 4350 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 4351 while (Depth--) 4352 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 4353 MachinePointerInfo(), false, false, false, 0); 4354 return FrameAddr; 4355 } 4356 4357 // FIXME? Maybe this could be a TableGen attribute on some registers and 4358 // this table could be generated automatically from RegInfo. 4359 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, 4360 SelectionDAG &DAG) const { 4361 unsigned Reg = StringSwitch<unsigned>(RegName) 4362 .Case("sp", AArch64::SP) 4363 .Default(0); 4364 if (Reg) 4365 return Reg; 4366 report_fatal_error(Twine("Invalid register name \"" 4367 + StringRef(RegName) + "\".")); 4368 } 4369 4370 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 4371 SelectionDAG &DAG) const { 4372 MachineFunction &MF = DAG.getMachineFunction(); 4373 MachineFrameInfo *MFI = MF.getFrameInfo(); 4374 MFI->setReturnAddressIsTaken(true); 4375 4376 EVT VT = Op.getValueType(); 4377 SDLoc DL(Op); 4378 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4379 if (Depth) { 4380 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 4381 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 4382 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 4383 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), 4384 MachinePointerInfo(), false, false, false, 0); 4385 } 4386 4387 // Return LR, which contains the return address. Mark it an implicit live-in. 4388 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); 4389 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 4390 } 4391 4392 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 4393 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 4394 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, 4395 SelectionDAG &DAG) const { 4396 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4397 EVT VT = Op.getValueType(); 4398 unsigned VTBits = VT.getSizeInBits(); 4399 SDLoc dl(Op); 4400 SDValue ShOpLo = Op.getOperand(0); 4401 SDValue ShOpHi = Op.getOperand(1); 4402 SDValue ShAmt = Op.getOperand(2); 4403 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 4404 4405 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 4406 4407 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 4408 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 4409 SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 4410 4411 // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which 4412 // is "undef". We wanted 0, so CSEL it directly. 
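// In pseudo-code, the value built below is roughly:
//   HiBitsForLo = (ShAmt == 0) ? 0 : (ShOpHi << (64 - ShAmt));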
4413 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), 4414 ISD::SETEQ, dl, DAG); 4415 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); 4416 HiBitsForLo = 4417 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), 4418 HiBitsForLo, CCVal, Cmp); 4419 4420 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 4421 DAG.getConstant(VTBits, dl, MVT::i64)); 4422 4423 SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 4424 SDValue LoForNormalShift = 4425 DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); 4426 4427 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, 4428 dl, DAG); 4429 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); 4430 SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 4431 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 4432 LoForNormalShift, CCVal, Cmp); 4433 4434 // AArch64 shifts larger than the register width are wrapped rather than 4435 // clamped, so we can't just emit "hi >> x". 4436 SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 4437 SDValue HiForBigShift = 4438 Opc == ISD::SRA 4439 ? DAG.getNode(Opc, dl, VT, ShOpHi, 4440 DAG.getConstant(VTBits - 1, dl, MVT::i64)) 4441 : DAG.getConstant(0, dl, VT); 4442 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, 4443 HiForNormalShift, CCVal, Cmp); 4444 4445 SDValue Ops[2] = { Lo, Hi }; 4446 return DAG.getMergeValues(Ops, dl); 4447 } 4448 4449 4450 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 4451 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 4452 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, 4453 SelectionDAG &DAG) const { 4454 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4455 EVT VT = Op.getValueType(); 4456 unsigned VTBits = VT.getSizeInBits(); 4457 SDLoc dl(Op); 4458 SDValue ShOpLo = Op.getOperand(0); 4459 SDValue ShOpHi = Op.getOperand(1); 4460 SDValue ShAmt = Op.getOperand(2); 4461 4462 assert(Op.getOpcode() == ISD::SHL_PARTS); 4463 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 4464 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 4465 SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 4466 4467 // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which 4468 // is "undef". We wanted 0, so CSEL it directly. 4469 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), 4470 ISD::SETEQ, dl, DAG); 4471 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); 4472 LoBitsForHi = 4473 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), 4474 LoBitsForHi, CCVal, Cmp); 4475 4476 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 4477 DAG.getConstant(VTBits, dl, MVT::i64)); 4478 SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 4479 SDValue HiForNormalShift = 4480 DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); 4481 4482 SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 4483 4484 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, 4485 dl, DAG); 4486 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); 4487 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, 4488 HiForNormalShift, CCVal, Cmp); 4489 4490 // AArch64 shifts of larger than register sizes are wrapped rather than 4491 // clamped, so we can't just emit "lo << a" if a is too big. 
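// For shift amounts of 64 or more the low half of the result is simply zero;
// the CSEL below picks between that and the ordinary "lo << ShAmt" using the
// GE comparison on (ShAmt - 64) computed above.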
4492 SDValue LoForBigShift = DAG.getConstant(0, dl, VT); 4493 SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4494 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 4495 LoForNormalShift, CCVal, Cmp); 4496 4497 SDValue Ops[2] = { Lo, Hi }; 4498 return DAG.getMergeValues(Ops, dl); 4499 } 4500 4501 bool AArch64TargetLowering::isOffsetFoldingLegal( 4502 const GlobalAddressSDNode *GA) const { 4503 // The AArch64 target doesn't support folding offsets into global addresses. 4504 return false; 4505 } 4506 4507 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 4508 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. 4509 // FIXME: We should be able to handle f128 as well with a clever lowering. 4510 if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) 4511 return true; 4512 4513 if (VT == MVT::f64) 4514 return AArch64_AM::getFP64Imm(Imm) != -1; 4515 else if (VT == MVT::f32) 4516 return AArch64_AM::getFP32Imm(Imm) != -1; 4517 return false; 4518 } 4519 4520 //===----------------------------------------------------------------------===// 4521 // AArch64 Optimization Hooks 4522 //===----------------------------------------------------------------------===// 4523 4524 //===----------------------------------------------------------------------===// 4525 // AArch64 Inline Assembly Support 4526 //===----------------------------------------------------------------------===// 4527 4528 // Table of Constraints 4529 // TODO: This is the current set of constraints supported by ARM for the 4530 // compiler, not all of them may make sense, e.g. S may be difficult to support. 4531 // 4532 // r - A general register 4533 // w - An FP/SIMD register of some size in the range v0-v31 4534 // x - An FP/SIMD register of some size in the range v0-v15 4535 // I - Constant that can be used with an ADD instruction 4536 // J - Constant that can be used with a SUB instruction 4537 // K - Constant that can be used with a 32-bit logical instruction 4538 // L - Constant that can be used with a 64-bit logical instruction 4539 // M - Constant that can be used as a 32-bit MOV immediate 4540 // N - Constant that can be used as a 64-bit MOV immediate 4541 // Q - A memory reference with base register and no offset 4542 // S - A symbolic address 4543 // Y - Floating point constant zero 4544 // Z - Integer constant zero 4545 // 4546 // Note that general register operands will be output using their 64-bit x 4547 // register name, whatever the size of the variable, unless the asm operand 4548 // is prefixed by the %w modifier. Floating-point and SIMD register operands 4549 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 4550 // %q modifier. 4551 4552 /// getConstraintType - Given a constraint letter, return the type of 4553 /// constraint it is for this target. 4554 AArch64TargetLowering::ConstraintType 4555 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 4556 if (Constraint.size() == 1) { 4557 switch (Constraint[0]) { 4558 default: 4559 break; 4560 case 'z': 4561 return C_Other; 4562 case 'x': 4563 case 'w': 4564 return C_RegisterClass; 4565 // An address with a single base register. Due to the way we 4566 // currently handle addresses it is the same as 'r'. 4567 case 'Q': 4568 return C_Memory; 4569 } 4570 } 4571 return TargetLowering::getConstraintType(Constraint); 4572 } 4573 4574 /// Examine constraint type and operand type and determine a weight value. 
4575 /// This object must already have been set up with the operand type 4576 /// and the current alternative constraint selected. 4577 TargetLowering::ConstraintWeight 4578 AArch64TargetLowering::getSingleConstraintMatchWeight( 4579 AsmOperandInfo &info, const char *constraint) const { 4580 ConstraintWeight weight = CW_Invalid; 4581 Value *CallOperandVal = info.CallOperandVal; 4582 // If we don't have a value, we can't do a match, 4583 // but allow it at the lowest weight. 4584 if (!CallOperandVal) 4585 return CW_Default; 4586 Type *type = CallOperandVal->getType(); 4587 // Look at the constraint type. 4588 switch (*constraint) { 4589 default: 4590 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 4591 break; 4592 case 'x': 4593 case 'w': 4594 if (type->isFloatingPointTy() || type->isVectorTy()) 4595 weight = CW_Register; 4596 break; 4597 case 'z': 4598 weight = CW_Constant; 4599 break; 4600 } 4601 return weight; 4602 } 4603 4604 std::pair<unsigned, const TargetRegisterClass *> 4605 AArch64TargetLowering::getRegForInlineAsmConstraint( 4606 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 4607 if (Constraint.size() == 1) { 4608 switch (Constraint[0]) { 4609 case 'r': 4610 if (VT.getSizeInBits() == 64) 4611 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 4612 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 4613 case 'w': 4614 if (VT == MVT::f32) 4615 return std::make_pair(0U, &AArch64::FPR32RegClass); 4616 if (VT.getSizeInBits() == 64) 4617 return std::make_pair(0U, &AArch64::FPR64RegClass); 4618 if (VT.getSizeInBits() == 128) 4619 return std::make_pair(0U, &AArch64::FPR128RegClass); 4620 break; 4621 // The instructions that this constraint is designed for can 4622 // only take 128-bit registers so just use that regclass. 4623 case 'x': 4624 if (VT.getSizeInBits() == 128) 4625 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 4626 break; 4627 } 4628 } 4629 if (StringRef("{cc}").equals_lower(Constraint)) 4630 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 4631 4632 // Use the default implementation in TargetLowering to convert the register 4633 // constraint into a member of a register class. 4634 std::pair<unsigned, const TargetRegisterClass *> Res; 4635 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4636 4637 // Not found as a standard register? 4638 if (!Res.second) { 4639 unsigned Size = Constraint.size(); 4640 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 4641 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 4642 int RegNo; 4643 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); 4644 if (!Failed && RegNo >= 0 && RegNo <= 31) { 4645 // v0 - v31 are aliases of q0 - q31. 4646 // By default we'll emit v0-v31 for this unless there's a modifier where 4647 // we'll emit the correct register as well. 4648 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 4649 Res.second = &AArch64::FPR128RegClass; 4650 } 4651 } 4652 } 4653 4654 return Res; 4655 } 4656 4657 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 4658 /// vector. If it is invalid, don't add anything to Ops. 4659 void AArch64TargetLowering::LowerAsmOperandForConstraint( 4660 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 4661 SelectionDAG &DAG) const { 4662 SDValue Result; 4663 4664 // Currently only support length 1 constraints. 
4665 if (Constraint.length() != 1) 4666 return; 4667 4668 char ConstraintLetter = Constraint[0]; 4669 switch (ConstraintLetter) { 4670 default: 4671 break; 4672 4673 // This set of constraints deal with valid constants for various instructions. 4674 // Validate and return a target constant for them if we can. 4675 case 'z': { 4676 // 'z' maps to xzr or wzr so it needs an input of 0. 4677 if (!isNullConstant(Op)) 4678 return; 4679 4680 if (Op.getValueType() == MVT::i64) 4681 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 4682 else 4683 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 4684 break; 4685 } 4686 4687 case 'I': 4688 case 'J': 4689 case 'K': 4690 case 'L': 4691 case 'M': 4692 case 'N': 4693 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4694 if (!C) 4695 return; 4696 4697 // Grab the value and do some validation. 4698 uint64_t CVal = C->getZExtValue(); 4699 switch (ConstraintLetter) { 4700 // The I constraint applies only to simple ADD or SUB immediate operands: 4701 // i.e. 0 to 4095 with optional shift by 12 4702 // The J constraint applies only to ADD or SUB immediates that would be 4703 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 4704 // instruction [or vice versa], in other words -1 to -4095 with optional 4705 // left shift by 12. 4706 case 'I': 4707 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 4708 break; 4709 return; 4710 case 'J': { 4711 uint64_t NVal = -C->getSExtValue(); 4712 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 4713 CVal = C->getSExtValue(); 4714 break; 4715 } 4716 return; 4717 } 4718 // The K and L constraints apply *only* to logical immediates, including 4719 // what used to be the MOVI alias for ORR (though the MOVI alias has now 4720 // been removed and MOV should be used). So these constraints have to 4721 // distinguish between bit patterns that are valid 32-bit or 64-bit 4722 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 4723 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 4724 // versa. 4725 case 'K': 4726 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4727 break; 4728 return; 4729 case 'L': 4730 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4731 break; 4732 return; 4733 // The M and N constraints are a superset of K and L respectively, for use 4734 // with the MOV (immediate) alias. As well as the logical immediates they 4735 // also match 32 or 64-bit immediates that can be loaded either using a 4736 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 4737 // (M) or 64-bit 0x1234000000000000 (N) etc. 4738 // As a note some of this code is liberally stolen from the asm parser. 
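// For instance (illustrative values only): 0x0000aaaa satisfies 'M' because a
// single 32-bit MOVZ can materialize it, whereas 0x00012345 would need a
// MOVZ+MOVK pair and is rejected by the checks below.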
4739 case 'M': { 4740 if (!isUInt<32>(CVal)) 4741 return; 4742 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4743 break; 4744 if ((CVal & 0xFFFF) == CVal) 4745 break; 4746 if ((CVal & 0xFFFF0000ULL) == CVal) 4747 break; 4748 uint64_t NCVal = ~(uint32_t)CVal; 4749 if ((NCVal & 0xFFFFULL) == NCVal) 4750 break; 4751 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4752 break; 4753 return; 4754 } 4755 case 'N': { 4756 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4757 break; 4758 if ((CVal & 0xFFFFULL) == CVal) 4759 break; 4760 if ((CVal & 0xFFFF0000ULL) == CVal) 4761 break; 4762 if ((CVal & 0xFFFF00000000ULL) == CVal) 4763 break; 4764 if ((CVal & 0xFFFF000000000000ULL) == CVal) 4765 break; 4766 uint64_t NCVal = ~CVal; 4767 if ((NCVal & 0xFFFFULL) == NCVal) 4768 break; 4769 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4770 break; 4771 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 4772 break; 4773 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 4774 break; 4775 return; 4776 } 4777 default: 4778 return; 4779 } 4780 4781 // All assembler immediates are 64-bit integers. 4782 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); 4783 break; 4784 } 4785 4786 if (Result.getNode()) { 4787 Ops.push_back(Result); 4788 return; 4789 } 4790 4791 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 4792 } 4793 4794 //===----------------------------------------------------------------------===// 4795 // AArch64 Advanced SIMD Support 4796 //===----------------------------------------------------------------------===// 4797 4798 /// WidenVector - Given a value in the V64 register class, produce the 4799 /// equivalent value in the V128 register class. 4800 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 4801 EVT VT = V64Reg.getValueType(); 4802 unsigned NarrowSize = VT.getVectorNumElements(); 4803 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4804 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 4805 SDLoc DL(V64Reg); 4806 4807 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 4808 V64Reg, DAG.getConstant(0, DL, MVT::i32)); 4809 } 4810 4811 /// getExtFactor - Determine the adjustment factor for the position when 4812 /// generating an "extract from vector registers" instruction. 4813 static unsigned getExtFactor(SDValue &V) { 4814 EVT EltType = V.getValueType().getVectorElementType(); 4815 return EltType.getSizeInBits() / 8; 4816 } 4817 4818 /// NarrowVector - Given a value in the V128 register class, produce the 4819 /// equivalent value in the V64 register class. 4820 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 4821 EVT VT = V128Reg.getValueType(); 4822 unsigned WideSize = VT.getVectorNumElements(); 4823 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4824 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 4825 SDLoc DL(V128Reg); 4826 4827 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 4828 } 4829 4830 // Gather data to see if the operation can be modelled as a 4831 // shuffle in combination with VEXTs. 
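// Illustrative example: a BUILD_VECTOR whose operands are EXTRACT_VECTOR_ELTs
// drawn from at most two source vectors, e.g.
//   (build_vector (extract A, 2), (extract A, 3), (extract B, 0), (extract B, 1))
// can be re-expressed as a vector_shuffle of A and B and lowered through the
// normal shuffle path.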
4832 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 4833 SelectionDAG &DAG) const { 4834 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 4835 SDLoc dl(Op); 4836 EVT VT = Op.getValueType(); 4837 unsigned NumElts = VT.getVectorNumElements(); 4838 4839 struct ShuffleSourceInfo { 4840 SDValue Vec; 4841 unsigned MinElt; 4842 unsigned MaxElt; 4843 4844 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 4845 // be compatible with the shuffle we intend to construct. As a result 4846 // ShuffleVec will be some sliding window into the original Vec. 4847 SDValue ShuffleVec; 4848 4849 // Code should guarantee that element i in Vec starts at element "WindowBase 4850 // + i * WindowScale in ShuffleVec". 4851 int WindowBase; 4852 int WindowScale; 4853 4854 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 4855 ShuffleSourceInfo(SDValue Vec) 4856 : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), 4857 WindowScale(1) {} 4858 }; 4859 4860 // First gather all vectors used as an immediate source for this BUILD_VECTOR 4861 // node. 4862 SmallVector<ShuffleSourceInfo, 2> Sources; 4863 for (unsigned i = 0; i < NumElts; ++i) { 4864 SDValue V = Op.getOperand(i); 4865 if (V.getOpcode() == ISD::UNDEF) 4866 continue; 4867 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 4868 // A shuffle can only come from building a vector from various 4869 // elements of other vectors. 4870 return SDValue(); 4871 } 4872 4873 // Add this element source to the list if it's not already there. 4874 SDValue SourceVec = V.getOperand(0); 4875 auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); 4876 if (Source == Sources.end()) 4877 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 4878 4879 // Update the minimum and maximum lane number seen. 4880 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 4881 Source->MinElt = std::min(Source->MinElt, EltNo); 4882 Source->MaxElt = std::max(Source->MaxElt, EltNo); 4883 } 4884 4885 // Currently only do something sane when at most two source vectors 4886 // are involved. 4887 if (Sources.size() > 2) 4888 return SDValue(); 4889 4890 // Find out the smallest element size among result and two sources, and use 4891 // it as element size to build the shuffle_vector. 4892 EVT SmallestEltTy = VT.getVectorElementType(); 4893 for (auto &Source : Sources) { 4894 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 4895 if (SrcEltTy.bitsLT(SmallestEltTy)) { 4896 SmallestEltTy = SrcEltTy; 4897 } 4898 } 4899 unsigned ResMultiplier = 4900 VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); 4901 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 4902 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 4903 4904 // If the source vector is too wide or too narrow, we may nevertheless be able 4905 // to construct a compatible shuffle either by concatenating it with UNDEF or 4906 // extracting a suitable range of elements. 4907 for (auto &Src : Sources) { 4908 EVT SrcVT = Src.ShuffleVec.getValueType(); 4909 4910 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 4911 continue; 4912 4913 // This stage of the search produces a source with the same element type as 4914 // the original, but with a total width matching the BUILD_VECTOR output. 
4915 EVT EltVT = SrcVT.getVectorElementType(); 4916 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 4917 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 4918 4919 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 4920 assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); 4921 // We can pad out the smaller vector for free, so if it's part of a 4922 // shuffle... 4923 Src.ShuffleVec = 4924 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 4925 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 4926 continue; 4927 } 4928 4929 assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); 4930 4931 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 4932 // Span too large for a VEXT to cope 4933 return SDValue(); 4934 } 4935 4936 if (Src.MinElt >= NumSrcElts) { 4937 // The extraction can just take the second half 4938 Src.ShuffleVec = 4939 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4940 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 4941 Src.WindowBase = -NumSrcElts; 4942 } else if (Src.MaxElt < NumSrcElts) { 4943 // The extraction can just take the first half 4944 Src.ShuffleVec = 4945 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4946 DAG.getConstant(0, dl, MVT::i64)); 4947 } else { 4948 // An actual VEXT is needed 4949 SDValue VEXTSrc1 = 4950 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4951 DAG.getConstant(0, dl, MVT::i64)); 4952 SDValue VEXTSrc2 = 4953 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4954 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 4955 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 4956 4957 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 4958 VEXTSrc2, 4959 DAG.getConstant(Imm, dl, MVT::i32)); 4960 Src.WindowBase = -Src.MinElt; 4961 } 4962 } 4963 4964 // Another possible incompatibility occurs from the vector element types. We 4965 // can fix this by bitcasting the source vectors to the same type we intend 4966 // for the shuffle. 4967 for (auto &Src : Sources) { 4968 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 4969 if (SrcEltTy == SmallestEltTy) 4970 continue; 4971 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 4972 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 4973 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 4974 Src.WindowBase *= Src.WindowScale; 4975 } 4976 4977 // Final sanity check before we try to actually produce a shuffle. 4978 DEBUG( 4979 for (auto Src : Sources) 4980 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 4981 ); 4982 4983 // The stars all align, our next step is to produce the mask for the shuffle. 4984 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 4985 int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); 4986 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 4987 SDValue Entry = Op.getOperand(i); 4988 if (Entry.getOpcode() == ISD::UNDEF) 4989 continue; 4990 4991 auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); 4992 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 4993 4994 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 4995 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 4996 // segment. 
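// For example, a lane pulled out of a v2i64 source but built into a v4i32
// result only defines 32 of the 64 bits its window covers in the shuffle mask.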
4997 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
4998 int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
4999 VT.getVectorElementType().getSizeInBits());
5000 int LanesDefined = BitsDefined / BitsPerShuffleLane;
5001 
5002 // This source is expected to fill ResMultiplier lanes of the final shuffle,
5003 // starting at the appropriate offset.
5004 int *LaneMask = &Mask[i * ResMultiplier];
5005 
5006 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
5007 ExtractBase += NumElts * (Src - Sources.begin());
5008 for (int j = 0; j < LanesDefined; ++j)
5009 LaneMask[j] = ExtractBase + j;
5010 }
5011 
5012 // Final check before we try to produce nonsense...
5013 if (!isShuffleMaskLegal(Mask, ShuffleVT))
5014 return SDValue();
5015 
5016 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
5017 for (unsigned i = 0; i < Sources.size(); ++i)
5018 ShuffleOps[i] = Sources[i].ShuffleVec;
5019 
5020 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
5021 ShuffleOps[1], &Mask[0]);
5022 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
5023 }
5024 
5025 // Check if an EXT instruction can handle the shuffle mask when the
5026 // vector sources of the shuffle are the same.
5027 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5028 unsigned NumElts = VT.getVectorNumElements();
5029 
5030 // Assume that the first shuffle index is not UNDEF. Fail if it is.
5031 if (M[0] < 0)
5032 return false;
5033 
5034 Imm = M[0];
5035 
5036 // If this is a VEXT shuffle, the immediate value is the index of the first
5037 // element. The other shuffle indices must be the successive elements after
5038 // the first one.
5039 unsigned ExpectedElt = Imm;
5040 for (unsigned i = 1; i < NumElts; ++i) {
5041 // Increment the expected index. If it wraps around, just follow it
5042 // back to index zero and keep going.
5043 ++ExpectedElt;
5044 if (ExpectedElt == NumElts)
5045 ExpectedElt = 0;
5046 
5047 if (M[i] < 0)
5048 continue; // ignore UNDEF indices
5049 if (ExpectedElt != static_cast<unsigned>(M[i]))
5050 return false;
5051 }
5052 
5053 return true;
5054 }
5055 
5056 // Check if an EXT instruction can handle the shuffle mask when the
5057 // vector sources of the shuffle are different.
5058 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
5059 unsigned &Imm) {
5060 // Look for the first non-undef element.
5061 const int *FirstRealElt = std::find_if(M.begin(), M.end(),
5062 [](int Elt) {return Elt >= 0;});
5063 
5064 // Benefit from APInt to handle overflow when calculating the expected element.
5065 unsigned NumElts = VT.getVectorNumElements();
5066 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
5067 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
5068 // The following shuffle indices must be the successive elements after the
5069 // first real element.
5070 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
5071 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
5072 if (FirstWrongElt != M.end())
5073 return false;
5074 
5075 // The index of an EXT is the first element if it is not UNDEF.
5076 // Watch out for the beginning UNDEFs. The EXT index should be the expected
5077 // value of the first element. E.g.
5078 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
5079 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
5080 // ExpectedElt is the last mask index plus 1.
5081 Imm = ExpectedElt.getZExtValue();
5082 
5083 // There are two different cases that require reversing the input vectors.
5084 // For example, for the vector <4 x i32> we have the following cases:
5085 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
5086 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
5087 // In both cases we end up with the mask <5, 6, 7, 0>, which requires the two
5088 // input vectors to be reversed.
5089 if (Imm < NumElts)
5090 ReverseEXT = true;
5091 else
5092 Imm -= NumElts;
5093 
5094 return true;
5095 }
5096 
5097 /// isREVMask - Check if a vector shuffle corresponds to a REV
5098 /// instruction with the specified blocksize. (The order of the elements
5099 /// within each block of the vector is reversed.)
5100 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
5101 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
5102 "Only possible block sizes for REV are: 16, 32, 64");
5103 
5104 unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5105 if (EltSz == 64)
5106 return false;
5107 
5108 unsigned NumElts = VT.getVectorNumElements();
5109 unsigned BlockElts = M[0] + 1;
5110 // If the first shuffle index is UNDEF, be optimistic.
5111 if (M[0] < 0)
5112 BlockElts = BlockSize / EltSz;
5113 
5114 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5115 return false;
5116 
5117 for (unsigned i = 0; i < NumElts; ++i) {
5118 if (M[i] < 0)
5119 continue; // ignore UNDEF indices
5120 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
5121 return false;
5122 }
5123 
5124 return true;
5125 }
5126 
5127 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5128 unsigned NumElts = VT.getVectorNumElements();
5129 WhichResult = (M[0] == 0 ? 0 : 1);
5130 unsigned Idx = WhichResult * NumElts / 2;
5131 for (unsigned i = 0; i != NumElts; i += 2) {
5132 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
5133 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
5134 return false;
5135 Idx += 1;
5136 }
5137 
5138 return true;
5139 }
5140 
5141 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5142 unsigned NumElts = VT.getVectorNumElements();
5143 WhichResult = (M[0] == 0 ? 0 : 1);
5144 for (unsigned i = 0; i != NumElts; ++i) {
5145 if (M[i] < 0)
5146 continue; // ignore UNDEF indices
5147 if ((unsigned)M[i] != 2 * i + WhichResult)
5148 return false;
5149 }
5150 
5151 return true;
5152 }
5153 
5154 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5155 unsigned NumElts = VT.getVectorNumElements();
5156 WhichResult = (M[0] == 0 ? 0 : 1);
5157 for (unsigned i = 0; i < NumElts; i += 2) {
5158 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
5159 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
5160 return false;
5161 }
5162 return true;
5163 }
5164 
5165 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
5166 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5167 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
5168 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5169 unsigned NumElts = VT.getVectorNumElements();
5170 WhichResult = (M[0] == 0 ?
0 : 1); 5171 unsigned Idx = WhichResult * NumElts / 2; 5172 for (unsigned i = 0; i != NumElts; i += 2) { 5173 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 5174 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 5175 return false; 5176 Idx += 1; 5177 } 5178 5179 return true; 5180 } 5181 5182 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 5183 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5184 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 5185 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5186 unsigned Half = VT.getVectorNumElements() / 2; 5187 WhichResult = (M[0] == 0 ? 0 : 1); 5188 for (unsigned j = 0; j != 2; ++j) { 5189 unsigned Idx = WhichResult; 5190 for (unsigned i = 0; i != Half; ++i) { 5191 int MIdx = M[i + j * Half]; 5192 if (MIdx >= 0 && (unsigned)MIdx != Idx) 5193 return false; 5194 Idx += 2; 5195 } 5196 } 5197 5198 return true; 5199 } 5200 5201 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 5202 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5203 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 5204 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5205 unsigned NumElts = VT.getVectorNumElements(); 5206 WhichResult = (M[0] == 0 ? 0 : 1); 5207 for (unsigned i = 0; i < NumElts; i += 2) { 5208 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 5209 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 5210 return false; 5211 } 5212 return true; 5213 } 5214 5215 static bool isINSMask(ArrayRef<int> M, int NumInputElements, 5216 bool &DstIsLeft, int &Anomaly) { 5217 if (M.size() != static_cast<size_t>(NumInputElements)) 5218 return false; 5219 5220 int NumLHSMatch = 0, NumRHSMatch = 0; 5221 int LastLHSMismatch = -1, LastRHSMismatch = -1; 5222 5223 for (int i = 0; i < NumInputElements; ++i) { 5224 if (M[i] == -1) { 5225 ++NumLHSMatch; 5226 ++NumRHSMatch; 5227 continue; 5228 } 5229 5230 if (M[i] == i) 5231 ++NumLHSMatch; 5232 else 5233 LastLHSMismatch = i; 5234 5235 if (M[i] == i + NumInputElements) 5236 ++NumRHSMatch; 5237 else 5238 LastRHSMismatch = i; 5239 } 5240 5241 if (NumLHSMatch == NumInputElements - 1) { 5242 DstIsLeft = true; 5243 Anomaly = LastLHSMismatch; 5244 return true; 5245 } else if (NumRHSMatch == NumInputElements - 1) { 5246 DstIsLeft = false; 5247 Anomaly = LastRHSMismatch; 5248 return true; 5249 } 5250 5251 return false; 5252 } 5253 5254 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 5255 if (VT.getSizeInBits() != 128) 5256 return false; 5257 5258 unsigned NumElts = VT.getVectorNumElements(); 5259 5260 for (int I = 0, E = NumElts / 2; I != E; I++) { 5261 if (Mask[I] != I) 5262 return false; 5263 } 5264 5265 int Offset = NumElts / 2; 5266 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 5267 if (Mask[I] != I + SplitLHS * Offset) 5268 return false; 5269 } 5270 5271 return true; 5272 } 5273 5274 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 5275 SDLoc DL(Op); 5276 EVT VT = Op.getValueType(); 5277 SDValue V0 = Op.getOperand(0); 5278 SDValue V1 = Op.getOperand(1); 5279 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 5280 5281 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 5282 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 5283 return SDValue(); 5284 5285 bool SplitV0 = V0.getValueType().getSizeInBits() == 128; 5286 5287 if (!isConcatMask(Mask, VT, SplitV0)) 5288 return SDValue(); 5289 
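// The mask is a plain concatenation of two 64-bit halves, so extract those
// halves from any 128-bit inputs and rebuild the result as a single
// CONCAT_VECTORS below.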
5290 EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 5291 VT.getVectorNumElements() / 2); 5292 if (SplitV0) { 5293 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, 5294 DAG.getConstant(0, DL, MVT::i64)); 5295 } 5296 if (V1.getValueType().getSizeInBits() == 128) { 5297 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, 5298 DAG.getConstant(0, DL, MVT::i64)); 5299 } 5300 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); 5301 } 5302 5303 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 5304 /// the specified operations to build the shuffle. 5305 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 5306 SDValue RHS, SelectionDAG &DAG, 5307 SDLoc dl) { 5308 unsigned OpNum = (PFEntry >> 26) & 0x0F; 5309 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); 5310 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); 5311 5312 enum { 5313 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 5314 OP_VREV, 5315 OP_VDUP0, 5316 OP_VDUP1, 5317 OP_VDUP2, 5318 OP_VDUP3, 5319 OP_VEXT1, 5320 OP_VEXT2, 5321 OP_VEXT3, 5322 OP_VUZPL, // VUZP, left result 5323 OP_VUZPR, // VUZP, right result 5324 OP_VZIPL, // VZIP, left result 5325 OP_VZIPR, // VZIP, right result 5326 OP_VTRNL, // VTRN, left result 5327 OP_VTRNR // VTRN, right result 5328 }; 5329 5330 if (OpNum == OP_COPY) { 5331 if (LHSID == (1 * 9 + 2) * 9 + 3) 5332 return LHS; 5333 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); 5334 return RHS; 5335 } 5336 5337 SDValue OpLHS, OpRHS; 5338 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 5339 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 5340 EVT VT = OpLHS.getValueType(); 5341 5342 switch (OpNum) { 5343 default: 5344 llvm_unreachable("Unknown shuffle opcode!"); 5345 case OP_VREV: 5346 // VREV divides the vector in half and swaps within the half. 
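// e.g. for a v4i32 this turns lane order <0, 1, 2, 3> into <1, 0, 3, 2>, which
// is a REV64 with 32-bit elements.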
5347 if (VT.getVectorElementType() == MVT::i32 || 5348 VT.getVectorElementType() == MVT::f32) 5349 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); 5350 // vrev <4 x i16> -> REV32 5351 if (VT.getVectorElementType() == MVT::i16 || 5352 VT.getVectorElementType() == MVT::f16) 5353 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); 5354 // vrev <4 x i8> -> REV16 5355 assert(VT.getVectorElementType() == MVT::i8); 5356 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); 5357 case OP_VDUP0: 5358 case OP_VDUP1: 5359 case OP_VDUP2: 5360 case OP_VDUP3: { 5361 EVT EltTy = VT.getVectorElementType(); 5362 unsigned Opcode; 5363 if (EltTy == MVT::i8) 5364 Opcode = AArch64ISD::DUPLANE8; 5365 else if (EltTy == MVT::i16 || EltTy == MVT::f16) 5366 Opcode = AArch64ISD::DUPLANE16; 5367 else if (EltTy == MVT::i32 || EltTy == MVT::f32) 5368 Opcode = AArch64ISD::DUPLANE32; 5369 else if (EltTy == MVT::i64 || EltTy == MVT::f64) 5370 Opcode = AArch64ISD::DUPLANE64; 5371 else 5372 llvm_unreachable("Invalid vector element type?"); 5373 5374 if (VT.getSizeInBits() == 64) 5375 OpLHS = WidenVector(OpLHS, DAG); 5376 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); 5377 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); 5378 } 5379 case OP_VEXT1: 5380 case OP_VEXT2: 5381 case OP_VEXT3: { 5382 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); 5383 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, 5384 DAG.getConstant(Imm, dl, MVT::i32)); 5385 } 5386 case OP_VUZPL: 5387 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, 5388 OpRHS); 5389 case OP_VUZPR: 5390 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, 5391 OpRHS); 5392 case OP_VZIPL: 5393 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, 5394 OpRHS); 5395 case OP_VZIPR: 5396 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 5397 OpRHS); 5398 case OP_VTRNL: 5399 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 5400 OpRHS); 5401 case OP_VTRNR: 5402 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 5403 OpRHS); 5404 } 5405 } 5406 5407 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, 5408 SelectionDAG &DAG) { 5409 // Check to see if we can use the TBL instruction. 
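// TBL is a byte-wise table lookup, so each shuffle-mask element is expanded
// below into BytesPerElt consecutive byte indices into the (possibly
// concatenated) source registers.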
5410 SDValue V1 = Op.getOperand(0); 5411 SDValue V2 = Op.getOperand(1); 5412 SDLoc DL(Op); 5413 5414 EVT EltVT = Op.getValueType().getVectorElementType(); 5415 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 5416 5417 SmallVector<SDValue, 8> TBLMask; 5418 for (int Val : ShuffleMask) { 5419 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5420 unsigned Offset = Byte + Val * BytesPerElt; 5421 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); 5422 } 5423 } 5424 5425 MVT IndexVT = MVT::v8i8; 5426 unsigned IndexLen = 8; 5427 if (Op.getValueType().getSizeInBits() == 128) { 5428 IndexVT = MVT::v16i8; 5429 IndexLen = 16; 5430 } 5431 5432 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 5433 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 5434 5435 SDValue Shuffle; 5436 if (V2.getNode()->getOpcode() == ISD::UNDEF) { 5437 if (IndexLen == 8) 5438 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 5439 Shuffle = DAG.getNode( 5440 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5441 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 5442 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5443 makeArrayRef(TBLMask.data(), IndexLen))); 5444 } else { 5445 if (IndexLen == 8) { 5446 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 5447 Shuffle = DAG.getNode( 5448 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5449 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 5450 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5451 makeArrayRef(TBLMask.data(), IndexLen))); 5452 } else { 5453 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 5454 // cannot currently represent the register constraints on the input 5455 // table registers. 5456 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 5457 // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5458 // &TBLMask[0], IndexLen)); 5459 Shuffle = DAG.getNode( 5460 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5461 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), 5462 V1Cst, V2Cst, 5463 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5464 makeArrayRef(TBLMask.data(), IndexLen))); 5465 } 5466 } 5467 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 5468 } 5469 5470 static unsigned getDUPLANEOp(EVT EltType) { 5471 if (EltType == MVT::i8) 5472 return AArch64ISD::DUPLANE8; 5473 if (EltType == MVT::i16 || EltType == MVT::f16) 5474 return AArch64ISD::DUPLANE16; 5475 if (EltType == MVT::i32 || EltType == MVT::f32) 5476 return AArch64ISD::DUPLANE32; 5477 if (EltType == MVT::i64 || EltType == MVT::f64) 5478 return AArch64ISD::DUPLANE64; 5479 5480 llvm_unreachable("Invalid vector element type?"); 5481 } 5482 5483 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 5484 SelectionDAG &DAG) const { 5485 SDLoc dl(Op); 5486 EVT VT = Op.getValueType(); 5487 5488 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 5489 5490 // Convert shuffles that are directly supported on NEON to target-specific 5491 // DAG nodes, instead of keeping them as shuffles and matching them again 5492 // during code selection. This is more efficient and avoids the possibility 5493 // of inconsistencies between legalization and selection. 
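// The matchers below are tried roughly in order of preference: splats
// (DUP/DUPLANE), REV64/REV32/REV16, EXT, ZIP/UZP/TRN (including their
// single-input forms), plain concatenation, INS, the perfect-shuffle table for
// 4-element vectors, and finally a generic TBL lookup.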
5494 ArrayRef<int> ShuffleMask = SVN->getMask();
5495 
5496 SDValue V1 = Op.getOperand(0);
5497 SDValue V2 = Op.getOperand(1);
5498 
5499 if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
5500 V1.getValueType().getSimpleVT())) {
5501 int Lane = SVN->getSplatIndex();
5502 // If this is an undef splat, generate it via "just" vdup, if possible.
5503 if (Lane == -1)
5504 Lane = 0;
5505 
5506 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
5507 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
5508 V1.getOperand(0));
5509 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
5510 // constant. If so, we can just reference the lane's definition directly.
5511 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
5512 !isa<ConstantSDNode>(V1.getOperand(Lane)))
5513 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
5514 
5515 // Otherwise, duplicate from the lane of the input vector.
5516 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
5517 
5518 // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
5519 // to make a vector of the same size as this SHUFFLE. We can ignore the
5520 // extract entirely, and canonicalise the concat using WidenVector.
5521 if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5522 Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
5523 V1 = V1.getOperand(0);
5524 } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
5525 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
5526 Lane -= Idx * VT.getVectorNumElements() / 2;
5527 V1 = WidenVector(V1.getOperand(Idx), DAG);
5528 } else if (VT.getSizeInBits() == 64)
5529 V1 = WidenVector(V1, DAG);
5530 
5531 return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
5532 }
5533 
5534 if (isREVMask(ShuffleMask, VT, 64))
5535 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
5536 if (isREVMask(ShuffleMask, VT, 32))
5537 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
5538 if (isREVMask(ShuffleMask, VT, 16))
5539 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
5540 
5541 bool ReverseEXT = false;
5542 unsigned Imm;
5543 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
5544 if (ReverseEXT)
5545 std::swap(V1, V2);
5546 Imm *= getExtFactor(V1);
5547 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
5548 DAG.getConstant(Imm, dl, MVT::i32));
5549 } else if (V2->getOpcode() == ISD::UNDEF &&
5550 isSingletonEXTMask(ShuffleMask, VT, Imm)) {
5551 Imm *= getExtFactor(V1);
5552 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
5553 DAG.getConstant(Imm, dl, MVT::i32));
5554 }
5555 
5556 unsigned WhichResult;
5557 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
5558 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
5559 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
5560 }
5561 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
5562 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
5563 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
5564 }
5565 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
5566 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
5567 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
5568 }
5569 
5570 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
5571 unsigned Opc = (WhichResult == 0) ?
AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 5572 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5573 } 5574 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 5575 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 5576 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5577 } 5578 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 5579 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 5580 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5581 } 5582 5583 SDValue Concat = tryFormConcatFromShuffle(Op, DAG); 5584 if (Concat.getNode()) 5585 return Concat; 5586 5587 bool DstIsLeft; 5588 int Anomaly; 5589 int NumInputElements = V1.getValueType().getVectorNumElements(); 5590 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { 5591 SDValue DstVec = DstIsLeft ? V1 : V2; 5592 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); 5593 5594 SDValue SrcVec = V1; 5595 int SrcLane = ShuffleMask[Anomaly]; 5596 if (SrcLane >= NumInputElements) { 5597 SrcVec = V2; 5598 SrcLane -= VT.getVectorNumElements(); 5599 } 5600 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); 5601 5602 EVT ScalarVT = VT.getVectorElementType(); 5603 5604 if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) 5605 ScalarVT = MVT::i32; 5606 5607 return DAG.getNode( 5608 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 5609 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), 5610 DstLaneV); 5611 } 5612 5613 // If the shuffle is not directly supported and it has 4 elements, use 5614 // the PerfectShuffle-generated table to synthesize it from other shuffles. 5615 unsigned NumElts = VT.getVectorNumElements(); 5616 if (NumElts == 4) { 5617 unsigned PFIndexes[4]; 5618 for (unsigned i = 0; i != 4; ++i) { 5619 if (ShuffleMask[i] < 0) 5620 PFIndexes[i] = 8; 5621 else 5622 PFIndexes[i] = ShuffleMask[i]; 5623 } 5624 5625 // Compute the index in the perfect shuffle table. 
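// Each of the four mask entries is a value in [0, 8] (8 meaning undef), so the
// table is indexed base 9; the top bits of each entry give the cost of the
// best expansion and the remaining bits describe how to build it.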
5626 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 5627 PFIndexes[2] * 9 + PFIndexes[3]; 5628 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 5629 unsigned Cost = (PFEntry >> 30); 5630 5631 if (Cost <= 4) 5632 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 5633 } 5634 5635 return GenerateTBL(Op, ShuffleMask, DAG); 5636 } 5637 5638 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, 5639 APInt &UndefBits) { 5640 EVT VT = BVN->getValueType(0); 5641 APInt SplatBits, SplatUndef; 5642 unsigned SplatBitSize; 5643 bool HasAnyUndefs; 5644 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 5645 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; 5646 5647 for (unsigned i = 0; i < NumSplats; ++i) { 5648 CnstBits <<= SplatBitSize; 5649 UndefBits <<= SplatBitSize; 5650 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); 5651 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); 5652 } 5653 5654 return true; 5655 } 5656 5657 return false; 5658 } 5659 5660 SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, 5661 SelectionDAG &DAG) const { 5662 BuildVectorSDNode *BVN = 5663 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 5664 SDValue LHS = Op.getOperand(0); 5665 SDLoc dl(Op); 5666 EVT VT = Op.getValueType(); 5667 5668 if (!BVN) 5669 return Op; 5670 5671 APInt CnstBits(VT.getSizeInBits(), 0); 5672 APInt UndefBits(VT.getSizeInBits(), 0); 5673 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 5674 // We only have BIC vector immediate instruction, which is and-not. 5675 CnstBits = ~CnstBits; 5676 5677 // We make use of a little bit of goto ickiness in order to avoid having to 5678 // duplicate the immediate matching logic for the undef toggled case. 5679 bool SecondTry = false; 5680 AttemptModImm: 5681 5682 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 5683 CnstBits = CnstBits.zextOrTrunc(64); 5684 uint64_t CnstVal = CnstBits.getZExtValue(); 5685 5686 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 5687 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 5688 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5689 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5690 DAG.getConstant(CnstVal, dl, MVT::i32), 5691 DAG.getConstant(0, dl, MVT::i32)); 5692 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5693 } 5694 5695 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 5696 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 5697 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5698 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5699 DAG.getConstant(CnstVal, dl, MVT::i32), 5700 DAG.getConstant(8, dl, MVT::i32)); 5701 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5702 } 5703 5704 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 5705 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 5706 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5707 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5708 DAG.getConstant(CnstVal, dl, MVT::i32), 5709 DAG.getConstant(16, dl, MVT::i32)); 5710 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5711 } 5712 5713 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 5714 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 5715 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 5716 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5717 DAG.getConstant(CnstVal, dl, MVT::i32), 5718 DAG.getConstant(24, dl, MVT::i32)); 5719 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5720 } 5721 5722 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 5723 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 5724 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 5725 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5726 DAG.getConstant(CnstVal, dl, MVT::i32), 5727 DAG.getConstant(0, dl, MVT::i32)); 5728 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5729 } 5730 5731 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 5732 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 5733 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 5734 SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, 5735 DAG.getConstant(CnstVal, dl, MVT::i32), 5736 DAG.getConstant(8, dl, MVT::i32)); 5737 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5738 } 5739 } 5740 5741 if (SecondTry) 5742 goto FailedModImm; 5743 SecondTry = true; 5744 CnstBits = ~UndefBits; 5745 goto AttemptModImm; 5746 } 5747 5748 // We can always fall back to a non-immediate AND. 5749 FailedModImm: 5750 return Op; 5751 } 5752 5753 // Specialized code to quickly find if PotentialBVec is a BuildVector that 5754 // consists of only the same constant int value, returned in reference arg 5755 // ConstVal 5756 static bool isAllConstantBuildVector(const SDValue &PotentialBVec, 5757 uint64_t &ConstVal) { 5758 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); 5759 if (!Bvec) 5760 return false; 5761 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); 5762 if (!FirstElt) 5763 return false; 5764 EVT VT = Bvec->getValueType(0); 5765 unsigned NumElts = VT.getVectorNumElements(); 5766 for (unsigned i = 1; i < NumElts; ++i) 5767 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) 5768 return false; 5769 ConstVal = FirstElt->getZExtValue(); 5770 return true; 5771 } 5772 5773 static unsigned getIntrinsicID(const SDNode *N) { 5774 unsigned Opcode = N->getOpcode(); 5775 switch (Opcode) { 5776 default: 5777 return Intrinsic::not_intrinsic; 5778 case ISD::INTRINSIC_WO_CHAIN: { 5779 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 5780 if (IID < Intrinsic::num_intrinsics) 5781 return IID; 5782 return Intrinsic::not_intrinsic; 5783 } 5784 } 5785 } 5786 5787 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), 5788 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a 5789 // BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. 5790 // Also, logical shift right -> sri, with the same structure. 5791 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { 5792 EVT VT = N->getValueType(0); 5793 5794 if (!VT.isVector()) 5795 return SDValue(); 5796 5797 SDLoc DL(N); 5798 5799 // Is the first op an AND? 5800 const SDValue And = N->getOperand(0); 5801 if (And.getOpcode() != ISD::AND) 5802 return SDValue(); 5803 5804 // Is the second op an shl or lshr? 
5805 SDValue Shift = N->getOperand(1); 5806 // This will have been turned into: AArch64ISD::VSHL vector, #shift 5807 // or AArch64ISD::VLSHR vector, #shift 5808 unsigned ShiftOpc = Shift.getOpcode(); 5809 if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) 5810 return SDValue(); 5811 bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; 5812 5813 // Is the shift amount constant? 5814 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 5815 if (!C2node) 5816 return SDValue(); 5817 5818 // Is the and mask vector all constant? 5819 uint64_t C1; 5820 if (!isAllConstantBuildVector(And.getOperand(1), C1)) 5821 return SDValue(); 5822 5823 // Is C1 == ~C2, taking into account how much one can shift elements of a 5824 // particular size? 5825 uint64_t C2 = C2node->getZExtValue(); 5826 unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); 5827 if (C2 > ElemSizeInBits) 5828 return SDValue(); 5829 unsigned ElemMask = (1 << ElemSizeInBits) - 1; 5830 if ((C1 & ElemMask) != (~C2 & ElemMask)) 5831 return SDValue(); 5832 5833 SDValue X = And.getOperand(0); 5834 SDValue Y = Shift.getOperand(0); 5835 5836 unsigned Intrin = 5837 IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; 5838 SDValue ResultSLI = 5839 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 5840 DAG.getConstant(Intrin, DL, MVT::i32), X, Y, 5841 Shift.getOperand(1)); 5842 5843 DEBUG(dbgs() << "aarch64-lower: transformed: \n"); 5844 DEBUG(N->dump(&DAG)); 5845 DEBUG(dbgs() << "into: \n"); 5846 DEBUG(ResultSLI->dump(&DAG)); 5847 5848 ++NumShiftInserts; 5849 return ResultSLI; 5850 } 5851 5852 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, 5853 SelectionDAG &DAG) const { 5854 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) 5855 if (EnableAArch64SlrGeneration) { 5856 SDValue Res = tryLowerToSLI(Op.getNode(), DAG); 5857 if (Res.getNode()) 5858 return Res; 5859 } 5860 5861 BuildVectorSDNode *BVN = 5862 dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); 5863 SDValue LHS = Op.getOperand(1); 5864 SDLoc dl(Op); 5865 EVT VT = Op.getValueType(); 5866 5867 // OR commutes, so try swapping the operands. 5868 if (!BVN) { 5869 LHS = Op.getOperand(0); 5870 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 5871 } 5872 if (!BVN) 5873 return Op; 5874 5875 APInt CnstBits(VT.getSizeInBits(), 0); 5876 APInt UndefBits(VT.getSizeInBits(), 0); 5877 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 5878 // We make use of a little bit of goto ickiness in order to avoid having to 5879 // duplicate the immediate matching logic for the undef toggled case. 5880 bool SecondTry = false; 5881 AttemptModImm: 5882 5883 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 5884 CnstBits = CnstBits.zextOrTrunc(64); 5885 uint64_t CnstVal = CnstBits.getZExtValue(); 5886 5887 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 5888 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 5889 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5890 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5891 DAG.getConstant(CnstVal, dl, MVT::i32), 5892 DAG.getConstant(0, dl, MVT::i32)); 5893 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5894 } 5895 5896 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 5897 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 5898 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 5899 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5900 DAG.getConstant(CnstVal, dl, MVT::i32), 5901 DAG.getConstant(8, dl, MVT::i32)); 5902 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5903 } 5904 5905 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 5906 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 5907 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5908 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5909 DAG.getConstant(CnstVal, dl, MVT::i32), 5910 DAG.getConstant(16, dl, MVT::i32)); 5911 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5912 } 5913 5914 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 5915 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 5916 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 5917 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5918 DAG.getConstant(CnstVal, dl, MVT::i32), 5919 DAG.getConstant(24, dl, MVT::i32)); 5920 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5921 } 5922 5923 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 5924 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 5925 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 5926 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5927 DAG.getConstant(CnstVal, dl, MVT::i32), 5928 DAG.getConstant(0, dl, MVT::i32)); 5929 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5930 } 5931 5932 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 5933 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 5934 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 5935 SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, 5936 DAG.getConstant(CnstVal, dl, MVT::i32), 5937 DAG.getConstant(8, dl, MVT::i32)); 5938 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 5939 } 5940 } 5941 5942 if (SecondTry) 5943 goto FailedModImm; 5944 SecondTry = true; 5945 CnstBits = UndefBits; 5946 goto AttemptModImm; 5947 } 5948 5949 // We can always fall back to a non-immediate OR. 5950 FailedModImm: 5951 return Op; 5952 } 5953 5954 // Normalize the operands of BUILD_VECTOR. The value of constant operands will 5955 // be truncated to fit element width. 5956 static SDValue NormalizeBuildVector(SDValue Op, 5957 SelectionDAG &DAG) { 5958 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 5959 SDLoc dl(Op); 5960 EVT VT = Op.getValueType(); 5961 EVT EltTy= VT.getVectorElementType(); 5962 5963 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) 5964 return Op; 5965 5966 SmallVector<SDValue, 16> Ops; 5967 for (SDValue Lane : Op->ops()) { 5968 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { 5969 APInt LowBits(EltTy.getSizeInBits(), 5970 CstLane->getZExtValue()); 5971 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); 5972 } 5973 Ops.push_back(Lane); 5974 } 5975 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); 5976 } 5977 5978 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, 5979 SelectionDAG &DAG) const { 5980 SDLoc dl(Op); 5981 EVT VT = Op.getValueType(); 5982 Op = NormalizeBuildVector(Op, DAG); 5983 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 5984 5985 APInt CnstBits(VT.getSizeInBits(), 0); 5986 APInt UndefBits(VT.getSizeInBits(), 0); 5987 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 5988 // We make use of a little bit of goto ickiness in order to avoid having to 5989 // duplicate the immediate matching logic for the undef toggled case. 
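    // The first attempt below uses the splat bits exactly as written. If that
    // fails, the second attempt retries with CnstBits replaced by UndefBits,
    // i.e. the same splat with its undefined bit positions flipped (see
    // resolveBuildVector), since a different choice for the undef bits may
    // expose an encodable modified immediate.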
5990 bool SecondTry = false; 5991 AttemptModImm: 5992 5993 if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { 5994 CnstBits = CnstBits.zextOrTrunc(64); 5995 uint64_t CnstVal = CnstBits.getZExtValue(); 5996 5997 // Certain magic vector constants (used to express things like NOT 5998 // and NEG) are passed through unmodified. This allows codegen patterns 5999 // for these operations to match. Special-purpose patterns will lower 6000 // these immediates to MOVIs if it proves necessary. 6001 if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) 6002 return Op; 6003 6004 // The many faces of MOVI... 6005 if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { 6006 CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); 6007 if (VT.getSizeInBits() == 128) { 6008 SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, 6009 DAG.getConstant(CnstVal, dl, MVT::i32)); 6010 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6011 } 6012 6013 // Support the V64 version via subregister insertion. 6014 SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, 6015 DAG.getConstant(CnstVal, dl, MVT::i32)); 6016 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6017 } 6018 6019 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 6020 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 6021 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6022 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6023 DAG.getConstant(CnstVal, dl, MVT::i32), 6024 DAG.getConstant(0, dl, MVT::i32)); 6025 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6026 } 6027 6028 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 6029 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 6030 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6031 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6032 DAG.getConstant(CnstVal, dl, MVT::i32), 6033 DAG.getConstant(8, dl, MVT::i32)); 6034 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6035 } 6036 6037 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 6038 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 6039 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6040 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6041 DAG.getConstant(CnstVal, dl, MVT::i32), 6042 DAG.getConstant(16, dl, MVT::i32)); 6043 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6044 } 6045 6046 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 6047 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 6048 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6049 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6050 DAG.getConstant(CnstVal, dl, MVT::i32), 6051 DAG.getConstant(24, dl, MVT::i32)); 6052 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6053 } 6054 6055 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 6056 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 6057 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6058 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6059 DAG.getConstant(CnstVal, dl, MVT::i32), 6060 DAG.getConstant(0, dl, MVT::i32)); 6061 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6062 } 6063 6064 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 6065 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 6066 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; 6067 SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, 6068 DAG.getConstant(CnstVal, dl, MVT::i32), 6069 DAG.getConstant(8, dl, MVT::i32)); 6070 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6071 } 6072 6073 if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { 6074 CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); 6075 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6076 SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, 6077 DAG.getConstant(CnstVal, dl, MVT::i32), 6078 DAG.getConstant(264, dl, MVT::i32)); 6079 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6080 } 6081 6082 if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { 6083 CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); 6084 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6085 SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, 6086 DAG.getConstant(CnstVal, dl, MVT::i32), 6087 DAG.getConstant(272, dl, MVT::i32)); 6088 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6089 } 6090 6091 if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { 6092 CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); 6093 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; 6094 SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, 6095 DAG.getConstant(CnstVal, dl, MVT::i32)); 6096 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6097 } 6098 6099 // The few faces of FMOV... 6100 if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { 6101 CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); 6102 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; 6103 SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, 6104 DAG.getConstant(CnstVal, dl, MVT::i32)); 6105 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6106 } 6107 6108 if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && 6109 VT.getSizeInBits() == 128) { 6110 CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); 6111 SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, 6112 DAG.getConstant(CnstVal, dl, MVT::i32)); 6113 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6114 } 6115 6116 // The many faces of MVNI... 6117 CnstVal = ~CnstVal; 6118 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { 6119 CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); 6120 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6121 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6122 DAG.getConstant(CnstVal, dl, MVT::i32), 6123 DAG.getConstant(0, dl, MVT::i32)); 6124 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6125 } 6126 6127 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { 6128 CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); 6129 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6130 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6131 DAG.getConstant(CnstVal, dl, MVT::i32), 6132 DAG.getConstant(8, dl, MVT::i32)); 6133 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6134 } 6135 6136 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { 6137 CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); 6138 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 6139 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6140 DAG.getConstant(CnstVal, dl, MVT::i32), 6141 DAG.getConstant(16, dl, MVT::i32)); 6142 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6143 } 6144 6145 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { 6146 CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); 6147 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6148 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6149 DAG.getConstant(CnstVal, dl, MVT::i32), 6150 DAG.getConstant(24, dl, MVT::i32)); 6151 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6152 } 6153 6154 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { 6155 CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); 6156 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6157 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6158 DAG.getConstant(CnstVal, dl, MVT::i32), 6159 DAG.getConstant(0, dl, MVT::i32)); 6160 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6161 } 6162 6163 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { 6164 CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); 6165 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6166 SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, 6167 DAG.getConstant(CnstVal, dl, MVT::i32), 6168 DAG.getConstant(8, dl, MVT::i32)); 6169 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6170 } 6171 6172 if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { 6173 CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); 6174 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6175 SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, 6176 DAG.getConstant(CnstVal, dl, MVT::i32), 6177 DAG.getConstant(264, dl, MVT::i32)); 6178 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6179 } 6180 6181 if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { 6182 CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); 6183 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6184 SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, 6185 DAG.getConstant(CnstVal, dl, MVT::i32), 6186 DAG.getConstant(272, dl, MVT::i32)); 6187 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6188 } 6189 } 6190 6191 if (SecondTry) 6192 goto FailedModImm; 6193 SecondTry = true; 6194 CnstBits = UndefBits; 6195 goto AttemptModImm; 6196 } 6197 FailedModImm: 6198 6199 // Scan through the operands to find some interesting properties we can 6200 // exploit: 6201 // 1) If only one value is used, we can use a DUP, or 6202 // 2) if only the low element is not undef, we can just insert that, or 6203 // 3) if only one constant value is used (w/ some non-constant lanes), 6204 // we can splat the constant value into the whole vector then fill 6205 // in the non-constant lanes. 6206 // 4) FIXME: If different constant values are used, but we can intelligently 6207 // select the values we'll be overwriting for the non-constant 6208 // lanes such that we can directly materialize the vector 6209 // some other way (MOVI, e.g.), we can be sneaky. 
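  // A single pass over the lanes below gathers everything the strategies
  // above need: whether anything beyond lane 0 is defined, whether a single
  // value and/or a single constant value is used, and how many lanes are
  // constant.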
6210 unsigned NumElts = VT.getVectorNumElements(); 6211 bool isOnlyLowElement = true; 6212 bool usesOnlyOneValue = true; 6213 bool usesOnlyOneConstantValue = true; 6214 bool isConstant = true; 6215 unsigned NumConstantLanes = 0; 6216 SDValue Value; 6217 SDValue ConstantValue; 6218 for (unsigned i = 0; i < NumElts; ++i) { 6219 SDValue V = Op.getOperand(i); 6220 if (V.getOpcode() == ISD::UNDEF) 6221 continue; 6222 if (i > 0) 6223 isOnlyLowElement = false; 6224 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 6225 isConstant = false; 6226 6227 if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) { 6228 ++NumConstantLanes; 6229 if (!ConstantValue.getNode()) 6230 ConstantValue = V; 6231 else if (ConstantValue != V) 6232 usesOnlyOneConstantValue = false; 6233 } 6234 6235 if (!Value.getNode()) 6236 Value = V; 6237 else if (V != Value) 6238 usesOnlyOneValue = false; 6239 } 6240 6241 if (!Value.getNode()) 6242 return DAG.getUNDEF(VT); 6243 6244 if (isOnlyLowElement) 6245 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 6246 6247 // Use DUP for non-constant splats. For f32 constant splats, reduce to 6248 // i32 and try again. 6249 if (usesOnlyOneValue) { 6250 if (!isConstant) { 6251 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 6252 Value.getValueType() != VT) 6253 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); 6254 6255 // This is actually a DUPLANExx operation, which keeps everything vectory. 6256 6257 // DUPLANE works on 128-bit vectors, widen it if necessary. 6258 SDValue Lane = Value.getOperand(1); 6259 Value = Value.getOperand(0); 6260 if (Value.getValueType().getSizeInBits() == 64) 6261 Value = WidenVector(Value, DAG); 6262 6263 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 6264 return DAG.getNode(Opcode, dl, VT, Value, Lane); 6265 } 6266 6267 if (VT.getVectorElementType().isFloatingPoint()) { 6268 SmallVector<SDValue, 8> Ops; 6269 EVT EltTy = VT.getVectorElementType(); 6270 assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && 6271 "Unsupported floating-point vector type"); 6272 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); 6273 for (unsigned i = 0; i < NumElts; ++i) 6274 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); 6275 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); 6276 SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); 6277 Val = LowerBUILD_VECTOR(Val, DAG); 6278 if (Val.getNode()) 6279 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6280 } 6281 } 6282 6283 // If there was only one constant value used and for more than one lane, 6284 // start by splatting that value, then replace the non-constant lanes. This 6285 // is better than the default, which will perform a separate initialization 6286 // for each lane. 6287 if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { 6288 SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); 6289 // Now insert the non-constant lanes. 6290 for (unsigned i = 0; i < NumElts; ++i) { 6291 SDValue V = Op.getOperand(i); 6292 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 6293 if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) { 6294 // Note that type legalization likely mucked about with the VT of the 6295 // source operand, so we may have to convert it here before inserting. 
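        // No conversion is actually emitted here; for integer element types,
        // INSERT_VECTOR_ELT implicitly truncates a scalar operand that is
        // wider than the element.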
6296 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); 6297 } 6298 } 6299 return Val; 6300 } 6301 6302 // If all elements are constants and the case above didn't get hit, fall back 6303 // to the default expansion, which will generate a load from the constant 6304 // pool. 6305 if (isConstant) 6306 return SDValue(); 6307 6308 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 6309 if (NumElts >= 4) { 6310 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 6311 return shuffle; 6312 } 6313 6314 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 6315 // know the default expansion would otherwise fall back on something even 6316 // worse. For a vector with one or two non-undef values, that's 6317 // scalar_to_vector for the elements followed by a shuffle (provided the 6318 // shuffle is valid for the target) and materialization element by element 6319 // on the stack followed by a load for everything else. 6320 if (!isConstant && !usesOnlyOneValue) { 6321 SDValue Vec = DAG.getUNDEF(VT); 6322 SDValue Op0 = Op.getOperand(0); 6323 unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); 6324 unsigned i = 0; 6325 // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to 6326 // a) Avoid a RMW dependency on the full vector register, and 6327 // b) Allow the register coalescer to fold away the copy if the 6328 // value is already in an S or D register. 6329 // Do not do this for UNDEF/LOAD nodes because we have better patterns 6330 // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. 6331 if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && 6332 (ElemSize == 32 || ElemSize == 64)) { 6333 unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; 6334 MachineSDNode *N = 6335 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, 6336 DAG.getTargetConstant(SubIdx, dl, MVT::i32)); 6337 Vec = SDValue(N, 0); 6338 ++i; 6339 } 6340 for (; i < NumElts; ++i) { 6341 SDValue V = Op.getOperand(i); 6342 if (V.getOpcode() == ISD::UNDEF) 6343 continue; 6344 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 6345 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 6346 } 6347 return Vec; 6348 } 6349 6350 // Just use the default expansion. We failed to find a better alternative. 6351 return SDValue(); 6352 } 6353 6354 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 6355 SelectionDAG &DAG) const { 6356 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); 6357 6358 // Check for non-constant or out of range lane. 6359 EVT VT = Op.getOperand(0).getValueType(); 6360 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 6361 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 6362 return SDValue(); 6363 6364 6365 // Insertion/extraction are legal for V128 types. 6366 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 6367 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 6368 VT == MVT::v8f16) 6369 return Op; 6370 6371 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 6372 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) 6373 return SDValue(); 6374 6375 // For V64 types, we perform insertion by expanding the value 6376 // to a V128 type and perform the insertion on that. 
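  // A 64-bit vector lives in the low half of a 128-bit register, so the
  // widen/insert/narrow sequence below should come out as subregister
  // operations rather than real data movement.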
6377 SDLoc DL(Op); 6378 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 6379 EVT WideTy = WideVec.getValueType(); 6380 6381 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, 6382 Op.getOperand(1), Op.getOperand(2)); 6383 // Re-narrow the resultant vector. 6384 return NarrowVector(Node, DAG); 6385 } 6386 6387 SDValue 6388 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6389 SelectionDAG &DAG) const { 6390 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); 6391 6392 // Check for non-constant or out of range lane. 6393 EVT VT = Op.getOperand(0).getValueType(); 6394 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6395 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 6396 return SDValue(); 6397 6398 6399 // Insertion/extraction are legal for V128 types. 6400 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 6401 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 6402 VT == MVT::v8f16) 6403 return Op; 6404 6405 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 6406 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) 6407 return SDValue(); 6408 6409 // For V64 types, we perform extraction by expanding the value 6410 // to a V128 type and perform the extraction on that. 6411 SDLoc DL(Op); 6412 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 6413 EVT WideTy = WideVec.getValueType(); 6414 6415 EVT ExtrTy = WideTy.getVectorElementType(); 6416 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) 6417 ExtrTy = MVT::i32; 6418 6419 // For extractions, we just return the result directly. 6420 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, 6421 Op.getOperand(1)); 6422 } 6423 6424 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 6425 SelectionDAG &DAG) const { 6426 EVT VT = Op.getOperand(0).getValueType(); 6427 SDLoc dl(Op); 6428 // Just in case... 6429 if (!VT.isVector()) 6430 return SDValue(); 6431 6432 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6433 if (!Cst) 6434 return SDValue(); 6435 unsigned Val = Cst->getZExtValue(); 6436 6437 unsigned Size = Op.getValueType().getSizeInBits(); 6438 6439 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. 6440 if (Val == 0) 6441 return Op; 6442 6443 // If this is extracting the upper 64-bits of a 128-bit vector, we match 6444 // that directly. 6445 if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) 6446 return Op; 6447 6448 return SDValue(); 6449 } 6450 6451 bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 6452 EVT VT) const { 6453 if (VT.getVectorNumElements() == 4 && 6454 (VT.is128BitVector() || VT.is64BitVector())) { 6455 unsigned PFIndexes[4]; 6456 for (unsigned i = 0; i != 4; ++i) { 6457 if (M[i] < 0) 6458 PFIndexes[i] = 8; 6459 else 6460 PFIndexes[i] = M[i]; 6461 } 6462 6463 // Compute the index in the perfect shuffle table. 6464 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 6465 PFIndexes[2] * 9 + PFIndexes[3]; 6466 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6467 unsigned Cost = (PFEntry >> 30); 6468 6469 if (Cost <= 4) 6470 return true; 6471 } 6472 6473 bool DummyBool; 6474 int DummyInt; 6475 unsigned DummyUnsigned; 6476 6477 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || 6478 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || 6479 isEXTMask(M, VT, DummyBool, DummyUnsigned) || 6480 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. 
6481 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || 6482 isZIPMask(M, VT, DummyUnsigned) || 6483 isTRN_v_undef_Mask(M, VT, DummyUnsigned) || 6484 isUZP_v_undef_Mask(M, VT, DummyUnsigned) || 6485 isZIP_v_undef_Mask(M, VT, DummyUnsigned) || 6486 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || 6487 isConcatMask(M, VT, VT.getSizeInBits() == 128)); 6488 } 6489 6490 /// getVShiftImm - Check if this is a valid build_vector for the immediate 6491 /// operand of a vector shift operation, where all the elements of the 6492 /// build_vector must have the same constant integer value. 6493 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6494 // Ignore bit_converts. 6495 while (Op.getOpcode() == ISD::BITCAST) 6496 Op = Op.getOperand(0); 6497 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 6498 APInt SplatBits, SplatUndef; 6499 unsigned SplatBitSize; 6500 bool HasAnyUndefs; 6501 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, 6502 HasAnyUndefs, ElementBits) || 6503 SplatBitSize > ElementBits) 6504 return false; 6505 Cnt = SplatBits.getSExtValue(); 6506 return true; 6507 } 6508 6509 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 6510 /// operand of a vector shift left operation. That value must be in the range: 6511 /// 0 <= Value < ElementBits for a left shift; or 6512 /// 0 <= Value <= ElementBits for a long left shift. 6513 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 6514 assert(VT.isVector() && "vector shift count is not a vector type"); 6515 int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); 6516 if (!getVShiftImm(Op, ElementBits, Cnt)) 6517 return false; 6518 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); 6519 } 6520 6521 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 6522 /// operand of a vector shift right operation. The value must be in the range: 6523 /// 1 <= Value <= ElementBits for a right shift; or 6524 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { 6525 assert(VT.isVector() && "vector shift count is not a vector type"); 6526 int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); 6527 if (!getVShiftImm(Op, ElementBits, Cnt)) 6528 return false; 6529 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 6530 } 6531 6532 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, 6533 SelectionDAG &DAG) const { 6534 EVT VT = Op.getValueType(); 6535 SDLoc DL(Op); 6536 int64_t Cnt; 6537 6538 if (!Op.getOperand(1).getValueType().isVector()) 6539 return Op; 6540 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 6541 6542 switch (Op.getOpcode()) { 6543 default: 6544 llvm_unreachable("unexpected shift opcode"); 6545 6546 case ISD::SHL: 6547 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) 6548 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), 6549 DAG.getConstant(Cnt, DL, MVT::i32)); 6550 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 6551 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, 6552 MVT::i32), 6553 Op.getOperand(0), Op.getOperand(1)); 6554 case ISD::SRA: 6555 case ISD::SRL: 6556 // Right shift immediate 6557 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { 6558 unsigned Opc = 6559 (Op.getOpcode() == ISD::SRA) ? 
AArch64ISD::VASHR : AArch64ISD::VLSHR; 6560 return DAG.getNode(Opc, DL, VT, Op.getOperand(0), 6561 DAG.getConstant(Cnt, DL, MVT::i32)); 6562 } 6563 6564 // Right shift register. Note, there is not a shift right register 6565 // instruction, but the shift left register instruction takes a signed 6566 // value, where negative numbers specify a right shift. 6567 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl 6568 : Intrinsic::aarch64_neon_ushl; 6569 // Negate the shift amount. 6570 SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); 6571 SDValue NegShiftLeft = 6572 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 6573 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), 6574 NegShift); 6575 return NegShiftLeft; 6576 } 6577 6578 return SDValue(); 6579 } 6580 6581 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, 6582 AArch64CC::CondCode CC, bool NoNans, EVT VT, 6583 SDLoc dl, SelectionDAG &DAG) { 6584 EVT SrcVT = LHS.getValueType(); 6585 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && 6586 "function only supposed to emit natural comparisons"); 6587 6588 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); 6589 APInt CnstBits(VT.getSizeInBits(), 0); 6590 APInt UndefBits(VT.getSizeInBits(), 0); 6591 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); 6592 bool IsZero = IsCnst && (CnstBits == 0); 6593 6594 if (SrcVT.getVectorElementType().isFloatingPoint()) { 6595 switch (CC) { 6596 default: 6597 return SDValue(); 6598 case AArch64CC::NE: { 6599 SDValue Fcmeq; 6600 if (IsZero) 6601 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); 6602 else 6603 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); 6604 return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); 6605 } 6606 case AArch64CC::EQ: 6607 if (IsZero) 6608 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); 6609 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); 6610 case AArch64CC::GE: 6611 if (IsZero) 6612 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); 6613 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); 6614 case AArch64CC::GT: 6615 if (IsZero) 6616 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); 6617 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); 6618 case AArch64CC::LS: 6619 if (IsZero) 6620 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); 6621 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); 6622 case AArch64CC::LT: 6623 if (!NoNans) 6624 return SDValue(); 6625 // If we ignore NaNs then we can use the MI implementation. 6626 // Fallthrough.
6627 case AArch64CC::MI: 6628 if (IsZero) 6629 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); 6630 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); 6631 } 6632 } 6633 6634 switch (CC) { 6635 default: 6636 return SDValue(); 6637 case AArch64CC::NE: { 6638 SDValue Cmeq; 6639 if (IsZero) 6640 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 6641 else 6642 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 6643 return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); 6644 } 6645 case AArch64CC::EQ: 6646 if (IsZero) 6647 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); 6648 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); 6649 case AArch64CC::GE: 6650 if (IsZero) 6651 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); 6652 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); 6653 case AArch64CC::GT: 6654 if (IsZero) 6655 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); 6656 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); 6657 case AArch64CC::LE: 6658 if (IsZero) 6659 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); 6660 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); 6661 case AArch64CC::LS: 6662 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); 6663 case AArch64CC::LO: 6664 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); 6665 case AArch64CC::LT: 6666 if (IsZero) 6667 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); 6668 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); 6669 case AArch64CC::HI: 6670 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); 6671 case AArch64CC::HS: 6672 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); 6673 } 6674 } 6675 6676 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, 6677 SelectionDAG &DAG) const { 6678 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6679 SDValue LHS = Op.getOperand(0); 6680 SDValue RHS = Op.getOperand(1); 6681 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); 6682 SDLoc dl(Op); 6683 6684 if (LHS.getValueType().getVectorElementType().isInteger()) { 6685 assert(LHS.getValueType() == RHS.getValueType()); 6686 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 6687 SDValue Cmp = 6688 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); 6689 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 6690 } 6691 6692 if (LHS.getValueType().getVectorElementType() == MVT::f16) 6693 return SDValue(); 6694 6695 assert(LHS.getValueType().getVectorElementType() == MVT::f32 || 6696 LHS.getValueType().getVectorElementType() == MVT::f64); 6697 6698 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 6699 // clean. Some of them require two branches to implement. 6700 AArch64CC::CondCode CC1, CC2; 6701 bool ShouldInvert; 6702 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); 6703 6704 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; 6705 SDValue Cmp = 6706 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); 6707 if (!Cmp.getNode()) 6708 return SDValue(); 6709 6710 if (CC2 != AArch64CC::AL) { 6711 SDValue Cmp2 = 6712 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); 6713 if (!Cmp2.getNode()) 6714 return SDValue(); 6715 6716 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); 6717 } 6718 6719 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 6720 6721 if (ShouldInvert) 6722 return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); 6723 6724 return Cmp; 6725 } 6726 6727 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 6728 /// MemIntrinsicNodes. 
The associated MachineMemOperands record the alignment 6729 /// specified in the intrinsic calls. 6730 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 6731 const CallInst &I, 6732 unsigned Intrinsic) const { 6733 auto &DL = I.getModule()->getDataLayout(); 6734 switch (Intrinsic) { 6735 case Intrinsic::aarch64_neon_ld2: 6736 case Intrinsic::aarch64_neon_ld3: 6737 case Intrinsic::aarch64_neon_ld4: 6738 case Intrinsic::aarch64_neon_ld1x2: 6739 case Intrinsic::aarch64_neon_ld1x3: 6740 case Intrinsic::aarch64_neon_ld1x4: 6741 case Intrinsic::aarch64_neon_ld2lane: 6742 case Intrinsic::aarch64_neon_ld3lane: 6743 case Intrinsic::aarch64_neon_ld4lane: 6744 case Intrinsic::aarch64_neon_ld2r: 6745 case Intrinsic::aarch64_neon_ld3r: 6746 case Intrinsic::aarch64_neon_ld4r: { 6747 Info.opc = ISD::INTRINSIC_W_CHAIN; 6748 // Conservatively set memVT to the entire set of vectors loaded. 6749 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 6750 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 6751 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 6752 Info.offset = 0; 6753 Info.align = 0; 6754 Info.vol = false; // volatile loads with NEON intrinsics not supported 6755 Info.readMem = true; 6756 Info.writeMem = false; 6757 return true; 6758 } 6759 case Intrinsic::aarch64_neon_st2: 6760 case Intrinsic::aarch64_neon_st3: 6761 case Intrinsic::aarch64_neon_st4: 6762 case Intrinsic::aarch64_neon_st1x2: 6763 case Intrinsic::aarch64_neon_st1x3: 6764 case Intrinsic::aarch64_neon_st1x4: 6765 case Intrinsic::aarch64_neon_st2lane: 6766 case Intrinsic::aarch64_neon_st3lane: 6767 case Intrinsic::aarch64_neon_st4lane: { 6768 Info.opc = ISD::INTRINSIC_VOID; 6769 // Conservatively set memVT to the entire set of vectors stored. 
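    // The loop below accumulates the total size in 64-bit units and stops at
    // the first non-vector operand (the address operand comes last).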
6770 unsigned NumElts = 0; 6771 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 6772 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 6773 if (!ArgTy->isVectorTy()) 6774 break; 6775 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 6776 } 6777 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 6778 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 6779 Info.offset = 0; 6780 Info.align = 0; 6781 Info.vol = false; // volatile stores with NEON intrinsics not supported 6782 Info.readMem = false; 6783 Info.writeMem = true; 6784 return true; 6785 } 6786 case Intrinsic::aarch64_ldaxr: 6787 case Intrinsic::aarch64_ldxr: { 6788 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 6789 Info.opc = ISD::INTRINSIC_W_CHAIN; 6790 Info.memVT = MVT::getVT(PtrTy->getElementType()); 6791 Info.ptrVal = I.getArgOperand(0); 6792 Info.offset = 0; 6793 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 6794 Info.vol = true; 6795 Info.readMem = true; 6796 Info.writeMem = false; 6797 return true; 6798 } 6799 case Intrinsic::aarch64_stlxr: 6800 case Intrinsic::aarch64_stxr: { 6801 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 6802 Info.opc = ISD::INTRINSIC_W_CHAIN; 6803 Info.memVT = MVT::getVT(PtrTy->getElementType()); 6804 Info.ptrVal = I.getArgOperand(1); 6805 Info.offset = 0; 6806 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 6807 Info.vol = true; 6808 Info.readMem = false; 6809 Info.writeMem = true; 6810 return true; 6811 } 6812 case Intrinsic::aarch64_ldaxp: 6813 case Intrinsic::aarch64_ldxp: { 6814 Info.opc = ISD::INTRINSIC_W_CHAIN; 6815 Info.memVT = MVT::i128; 6816 Info.ptrVal = I.getArgOperand(0); 6817 Info.offset = 0; 6818 Info.align = 16; 6819 Info.vol = true; 6820 Info.readMem = true; 6821 Info.writeMem = false; 6822 return true; 6823 } 6824 case Intrinsic::aarch64_stlxp: 6825 case Intrinsic::aarch64_stxp: { 6826 Info.opc = ISD::INTRINSIC_W_CHAIN; 6827 Info.memVT = MVT::i128; 6828 Info.ptrVal = I.getArgOperand(2); 6829 Info.offset = 0; 6830 Info.align = 16; 6831 Info.vol = true; 6832 Info.readMem = false; 6833 Info.writeMem = true; 6834 return true; 6835 } 6836 default: 6837 break; 6838 } 6839 6840 return false; 6841 } 6842 6843 // Truncations from 64-bit GPR to 32-bit GPR is free. 6844 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 6845 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 6846 return false; 6847 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 6848 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 6849 return NumBits1 > NumBits2; 6850 } 6851 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 6852 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 6853 return false; 6854 unsigned NumBits1 = VT1.getSizeInBits(); 6855 unsigned NumBits2 = VT2.getSizeInBits(); 6856 return NumBits1 > NumBits2; 6857 } 6858 6859 /// Check if it is profitable to hoist instruction in then/else to if. 6860 /// Not profitable if I and it's user can form a FMA instruction 6861 /// because we prefer FMSUB/FMADD. 
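/// Keeping the multiply in the same block as its FAdd/FSub user leaves
/// instruction selection free to fuse the pair into a single FMADD/FMSUB.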
6862 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { 6863 if (I->getOpcode() != Instruction::FMul) 6864 return true; 6865 6866 if (I->getNumUses() != 1) 6867 return true; 6868 6869 Instruction *User = I->user_back(); 6870 6871 if (User && 6872 !(User->getOpcode() == Instruction::FSub || 6873 User->getOpcode() == Instruction::FAdd)) 6874 return true; 6875 6876 const TargetOptions &Options = getTargetMachine().Options; 6877 const DataLayout &DL = I->getModule()->getDataLayout(); 6878 EVT VT = getValueType(DL, User->getOperand(0)->getType()); 6879 6880 if (isFMAFasterThanFMulAndFAdd(VT) && 6881 isOperationLegalOrCustom(ISD::FMA, VT) && 6882 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) 6883 return false; 6884 6885 return true; 6886 } 6887 6888 // All 32-bit GPR operations implicitly zero the high-half of the corresponding 6889 // 64-bit GPR. 6890 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 6891 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 6892 return false; 6893 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 6894 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 6895 return NumBits1 == 32 && NumBits2 == 64; 6896 } 6897 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 6898 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) 6899 return false; 6900 unsigned NumBits1 = VT1.getSizeInBits(); 6901 unsigned NumBits2 = VT2.getSizeInBits(); 6902 return NumBits1 == 32 && NumBits2 == 64; 6903 } 6904 6905 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 6906 EVT VT1 = Val.getValueType(); 6907 if (isZExtFree(VT1, VT2)) { 6908 return true; 6909 } 6910 6911 if (Val.getOpcode() != ISD::LOAD) 6912 return false; 6913 6914 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. 6915 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && 6916 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && 6917 VT1.getSizeInBits() <= 32); 6918 } 6919 6920 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { 6921 if (isa<FPExtInst>(Ext)) 6922 return false; 6923 6924 // Vector types are next free. 6925 if (Ext->getType()->isVectorTy()) 6926 return false; 6927 6928 for (const Use &U : Ext->uses()) { 6929 // The extension is free if we can fold it with a left shift in an 6930 // addressing mode or an arithmetic operation: add, sub, and cmp. 6931 6932 // Is there a shift? 6933 const Instruction *Instr = cast<Instruction>(U.getUser()); 6934 6935 // Is this a constant shift? 6936 switch (Instr->getOpcode()) { 6937 case Instruction::Shl: 6938 if (!isa<ConstantInt>(Instr->getOperand(1))) 6939 return false; 6940 break; 6941 case Instruction::GetElementPtr: { 6942 gep_type_iterator GTI = gep_type_begin(Instr); 6943 auto &DL = Ext->getModule()->getDataLayout(); 6944 std::advance(GTI, U.getOperandNo()); 6945 Type *IdxTy = *GTI; 6946 // This extension will end up with a shift because of the scaling factor. 6947 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. 6948 // Get the shift amount based on the scaling factor: 6949 // log2(sizeof(IdxTy)) - log2(8). 6950 uint64_t ShiftAmt = 6951 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; 6952 // Is the constant foldable in the shift of the addressing mode? 6953 // I.e., shift amount is between 1 and 4 inclusive. 6954 if (ShiftAmt == 0 || ShiftAmt > 4) 6955 return false; 6956 break; 6957 } 6958 case Instruction::Trunc: 6959 // Check if this is a noop. 
6960 // trunc(sext ty1 to ty2) to ty1. 6961 if (Instr->getType() == Ext->getOperand(0)->getType()) 6962 continue; 6963 // FALL THROUGH. 6964 default: 6965 return false; 6966 } 6967 6968 // At this point we can use the bfm family, so this extension is free 6969 // for that use. 6970 } 6971 return true; 6972 } 6973 6974 bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, 6975 unsigned &RequiredAligment) const { 6976 if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) 6977 return false; 6978 // Cyclone supports unaligned accesses. 6979 RequiredAligment = 0; 6980 unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); 6981 return NumBits == 32 || NumBits == 64; 6982 } 6983 6984 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, 6985 unsigned &RequiredAligment) const { 6986 if (!LoadedType.isSimple() || 6987 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) 6988 return false; 6989 // Cyclone supports unaligned accesses. 6990 RequiredAligment = 0; 6991 unsigned NumBits = LoadedType.getSizeInBits(); 6992 return NumBits == 32 || NumBits == 64; 6993 } 6994 6995 /// \brief Lower an interleaved load into a ldN intrinsic. 6996 /// 6997 /// E.g. Lower an interleaved load (Factor = 2): 6998 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr 6999 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 7000 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 7001 /// 7002 /// Into: 7003 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) 7004 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 7005 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 7006 bool AArch64TargetLowering::lowerInterleavedLoad( 7007 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 7008 ArrayRef<unsigned> Indices, unsigned Factor) const { 7009 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 7010 "Invalid interleave factor"); 7011 assert(!Shuffles.empty() && "Empty shufflevector input"); 7012 assert(Shuffles.size() == Indices.size() && 7013 "Unmatched number of shufflevectors and indices"); 7014 7015 const DataLayout &DL = LI->getModule()->getDataLayout(); 7016 7017 VectorType *VecTy = Shuffles[0]->getType(); 7018 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 7019 7020 // Skip if we do not have NEON and skip illegal vector types. 7021 if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) 7022 return false; 7023 7024 // A pointer vector can not be the return type of the ldN intrinsics. Need to 7025 // load integer vectors first and then convert to pointer vectors. 7026 Type *EltTy = VecTy->getVectorElementType(); 7027 if (EltTy->isPointerTy()) 7028 VecTy = 7029 VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); 7030 7031 Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); 7032 Type *Tys[2] = {VecTy, PtrTy}; 7033 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, 7034 Intrinsic::aarch64_neon_ld3, 7035 Intrinsic::aarch64_neon_ld4}; 7036 Function *LdNFunc = 7037 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 7038 7039 IRBuilder<> Builder(LI); 7040 Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); 7041 7042 CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); 7043 7044 // Replace uses of each shufflevector with the corresponding vector loaded 7045 // by ldN. 
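  // The original wide load and its shufflevectors become dead after this
  // rewrite; the interleaved-access pass that invokes this hook is expected
  // to erase them.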
7046 for (unsigned i = 0; i < Shuffles.size(); i++) { 7047 ShuffleVectorInst *SVI = Shuffles[i]; 7048 unsigned Index = Indices[i]; 7049 7050 Value *SubVec = Builder.CreateExtractValue(LdN, Index); 7051 7052 // Convert the integer vector to pointer vector if the element is pointer. 7053 if (EltTy->isPointerTy()) 7054 SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); 7055 7056 SVI->replaceAllUsesWith(SubVec); 7057 } 7058 7059 return true; 7060 } 7061 7062 /// \brief Get a mask consisting of sequential integers starting from \p Start. 7063 /// 7064 /// I.e. <Start, Start + 1, ..., Start + NumElts - 1> 7065 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, 7066 unsigned NumElts) { 7067 SmallVector<Constant *, 16> Mask; 7068 for (unsigned i = 0; i < NumElts; i++) 7069 Mask.push_back(Builder.getInt32(Start + i)); 7070 7071 return ConstantVector::get(Mask); 7072 } 7073 7074 /// \brief Lower an interleaved store into a stN intrinsic. 7075 /// 7076 /// E.g. Lower an interleaved store (Factor = 3): 7077 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 7078 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 7079 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 7080 /// 7081 /// Into: 7082 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 7083 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 7084 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 7085 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 7086 /// 7087 /// Note that the new shufflevectors will be removed and we'll only generate one 7088 /// st3 instruction in CodeGen. 7089 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, 7090 ShuffleVectorInst *SVI, 7091 unsigned Factor) const { 7092 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 7093 "Invalid interleave factor"); 7094 7095 VectorType *VecTy = SVI->getType(); 7096 assert(VecTy->getVectorNumElements() % Factor == 0 && 7097 "Invalid interleaved store"); 7098 7099 unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; 7100 Type *EltTy = VecTy->getVectorElementType(); 7101 VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); 7102 7103 const DataLayout &DL = SI->getModule()->getDataLayout(); 7104 unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); 7105 7106 // Skip if we do not have NEON and skip illegal vector types. 7107 if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) 7108 return false; 7109 7110 Value *Op0 = SVI->getOperand(0); 7111 Value *Op1 = SVI->getOperand(1); 7112 IRBuilder<> Builder(SI); 7113 7114 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 7115 // vectors to integer vectors. 7116 if (EltTy->isPointerTy()) { 7117 Type *IntTy = DL.getIntPtrType(EltTy); 7118 unsigned NumOpElts = 7119 dyn_cast<VectorType>(Op0->getType())->getVectorNumElements(); 7120 7121 // Convert to the corresponding integer vector. 
7122 Type *IntVecTy = VectorType::get(IntTy, NumOpElts); 7123 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 7124 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 7125 7126 SubVecTy = VectorType::get(IntTy, NumSubElts); 7127 } 7128 7129 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); 7130 Type *Tys[2] = {SubVecTy, PtrTy}; 7131 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, 7132 Intrinsic::aarch64_neon_st3, 7133 Intrinsic::aarch64_neon_st4}; 7134 Function *StNFunc = 7135 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); 7136 7137 SmallVector<Value *, 5> Ops; 7138 7139 // Split the shufflevector operands into sub vectors for the new stN call. 7140 for (unsigned i = 0; i < Factor; i++) 7141 Ops.push_back(Builder.CreateShuffleVector( 7142 Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); 7143 7144 Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); 7145 Builder.CreateCall(StNFunc, Ops); 7146 return true; 7147 } 7148 7149 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, 7150 unsigned AlignCheck) { 7151 return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && 7152 (DstAlign == 0 || DstAlign % AlignCheck == 0)); 7153 } 7154 7155 EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 7156 unsigned SrcAlign, bool IsMemset, 7157 bool ZeroMemset, 7158 bool MemcpyStrSrc, 7159 MachineFunction &MF) const { 7160 // Don't use AdvSIMD to implement 16-byte memset. It would have taken one 7161 // instruction to materialize the v2i64 zero and one store (with restrictive 7162 // addressing mode). Just do two i64 store of zero-registers. 7163 bool Fast; 7164 const Function *F = MF.getFunction(); 7165 if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && 7166 !F->hasFnAttribute(Attribute::NoImplicitFloat) && 7167 (memOpAlign(SrcAlign, DstAlign, 16) || 7168 (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) 7169 return MVT::f128; 7170 7171 if (Size >= 8 && 7172 (memOpAlign(SrcAlign, DstAlign, 8) || 7173 (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast))) 7174 return MVT::i64; 7175 7176 if (Size >= 4 && 7177 (memOpAlign(SrcAlign, DstAlign, 4) || 7178 (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast))) 7179 return MVT::i32; 7180 7181 return MVT::Other; 7182 } 7183 7184 // 12-bit optionally shifted immediates are legal for adds. 7185 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { 7186 if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) 7187 return true; 7188 return false; 7189 } 7190 7191 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid 7192 // immediates is the same as for an add or a sub. 7193 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { 7194 if (Immed < 0) 7195 Immed *= -1; 7196 return isLegalAddImmediate(Immed); 7197 } 7198 7199 /// isLegalAddressingMode - Return true if the addressing mode represented 7200 /// by AM is legal for this target, for a load/store of the specified type. 7201 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, 7202 const AddrMode &AM, Type *Ty, 7203 unsigned AS) const { 7204 // AArch64 has five basic addressing modes: 7205 // reg 7206 // reg + 9-bit signed offset 7207 // reg + SIZE_IN_BYTES * 12-bit unsigned offset 7208 // reg1 + reg2 7209 // reg + SIZE_IN_BYTES * reg 7210 7211 // No global is ever allowed as a base. 
7212 if (AM.BaseGV) 7213 return false; 7214 7215 // No reg+reg+imm addressing. 7216 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) 7217 return false; 7218 7219 // check reg + imm case: 7220 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 7221 uint64_t NumBytes = 0; 7222 if (Ty->isSized()) { 7223 uint64_t NumBits = DL.getTypeSizeInBits(Ty); 7224 NumBytes = NumBits / 8; 7225 if (!isPowerOf2_64(NumBits)) 7226 NumBytes = 0; 7227 } 7228 7229 if (!AM.Scale) { 7230 int64_t Offset = AM.BaseOffs; 7231 7232 // 9-bit signed offset 7233 if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) 7234 return true; 7235 7236 // 12-bit unsigned offset 7237 unsigned shift = Log2_64(NumBytes); 7238 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && 7239 // Must be a multiple of NumBytes (NumBytes is a power of 2) 7240 (Offset >> shift) << shift == Offset) 7241 return true; 7242 return false; 7243 } 7244 7245 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 7246 7247 if (!AM.Scale || AM.Scale == 1 || 7248 (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) 7249 return true; 7250 return false; 7251 } 7252 7253 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, 7254 const AddrMode &AM, Type *Ty, 7255 unsigned AS) const { 7256 // Scaling factors are not free at all. 7257 // Operands | Rt Latency 7258 // ------------------------------------------- 7259 // Rt, [Xn, Xm] | 4 7260 // ------------------------------------------- 7261 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 7262 // Rt, [Xn, Wm, <extend> #imm] | 7263 if (isLegalAddressingMode(DL, AM, Ty, AS)) 7264 // Scale represents reg2 * scale, thus account for 1 if 7265 // it is not equal to 0 or 1. 7266 return AM.Scale != 0 && AM.Scale != 1; 7267 return -1; 7268 } 7269 7270 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 7271 VT = VT.getScalarType(); 7272 7273 if (!VT.isSimple()) 7274 return false; 7275 7276 switch (VT.getSimpleVT().SimpleTy) { 7277 case MVT::f32: 7278 case MVT::f64: 7279 return true; 7280 default: 7281 break; 7282 } 7283 7284 return false; 7285 } 7286 7287 const MCPhysReg * 7288 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { 7289 // LR is a callee-save register, but we must treat it as clobbered by any call 7290 // site. Hence we include LR in the scratch registers, which are in turn added 7291 // as implicit-defs for stackmaps and patchpoints. 7292 static const MCPhysReg ScratchRegs[] = { 7293 AArch64::X16, AArch64::X17, AArch64::LR, 0 7294 }; 7295 return ScratchRegs; 7296 } 7297 7298 bool 7299 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { 7300 EVT VT = N->getValueType(0); 7301 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine 7302 // it with shift to let it be lowered to UBFX. 
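  // For example, (and (srl x, 4), 0xfff) can be selected as a single UBFX
  // extracting bits [15:4]; commuting the AND past the shift would hide that
  // pattern from the UBFX matching code.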
7303 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && 7304 isa<ConstantSDNode>(N->getOperand(1))) { 7305 uint64_t TruncMask = N->getConstantOperandVal(1); 7306 if (isMask_64(TruncMask) && 7307 N->getOperand(0).getOpcode() == ISD::SRL && 7308 isa<ConstantSDNode>(N->getOperand(0)->getOperand(1))) 7309 return false; 7310 } 7311 return true; 7312 } 7313 7314 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 7315 Type *Ty) const { 7316 assert(Ty->isIntegerTy()); 7317 7318 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 7319 if (BitSize == 0) 7320 return false; 7321 7322 int64_t Val = Imm.getSExtValue(); 7323 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) 7324 return true; 7325 7326 if ((int64_t)Val < 0) 7327 Val = ~Val; 7328 if (BitSize == 32) 7329 Val &= (1LL << 32) - 1; 7330 7331 unsigned LZ = countLeadingZeros((uint64_t)Val); 7332 unsigned Shift = (63 - LZ) / 16; 7333 // MOVZ is free so return true for one or fewer MOVK. 7334 return Shift < 3; 7335 } 7336 7337 // Generate SUBS and CSEL for integer abs. 7338 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 7339 EVT VT = N->getValueType(0); 7340 7341 SDValue N0 = N->getOperand(0); 7342 SDValue N1 = N->getOperand(1); 7343 SDLoc DL(N); 7344 7345 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 7346 // and change it to SUB and CSEL. 7347 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 7348 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && 7349 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) 7350 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 7351 if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { 7352 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 7353 N0.getOperand(0)); 7354 // Generate SUBS & CSEL. 7355 SDValue Cmp = 7356 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), 7357 N0.getOperand(0), DAG.getConstant(0, DL, VT)); 7358 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, 7359 DAG.getConstant(AArch64CC::PL, DL, MVT::i32), 7360 SDValue(Cmp.getNode(), 1)); 7361 } 7362 return SDValue(); 7363 } 7364 7365 // performXorCombine - Attempts to handle integer ABS. 7366 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, 7367 TargetLowering::DAGCombinerInfo &DCI, 7368 const AArch64Subtarget *Subtarget) { 7369 if (DCI.isBeforeLegalizeOps()) 7370 return SDValue(); 7371 7372 return performIntegerAbsCombine(N, DAG); 7373 } 7374 7375 SDValue 7376 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 7377 SelectionDAG &DAG, 7378 std::vector<SDNode *> *Created) const { 7379 // fold (sdiv X, pow2) 7380 EVT VT = N->getValueType(0); 7381 if ((VT != MVT::i32 && VT != MVT::i64) || 7382 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) 7383 return SDValue(); 7384 7385 SDLoc DL(N); 7386 SDValue N0 = N->getOperand(0); 7387 unsigned Lg2 = Divisor.countTrailingZeros(); 7388 SDValue Zero = DAG.getConstant(0, DL, VT); 7389 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); 7390 7391 // Add (N0 < 0) ? 
Pow2 - 1 : 0; 7392 SDValue CCVal; 7393 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); 7394 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); 7395 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); 7396 7397 if (Created) { 7398 Created->push_back(Cmp.getNode()); 7399 Created->push_back(Add.getNode()); 7400 Created->push_back(CSel.getNode()); 7401 } 7402 7403 // Divide by pow2. 7404 SDValue SRA = 7405 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); 7406 7407 // If we're dividing by a positive value, we're done. Otherwise, we must 7408 // negate the result. 7409 if (Divisor.isNonNegative()) 7410 return SRA; 7411 7412 if (Created) 7413 Created->push_back(SRA.getNode()); 7414 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); 7415 } 7416 7417 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, 7418 TargetLowering::DAGCombinerInfo &DCI, 7419 const AArch64Subtarget *Subtarget) { 7420 if (DCI.isBeforeLegalizeOps()) 7421 return SDValue(); 7422 7423 // Multiplication of a power of two plus/minus one can be done more 7424 // cheaply as a shift+add/sub. For now, this is true unilaterally. If 7425 // future CPUs have a cheaper MADD instruction, this may need to be 7426 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and 7427 // 64-bit is 5 cycles, so this is always a win. 7428 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 7429 APInt Value = C->getAPIntValue(); 7430 EVT VT = N->getValueType(0); 7431 SDLoc DL(N); 7432 if (Value.isNonNegative()) { 7433 // (mul x, 2^N + 1) => (add (shl x, N), x) 7434 APInt VM1 = Value - 1; 7435 if (VM1.isPowerOf2()) { 7436 SDValue ShiftedVal = 7437 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 7438 DAG.getConstant(VM1.logBase2(), DL, MVT::i64)); 7439 return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, 7440 N->getOperand(0)); 7441 } 7442 // (mul x, 2^N - 1) => (sub (shl x, N), x) 7443 APInt VP1 = Value + 1; 7444 if (VP1.isPowerOf2()) { 7445 SDValue ShiftedVal = 7446 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 7447 DAG.getConstant(VP1.logBase2(), DL, MVT::i64)); 7448 return DAG.getNode(ISD::SUB, DL, VT, ShiftedVal, 7449 N->getOperand(0)); 7450 } 7451 } else { 7452 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 7453 APInt VNP1 = -Value + 1; 7454 if (VNP1.isPowerOf2()) { 7455 SDValue ShiftedVal = 7456 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 7457 DAG.getConstant(VNP1.logBase2(), DL, MVT::i64)); 7458 return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), 7459 ShiftedVal); 7460 } 7461 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 7462 APInt VNM1 = -Value - 1; 7463 if (VNM1.isPowerOf2()) { 7464 SDValue ShiftedVal = 7465 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 7466 DAG.getConstant(VNM1.logBase2(), DL, MVT::i64)); 7467 SDValue Add = 7468 DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0)); 7469 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Add); 7470 } 7471 } 7472 } 7473 return SDValue(); 7474 } 7475 7476 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 7477 SelectionDAG &DAG) { 7478 // Take advantage of vector comparisons producing 0 or -1 in each lane to 7479 // optimize away the operation when it's from a constant.
7480 // 7481 // The general transformation is: 7482 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 7483 // AND(VECTOR_CMP(x,y), constant2) 7484 // constant2 = UNARYOP(constant) 7485 7486 // Early exit if this isn't a vector operation, the operand of the 7487 // unary operation isn't a bitwise AND, or if the sizes of the operations 7488 // aren't the same. 7489 EVT VT = N->getValueType(0); 7490 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 7491 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 7492 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 7493 return SDValue(); 7494 7495 // Now check that the other operand of the AND is a constant. We could 7496 // make the transformation for non-constant splats as well, but it's unclear 7497 // that would be a benefit as it would not eliminate any operations, just 7498 // perform one more step in scalar code before moving to the vector unit. 7499 if (BuildVectorSDNode *BV = 7500 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 7501 // Bail out if the vector isn't a constant. 7502 if (!BV->isConstant()) 7503 return SDValue(); 7504 7505 // Everything checks out. Build up the new and improved node. 7506 SDLoc DL(N); 7507 EVT IntVT = BV->getValueType(0); 7508 // Create a new constant of the appropriate type for the transformed 7509 // DAG. 7510 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 7511 // The AND node needs bitcasts to/from an integer vector type around it. 7512 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); 7513 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, 7514 N->getOperand(0)->getOperand(0), MaskConst); 7515 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); 7516 return Res; 7517 } 7518 7519 return SDValue(); 7520 } 7521 7522 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, 7523 const AArch64Subtarget *Subtarget) { 7524 // First try to optimize away the conversion when it's conditionally from 7525 // a constant. Vectors only. 7526 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) 7527 return Res; 7528 7529 EVT VT = N->getValueType(0); 7530 if (VT != MVT::f32 && VT != MVT::f64) 7531 return SDValue(); 7532 7533 // Only optimize when the source and destination types have the same width. 7534 if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) 7535 return SDValue(); 7536 7537 // If the result of an integer load is only used by an integer-to-float 7538 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead. 7539 // This eliminates an "integer-to-vector-move" UOP and improves throughput. 7540 SDValue N0 = N->getOperand(0); 7541 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && 7542 // Do not change the width of a volatile load. 7543 !cast<LoadSDNode>(N0)->isVolatile()) { 7544 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 7545 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), 7546 LN0->getPointerInfo(), LN0->isVolatile(), 7547 LN0->isNonTemporal(), LN0->isInvariant(), 7548 LN0->getAlignment()); 7549 7550 // Make sure successors of the original load stay after it by updating them 7551 // to use the new Chain. 7552 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); 7553 7554 unsigned Opcode = 7555 (N->getOpcode() == ISD::SINT_TO_FP) ?
AArch64ISD::SITOF : AArch64ISD::UITOF; 7556 return DAG.getNode(Opcode, SDLoc(N), VT, Load); 7557 } 7558 7559 return SDValue(); 7560 } 7561 7562 /// Fold a floating-point multiply by power of two into floating-point to 7563 /// fixed-point conversion. 7564 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, 7565 const AArch64Subtarget *Subtarget) { 7566 if (!Subtarget->hasNEON()) 7567 return SDValue(); 7568 7569 SDValue Op = N->getOperand(0); 7570 if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) 7571 return SDValue(); 7572 7573 SDValue ConstVec = Op->getOperand(1); 7574 if (!isa<BuildVectorSDNode>(ConstVec)) 7575 return SDValue(); 7576 7577 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 7578 uint32_t FloatBits = FloatTy.getSizeInBits(); 7579 if (FloatBits != 32 && FloatBits != 64) 7580 return SDValue(); 7581 7582 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 7583 uint32_t IntBits = IntTy.getSizeInBits(); 7584 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 7585 return SDValue(); 7586 7587 // Avoid conversions where iN is larger than the float (e.g., float -> i64). 7588 if (IntBits > FloatBits) 7589 return SDValue(); 7590 7591 BitVector UndefElements; 7592 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 7593 int32_t Bits = IntBits == 64 ? 64 : 32; 7594 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); 7595 if (C == -1 || C == 0 || C > Bits) 7596 return SDValue(); 7597 7598 MVT ResTy; 7599 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 7600 switch (NumLanes) { 7601 default: 7602 return SDValue(); 7603 case 2: 7604 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 7605 break; 7606 case 4: 7607 ResTy = MVT::v4i32; 7608 break; 7609 } 7610 7611 SDLoc DL(N); 7612 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 7613 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs 7614 : Intrinsic::aarch64_neon_vcvtfp2fxu; 7615 SDValue FixConv = 7616 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, 7617 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), 7618 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); 7619 // We can handle smaller integers by generating an extra trunc. 7620 if (IntBits < FloatBits) 7621 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); 7622 7623 return FixConv; 7624 } 7625 7626 /// Fold a floating-point divide by power of two into fixed-point to 7627 /// floating-point conversion. 7628 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, 7629 const AArch64Subtarget *Subtarget) { 7630 if (!Subtarget->hasNEON()) 7631 return SDValue(); 7632 7633 SDValue Op = N->getOperand(0); 7634 unsigned Opc = Op->getOpcode(); 7635 if (!Op.getValueType().isVector() || 7636 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) 7637 return SDValue(); 7638 7639 SDValue ConstVec = N->getOperand(1); 7640 if (!isa<BuildVectorSDNode>(ConstVec)) 7641 return SDValue(); 7642 7643 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 7644 int32_t IntBits = IntTy.getSizeInBits(); 7645 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 7646 return SDValue(); 7647 7648 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 7649 int32_t FloatBits = FloatTy.getSizeInBits(); 7650 if (FloatBits != 32 && FloatBits != 64) 7651 return SDValue(); 7652 7653 // Avoid conversions where iN is larger than the float (e.g., i64 -> float). 
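// (A narrower integer source is handled further down by sign/zero-extending it to the float width first; a wider one cannot be expressed in this form.)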
7654 if (IntBits > FloatBits) 7655 return SDValue(); 7656 7657 BitVector UndefElements; 7658 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 7659 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); 7660 if (C == -1 || C == 0 || C > FloatBits) 7661 return SDValue(); 7662 7663 MVT ResTy; 7664 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 7665 switch (NumLanes) { 7666 default: 7667 return SDValue(); 7668 case 2: 7669 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 7670 break; 7671 case 4: 7672 ResTy = MVT::v4i32; 7673 break; 7674 } 7675 7676 SDLoc DL(N); 7677 SDValue ConvInput = Op.getOperand(0); 7678 bool IsSigned = Opc == ISD::SINT_TO_FP; 7679 if (IntBits < FloatBits) 7680 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, 7681 ResTy, ConvInput); 7682 7683 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp 7684 : Intrinsic::aarch64_neon_vcvtfxu2fp; 7685 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), 7686 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, 7687 DAG.getConstant(C, DL, MVT::i32)); 7688 } 7689 7690 /// An EXTR instruction is made up of two shifts, ORed together. This helper 7691 /// searches for and classifies those shifts. 7692 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 7693 bool &FromHi) { 7694 if (N.getOpcode() == ISD::SHL) 7695 FromHi = false; 7696 else if (N.getOpcode() == ISD::SRL) 7697 FromHi = true; 7698 else 7699 return false; 7700 7701 if (!isa<ConstantSDNode>(N.getOperand(1))) 7702 return false; 7703 7704 ShiftAmount = N->getConstantOperandVal(1); 7705 Src = N->getOperand(0); 7706 return true; 7707 } 7708 7709 /// EXTR instruction extracts a contiguous chunk of bits from two existing 7710 /// registers viewed as a high/low pair. This function looks for the pattern: 7711 /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an 7712 /// EXTR. Can't quite be done in TableGen because the two immediates aren't 7713 /// independent. 7714 static SDValue tryCombineToEXTR(SDNode *N, 7715 TargetLowering::DAGCombinerInfo &DCI) { 7716 SelectionDAG &DAG = DCI.DAG; 7717 SDLoc DL(N); 7718 EVT VT = N->getValueType(0); 7719 7720 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 7721 7722 if (VT != MVT::i32 && VT != MVT::i64) 7723 return SDValue(); 7724 7725 SDValue LHS; 7726 uint32_t ShiftLHS = 0; 7727 bool LHSFromHi = 0; 7728 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 7729 return SDValue(); 7730 7731 SDValue RHS; 7732 uint32_t ShiftRHS = 0; 7733 bool RHSFromHi = 0; 7734 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 7735 return SDValue(); 7736 7737 // If they're both trying to come from the high part of the register, they're 7738 // not really an EXTR. 
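// (findEXTRHalf treats SHL as the low-part contribution and SRL as the high-part one; two halves from the same side cannot describe one contiguous extract.)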
7739 if (LHSFromHi == RHSFromHi) 7740 return SDValue(); 7741 7742 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 7743 return SDValue(); 7744 7745 if (LHSFromHi) { 7746 std::swap(LHS, RHS); 7747 std::swap(ShiftLHS, ShiftRHS); 7748 } 7749 7750 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, 7751 DAG.getConstant(ShiftRHS, DL, MVT::i64)); 7752 } 7753 7754 static SDValue tryCombineToBSL(SDNode *N, 7755 TargetLowering::DAGCombinerInfo &DCI) { 7756 EVT VT = N->getValueType(0); 7757 SelectionDAG &DAG = DCI.DAG; 7758 SDLoc DL(N); 7759 7760 if (!VT.isVector()) 7761 return SDValue(); 7762 7763 SDValue N0 = N->getOperand(0); 7764 if (N0.getOpcode() != ISD::AND) 7765 return SDValue(); 7766 7767 SDValue N1 = N->getOperand(1); 7768 if (N1.getOpcode() != ISD::AND) 7769 return SDValue(); 7770 7771 // We only have to look for constant vectors here since the general, variable 7772 // case can be handled in TableGen. 7773 unsigned Bits = VT.getVectorElementType().getSizeInBits(); 7774 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); 7775 for (int i = 1; i >= 0; --i) 7776 for (int j = 1; j >= 0; --j) { 7777 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); 7778 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); 7779 if (!BVN0 || !BVN1) 7780 continue; 7781 7782 bool FoundMatch = true; 7783 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { 7784 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); 7785 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); 7786 if (!CN0 || !CN1 || 7787 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { 7788 FoundMatch = false; 7789 break; 7790 } 7791 } 7792 7793 if (FoundMatch) 7794 return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), 7795 N0->getOperand(1 - i), N1->getOperand(1 - j)); 7796 } 7797 7798 return SDValue(); 7799 } 7800 7801 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 7802 const AArch64Subtarget *Subtarget) { 7803 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) 7804 if (!EnableAArch64ExtrGeneration) 7805 return SDValue(); 7806 SelectionDAG &DAG = DCI.DAG; 7807 EVT VT = N->getValueType(0); 7808 7809 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 7810 return SDValue(); 7811 7812 SDValue Res = tryCombineToEXTR(N, DCI); 7813 if (Res.getNode()) 7814 return Res; 7815 7816 Res = tryCombineToBSL(N, DCI); 7817 if (Res.getNode()) 7818 return Res; 7819 7820 return SDValue(); 7821 } 7822 7823 static SDValue performBitcastCombine(SDNode *N, 7824 TargetLowering::DAGCombinerInfo &DCI, 7825 SelectionDAG &DAG) { 7826 // Wait 'til after everything is legalized to try this. That way we have 7827 // legal vector types and such. 7828 if (DCI.isBeforeLegalizeOps()) 7829 return SDValue(); 7830 7831 // Remove extraneous bitcasts around an extract_subvector. 7832 // For example, 7833 // (v4i16 (bitconvert 7834 // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) 7835 // becomes 7836 // (extract_subvector ((v8i16 ...), (i64 4))) 7837 7838 // Only interested in 64-bit vectors as the ultimate result. 7839 EVT VT = N->getValueType(0); 7840 if (!VT.isVector()) 7841 return SDValue(); 7842 if (VT.getSimpleVT().getSizeInBits() != 64) 7843 return SDValue(); 7844 // Is the operand an extract_subvector starting at the beginning or halfway 7845 // point of the vector? A low half may also come through as an 7846 // EXTRACT_SUBREG, so look for that, too. 
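// (A low-half extract can already appear here as a target EXTRACT_SUBREG of dsub, e.g. produced by an earlier run of this combine, so that form is accepted below as well.)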
7847 SDValue Op0 = N->getOperand(0); 7848 if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && 7849 !(Op0->isMachineOpcode() && 7850 Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) 7851 return SDValue(); 7852 uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue(); 7853 if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { 7854 if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) 7855 return SDValue(); 7856 } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { 7857 if (idx != AArch64::dsub) 7858 return SDValue(); 7859 // The dsub reference is equivalent to a lane zero subvector reference. 7860 idx = 0; 7861 } 7862 // Look through the bitcast of the input to the extract. 7863 if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) 7864 return SDValue(); 7865 SDValue Source = Op0->getOperand(0)->getOperand(0); 7866 // If the source type has twice the number of elements as our destination 7867 // type, we know this is an extract of the high or low half of the vector. 7868 EVT SVT = Source->getValueType(0); 7869 if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) 7870 return SDValue(); 7871 7872 DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); 7873 7874 // Create the simplified form to just extract the low or high half of the 7875 // vector directly rather than bothering with the bitcasts. 7876 SDLoc dl(N); 7877 unsigned NumElements = VT.getVectorNumElements(); 7878 if (idx) { 7879 SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); 7880 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); 7881 } else { 7882 SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); 7883 return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, 7884 Source, SubReg), 7885 0); 7886 } 7887 } 7888 7889 static SDValue performConcatVectorsCombine(SDNode *N, 7890 TargetLowering::DAGCombinerInfo &DCI, 7891 SelectionDAG &DAG) { 7892 SDLoc dl(N); 7893 EVT VT = N->getValueType(0); 7894 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 7895 7896 // Optimize concat_vectors of truncated vectors, where the intermediate 7897 // type is illegal, to avoid said illegality, e.g., 7898 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), 7899 // (v2i16 (truncate (v2i64))))) 7900 // -> 7901 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), 7902 // (v4i32 (bitcast (v2i64))), 7903 // <0, 2, 4, 6>))) 7904 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed 7905 // on both input and result type, so we might generate worse code. 7906 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. 7907 if (N->getNumOperands() == 2 && 7908 N0->getOpcode() == ISD::TRUNCATE && 7909 N1->getOpcode() == ISD::TRUNCATE) { 7910 SDValue N00 = N0->getOperand(0); 7911 SDValue N10 = N1->getOperand(0); 7912 EVT N00VT = N00.getValueType(); 7913 7914 if (N00VT == N10.getValueType() && 7915 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && 7916 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { 7917 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); 7918 SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); 7919 for (size_t i = 0; i < Mask.size(); ++i) 7920 Mask[i] = i * 2; 7921 return DAG.getNode(ISD::TRUNCATE, dl, VT, 7922 DAG.getVectorShuffle( 7923 MidVT, dl, 7924 DAG.getNode(ISD::BITCAST, dl, MidVT, N00), 7925 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); 7926 } 7927 } 7928 7929 // Wait 'til after everything is legalized to try this. 
That way we have 7930 // legal vector types and such. 7931 if (DCI.isBeforeLegalizeOps()) 7932 return SDValue(); 7933 7934 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector 7935 // splat. The indexed instructions are going to be expecting a DUPLANE64, so 7936 // canonicalise to that. 7937 if (N0 == N1 && VT.getVectorNumElements() == 2) { 7938 assert(VT.getVectorElementType().getSizeInBits() == 64); 7939 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), 7940 DAG.getConstant(0, dl, MVT::i64)); 7941 } 7942 7943 // Canonicalise concat_vectors so that the right-hand vector has as few 7944 // bit-casts as possible before its real operation. The primary matching 7945 // destination for these operations will be the narrowing "2" instructions, 7946 // which depend on the operation being performed on this right-hand vector. 7947 // For example, 7948 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) 7949 // becomes 7950 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) 7951 7952 if (N1->getOpcode() != ISD::BITCAST) 7953 return SDValue(); 7954 SDValue RHS = N1->getOperand(0); 7955 MVT RHSTy = RHS.getValueType().getSimpleVT(); 7956 // If the RHS is not a vector, this is not the pattern we're looking for. 7957 if (!RHSTy.isVector()) 7958 return SDValue(); 7959 7960 DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); 7961 7962 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), 7963 RHSTy.getVectorNumElements() * 2); 7964 return DAG.getNode(ISD::BITCAST, dl, VT, 7965 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, 7966 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), 7967 RHS)); 7968 } 7969 7970 static SDValue tryCombineFixedPointConvert(SDNode *N, 7971 TargetLowering::DAGCombinerInfo &DCI, 7972 SelectionDAG &DAG) { 7973 // Wait 'til after everything is legalized to try this. That way we have 7974 // legal vector types and such. 7975 if (DCI.isBeforeLegalizeOps()) 7976 return SDValue(); 7977 // Transform a scalar conversion of a value from a lane extract into a 7978 // lane extract of a vector conversion. E.g., from foo1 to foo2: 7979 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } 7980 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } 7981 // 7982 // The second form interacts better with instruction selection and the 7983 // register allocator to avoid cross-class register copies that aren't 7984 // coalescable due to a lane reference. 7985 7986 // Check the operand and see if it originates from a lane extract. 7987 SDValue Op1 = N->getOperand(1); 7988 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 7989 // Yep, no additional predication needed. Perform the transform. 7990 SDValue IID = N->getOperand(0); 7991 SDValue Shift = N->getOperand(2); 7992 SDValue Vec = Op1.getOperand(0); 7993 SDValue Lane = Op1.getOperand(1); 7994 EVT ResTy = N->getValueType(0); 7995 EVT VecResTy; 7996 SDLoc DL(N); 7997 7998 // The vector width should be 128 bits by the time we get here, even 7999 // if it started as 64 bits (the extract_vector handling will have 8000 // done so). 
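// Pick the FP vector type that matches the integer vector, so the fixed-point convert below can be performed on the whole vector before the lane is extracted.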
8001 assert(Vec.getValueType().getSizeInBits() == 128 && 8002 "unexpected vector size on extract_vector_elt!"); 8003 if (Vec.getValueType() == MVT::v4i32) 8004 VecResTy = MVT::v4f32; 8005 else if (Vec.getValueType() == MVT::v2i64) 8006 VecResTy = MVT::v2f64; 8007 else 8008 llvm_unreachable("unexpected vector type!"); 8009 8010 SDValue Convert = 8011 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); 8012 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); 8013 } 8014 return SDValue(); 8015 } 8016 8017 // AArch64 high-vector "long" operations are formed by performing the non-high 8018 // version on an extract_subvector of each operand which gets the high half: 8019 // 8020 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) 8021 // 8022 // However, there are cases which don't have an extract_high explicitly, but 8023 // have another operation that can be made compatible with one for free. For 8024 // example: 8025 // 8026 // (dupv64 scalar) --> (extract_high (dup128 scalar)) 8027 // 8028 // This routine does the actual conversion of such DUPs, once outer routines 8029 // have determined that everything else is in order. 8030 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold 8031 // similarly here. 8032 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { 8033 switch (N.getOpcode()) { 8034 case AArch64ISD::DUP: 8035 case AArch64ISD::DUPLANE8: 8036 case AArch64ISD::DUPLANE16: 8037 case AArch64ISD::DUPLANE32: 8038 case AArch64ISD::DUPLANE64: 8039 case AArch64ISD::MOVI: 8040 case AArch64ISD::MOVIshift: 8041 case AArch64ISD::MOVIedit: 8042 case AArch64ISD::MOVImsl: 8043 case AArch64ISD::MVNIshift: 8044 case AArch64ISD::MVNImsl: 8045 break; 8046 default: 8047 // FMOV could be supported, but isn't very useful, as it would only occur 8048 // if you passed a bitcast' floating point immediate to an eligible long 8049 // integer op (addl, smull, ...). 8050 return SDValue(); 8051 } 8052 8053 MVT NarrowTy = N.getSimpleValueType(); 8054 if (!NarrowTy.is64BitVector()) 8055 return SDValue(); 8056 8057 MVT ElementTy = NarrowTy.getVectorElementType(); 8058 unsigned NumElems = NarrowTy.getVectorNumElements(); 8059 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); 8060 8061 SDLoc dl(N); 8062 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, 8063 DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), 8064 DAG.getConstant(NumElems, dl, MVT::i64)); 8065 } 8066 8067 static bool isEssentiallyExtractSubvector(SDValue N) { 8068 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) 8069 return true; 8070 8071 return N.getOpcode() == ISD::BITCAST && 8072 N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; 8073 } 8074 8075 /// \brief Helper structure to keep track of ISD::SET_CC operands. 8076 struct GenericSetCCInfo { 8077 const SDValue *Opnd0; 8078 const SDValue *Opnd1; 8079 ISD::CondCode CC; 8080 }; 8081 8082 /// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. 8083 struct AArch64SetCCInfo { 8084 const SDValue *Cmp; 8085 AArch64CC::CondCode CC; 8086 }; 8087 8088 /// \brief Helper structure to keep track of SetCC information. 8089 union SetCCInfo { 8090 GenericSetCCInfo Generic; 8091 AArch64SetCCInfo AArch64; 8092 }; 8093 8094 /// \brief Helper structure to be able to read SetCC information. If set to 8095 /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a 8096 /// GenericSetCCInfo. 
8097 struct SetCCInfoAndKind { 8098 SetCCInfo Info; 8099 bool IsAArch64; 8100 }; 8101 8102 /// \brief Check whether or not \p Op is a SET_CC operation, either a generic or 8103 /// an 8104 /// AArch64 lowered one. 8105 /// \p SetCCInfo is filled accordingly. 8106 /// \post SetCCInfo is meaningful only when this function returns true. 8107 /// \return True when Op is a kind of SET_CC operation. 8108 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { 8109 // If this is a setcc, this is straightforward. 8110 if (Op.getOpcode() == ISD::SETCC) { 8111 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); 8112 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); 8113 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 8114 SetCCInfo.IsAArch64 = false; 8115 return true; 8116 } 8117 // Otherwise, check if this is a matching csel instruction. 8118 // In other words: 8119 // - csel 1, 0, cc 8120 // - csel 0, 1, !cc 8121 if (Op.getOpcode() != AArch64ISD::CSEL) 8122 return false; 8123 // Set the information about the operands. 8124 // TODO: we want the operands of the Cmp not the csel 8125 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); 8126 SetCCInfo.IsAArch64 = true; 8127 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>( 8128 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 8129 8130 // Check that the operands match the constraints: 8131 // (1) Both operands must be constants. 8132 // (2) One must be 1 and the other must be 0. 8133 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0)); 8134 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 8135 8136 // Check (1). 8137 if (!TValue || !FValue) 8138 return false; 8139 8140 // Check (2). 8141 if (!TValue->isOne()) { 8142 // Update the comparison when we are interested in !cc. 8143 std::swap(TValue, FValue); 8144 SetCCInfo.Info.AArch64.CC = 8145 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); 8146 } 8147 return TValue->isOne() && FValue->isNullValue(); 8148 } 8149 8150 // Returns true if Op is setcc or zext of setcc. 8151 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { 8152 if (isSetCC(Op, Info)) 8153 return true; 8154 return ((Op.getOpcode() == ISD::ZERO_EXTEND) && 8155 isSetCC(Op->getOperand(0), Info)); 8156 } 8157 8158 // The folding we want to perform is: 8159 // (add x, [zext] (setcc cc ...) ) 8160 // --> 8161 // (csel x, (add x, 1), !cc ...) 8162 // 8163 // The latter will get matched to a CSINC instruction. 8164 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { 8165 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); 8166 SDValue LHS = Op->getOperand(0); 8167 SDValue RHS = Op->getOperand(1); 8168 SetCCInfoAndKind InfoAndKind; 8169 8170 // If neither operand is a SET_CC, give up. 8171 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { 8172 std::swap(LHS, RHS); 8173 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) 8174 return SDValue(); 8175 } 8176 8177 // FIXME: This could be generalized to work for FP comparisons. 8178 EVT CmpVT = InfoAndKind.IsAArch64 8179 ?
InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() 8180 : InfoAndKind.Info.Generic.Opnd0->getValueType(); 8181 if (CmpVT != MVT::i32 && CmpVT != MVT::i64) 8182 return SDValue(); 8183 8184 SDValue CCVal; 8185 SDValue Cmp; 8186 SDLoc dl(Op); 8187 if (InfoAndKind.IsAArch64) { 8188 CCVal = DAG.getConstant( 8189 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, 8190 MVT::i32); 8191 Cmp = *InfoAndKind.Info.AArch64.Cmp; 8192 } else 8193 Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, 8194 *InfoAndKind.Info.Generic.Opnd1, 8195 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), 8196 CCVal, DAG, dl); 8197 8198 EVT VT = Op->getValueType(0); 8199 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); 8200 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); 8201 } 8202 8203 // The basic add/sub long vector instructions have variants with "2" on the end 8204 // which act on the high-half of their inputs. They are normally matched by 8205 // patterns like: 8206 // 8207 // (add (zeroext (extract_high LHS)), 8208 // (zeroext (extract_high RHS))) 8209 // -> uaddl2 vD, vN, vM 8210 // 8211 // However, if one of the extracts is something like a duplicate, this 8212 // instruction can still be used profitably. This function puts the DAG into a 8213 // more appropriate form for those patterns to trigger. 8214 static SDValue performAddSubLongCombine(SDNode *N, 8215 TargetLowering::DAGCombinerInfo &DCI, 8216 SelectionDAG &DAG) { 8217 if (DCI.isBeforeLegalizeOps()) 8218 return SDValue(); 8219 8220 MVT VT = N->getSimpleValueType(0); 8221 if (!VT.is128BitVector()) { 8222 if (N->getOpcode() == ISD::ADD) 8223 return performSetccAddFolding(N, DAG); 8224 return SDValue(); 8225 } 8226 8227 // Make sure both branches are extended in the same way. 8228 SDValue LHS = N->getOperand(0); 8229 SDValue RHS = N->getOperand(1); 8230 if ((LHS.getOpcode() != ISD::ZERO_EXTEND && 8231 LHS.getOpcode() != ISD::SIGN_EXTEND) || 8232 LHS.getOpcode() != RHS.getOpcode()) 8233 return SDValue(); 8234 8235 unsigned ExtType = LHS.getOpcode(); 8236 8237 // It's not worth doing if at least one of the inputs isn't already an 8238 // extract, but we don't know which it'll be so we have to try both. 8239 if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { 8240 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); 8241 if (!RHS.getNode()) 8242 return SDValue(); 8243 8244 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); 8245 } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { 8246 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); 8247 if (!LHS.getNode()) 8248 return SDValue(); 8249 8250 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); 8251 } 8252 8253 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); 8254 } 8255 8256 // Massage DAGs which we can use the high-half "long" operations on into 8257 // something isel will recognize better. E.g. 
8258 // 8259 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> 8260 // (aarch64_neon_umull (extract_high (v2i64 vec))) 8261 // (extract_high (v2i64 (dup128 scalar))))) 8262 // 8263 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, 8264 TargetLowering::DAGCombinerInfo &DCI, 8265 SelectionDAG &DAG) { 8266 if (DCI.isBeforeLegalizeOps()) 8267 return SDValue(); 8268 8269 SDValue LHS = N->getOperand(1); 8270 SDValue RHS = N->getOperand(2); 8271 assert(LHS.getValueType().is64BitVector() && 8272 RHS.getValueType().is64BitVector() && 8273 "unexpected shape for long operation"); 8274 8275 // Either node could be a DUP, but it's not worth doing both of them (you'd 8276 // just as well use the non-high version) so look for a corresponding extract 8277 // operation on the other "wing". 8278 if (isEssentiallyExtractSubvector(LHS)) { 8279 RHS = tryExtendDUPToExtractHigh(RHS, DAG); 8280 if (!RHS.getNode()) 8281 return SDValue(); 8282 } else if (isEssentiallyExtractSubvector(RHS)) { 8283 LHS = tryExtendDUPToExtractHigh(LHS, DAG); 8284 if (!LHS.getNode()) 8285 return SDValue(); 8286 } 8287 8288 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), 8289 N->getOperand(0), LHS, RHS); 8290 } 8291 8292 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { 8293 MVT ElemTy = N->getSimpleValueType(0).getScalarType(); 8294 unsigned ElemBits = ElemTy.getSizeInBits(); 8295 8296 int64_t ShiftAmount; 8297 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { 8298 APInt SplatValue, SplatUndef; 8299 unsigned SplatBitSize; 8300 bool HasAnyUndefs; 8301 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 8302 HasAnyUndefs, ElemBits) || 8303 SplatBitSize != ElemBits) 8304 return SDValue(); 8305 8306 ShiftAmount = SplatValue.getSExtValue(); 8307 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 8308 ShiftAmount = CVN->getSExtValue(); 8309 } else 8310 return SDValue(); 8311 8312 unsigned Opcode; 8313 bool IsRightShift; 8314 switch (IID) { 8315 default: 8316 llvm_unreachable("Unknown shift intrinsic"); 8317 case Intrinsic::aarch64_neon_sqshl: 8318 Opcode = AArch64ISD::SQSHL_I; 8319 IsRightShift = false; 8320 break; 8321 case Intrinsic::aarch64_neon_uqshl: 8322 Opcode = AArch64ISD::UQSHL_I; 8323 IsRightShift = false; 8324 break; 8325 case Intrinsic::aarch64_neon_srshl: 8326 Opcode = AArch64ISD::SRSHR_I; 8327 IsRightShift = true; 8328 break; 8329 case Intrinsic::aarch64_neon_urshl: 8330 Opcode = AArch64ISD::URSHR_I; 8331 IsRightShift = true; 8332 break; 8333 case Intrinsic::aarch64_neon_sqshlu: 8334 Opcode = AArch64ISD::SQSHLU_I; 8335 IsRightShift = false; 8336 break; 8337 } 8338 8339 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { 8340 SDLoc dl(N); 8341 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 8342 DAG.getConstant(-ShiftAmount, dl, MVT::i32)); 8343 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { 8344 SDLoc dl(N); 8345 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 8346 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 8347 } 8348 8349 return SDValue(); 8350 } 8351 8352 // The CRC32[BH] instructions ignore the high bits of their data operand. Since 8353 // the intrinsics must be legal and take an i32, this means there's almost 8354 // certainly going to be a zext in the DAG which we can eliminate. 
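// For example, a crc32b whose data operand is (and x, 0xff) can use x directly, since the instruction only reads the low byte of that operand anyway.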
8355 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { 8356 SDValue AndN = N->getOperand(2); 8357 if (AndN.getOpcode() != ISD::AND) 8358 return SDValue(); 8359 8360 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); 8361 if (!CMask || CMask->getZExtValue() != Mask) 8362 return SDValue(); 8363 8364 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, 8365 N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); 8366 } 8367 8368 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, 8369 SelectionDAG &DAG) { 8370 SDLoc dl(N); 8371 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), 8372 DAG.getNode(Opc, dl, 8373 N->getOperand(1).getSimpleValueType(), 8374 N->getOperand(1)), 8375 DAG.getConstant(0, dl, MVT::i64)); 8376 } 8377 8378 static SDValue performIntrinsicCombine(SDNode *N, 8379 TargetLowering::DAGCombinerInfo &DCI, 8380 const AArch64Subtarget *Subtarget) { 8381 SelectionDAG &DAG = DCI.DAG; 8382 unsigned IID = getIntrinsicID(N); 8383 switch (IID) { 8384 default: 8385 break; 8386 case Intrinsic::aarch64_neon_vcvtfxs2fp: 8387 case Intrinsic::aarch64_neon_vcvtfxu2fp: 8388 return tryCombineFixedPointConvert(N, DCI, DAG); 8389 case Intrinsic::aarch64_neon_saddv: 8390 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); 8391 case Intrinsic::aarch64_neon_uaddv: 8392 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); 8393 case Intrinsic::aarch64_neon_sminv: 8394 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); 8395 case Intrinsic::aarch64_neon_uminv: 8396 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); 8397 case Intrinsic::aarch64_neon_smaxv: 8398 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); 8399 case Intrinsic::aarch64_neon_umaxv: 8400 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); 8401 case Intrinsic::aarch64_neon_fmax: 8402 return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), 8403 N->getOperand(1), N->getOperand(2)); 8404 case Intrinsic::aarch64_neon_fmin: 8405 return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), 8406 N->getOperand(1), N->getOperand(2)); 8407 case Intrinsic::aarch64_neon_fmaxnm: 8408 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), 8409 N->getOperand(1), N->getOperand(2)); 8410 case Intrinsic::aarch64_neon_fminnm: 8411 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), 8412 N->getOperand(1), N->getOperand(2)); 8413 case Intrinsic::aarch64_neon_smull: 8414 case Intrinsic::aarch64_neon_umull: 8415 case Intrinsic::aarch64_neon_pmull: 8416 case Intrinsic::aarch64_neon_sqdmull: 8417 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 8418 case Intrinsic::aarch64_neon_sqshl: 8419 case Intrinsic::aarch64_neon_uqshl: 8420 case Intrinsic::aarch64_neon_sqshlu: 8421 case Intrinsic::aarch64_neon_srshl: 8422 case Intrinsic::aarch64_neon_urshl: 8423 return tryCombineShiftImm(IID, N, DAG); 8424 case Intrinsic::aarch64_crc32b: 8425 case Intrinsic::aarch64_crc32cb: 8426 return tryCombineCRC32(0xff, N, DAG); 8427 case Intrinsic::aarch64_crc32h: 8428 case Intrinsic::aarch64_crc32ch: 8429 return tryCombineCRC32(0xffff, N, DAG); 8430 } 8431 return SDValue(); 8432 } 8433 8434 static SDValue performExtendCombine(SDNode *N, 8435 TargetLowering::DAGCombinerInfo &DCI, 8436 SelectionDAG &DAG) { 8437 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 8438 // we can convert that DUP into another extract_high (of a bigger DUP), which 8439 // helps the backend to 
decide that an sabdl2 would be useful, saving a real 8440 // extract_high operation. 8441 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 8442 N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 8443 SDNode *ABDNode = N->getOperand(0).getNode(); 8444 unsigned IID = getIntrinsicID(ABDNode); 8445 if (IID == Intrinsic::aarch64_neon_sabd || 8446 IID == Intrinsic::aarch64_neon_uabd) { 8447 SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); 8448 if (!NewABD.getNode()) 8449 return SDValue(); 8450 8451 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), 8452 NewABD); 8453 } 8454 } 8455 8456 // This is effectively a custom type legalization for AArch64. 8457 // 8458 // Type legalization will split an extend of a small, legal, type to a larger 8459 // illegal type by first splitting the destination type, often creating 8460 // illegal source types, which then get legalized in isel-confusing ways, 8461 // leading to really terrible codegen. E.g., 8462 // %result = v8i32 sext v8i8 %value 8463 // becomes 8464 // %losrc = extract_subreg %value, ... 8465 // %hisrc = extract_subreg %value, ... 8466 // %lo = v4i32 sext v4i8 %losrc 8467 // %hi = v4i32 sext v4i8 %hisrc 8468 // Things go rapidly downhill from there. 8469 // 8470 // For AArch64, the [sz]ext vector instructions can only go up one element 8471 // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 8472 // take two instructions. 8473 // 8474 // This implies that the most efficient way to do the extend from v8i8 8475 // to two v4i32 values is to first extend the v8i8 to v8i16, then do 8476 // the normal splitting to happen for the v8i16->v8i32. 8477 8478 // This is pre-legalization to catch some cases where the default 8479 // type legalization will create ill-tempered code. 8480 if (!DCI.isBeforeLegalizeOps()) 8481 return SDValue(); 8482 8483 // We're only interested in cleaning things up for non-legal vector types 8484 // here. If both the source and destination are legal, things will just 8485 // work naturally without any fiddling. 8486 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8487 EVT ResVT = N->getValueType(0); 8488 if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) 8489 return SDValue(); 8490 // If the vector type isn't a simple VT, it's beyond the scope of what 8491 // we're worried about here. Let legalization do its thing and hope for 8492 // the best. 8493 SDValue Src = N->getOperand(0); 8494 EVT SrcVT = Src->getValueType(0); 8495 if (!ResVT.isSimple() || !SrcVT.isSimple()) 8496 return SDValue(); 8497 8498 // If the source VT is a 64-bit vector, we can play games and get the 8499 // better results we want. 8500 if (SrcVT.getSizeInBits() != 64) 8501 return SDValue(); 8502 8503 unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); 8504 unsigned ElementCount = SrcVT.getVectorNumElements(); 8505 SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); 8506 SDLoc DL(N); 8507 Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); 8508 8509 // Now split the rest of the operation into two halves, each with a 64 8510 // bit source. 
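// E.g. for the v8i8 -> v8i32 case above, Src is now a v8i16 value, and the two halves built below are v4i16 -> v4i32 extends, each a single-step widening the target handles directly.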
8511 EVT LoVT, HiVT; 8512 SDValue Lo, Hi; 8513 unsigned NumElements = ResVT.getVectorNumElements(); 8514 assert(!(NumElements & 1) && "Splitting vector, but not in half!"); 8515 LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), 8516 ResVT.getVectorElementType(), NumElements / 2); 8517 8518 EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), 8519 LoVT.getVectorNumElements()); 8520 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 8521 DAG.getConstant(0, DL, MVT::i64)); 8522 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 8523 DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); 8524 Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); 8525 Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); 8526 8527 // Now combine the parts back together so we still have a single result 8528 // like the combiner expects. 8529 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); 8530 } 8531 8532 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar 8533 /// value. The load store optimizer pass will merge them to store pair stores. 8534 /// This has better performance than a splat of the scalar followed by a split 8535 /// vector store. Even if the stores are not merged it is four stores vs a dup, 8536 /// followed by an ext.b and two stores. 8537 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { 8538 SDValue StVal = St->getValue(); 8539 EVT VT = StVal.getValueType(); 8540 8541 // Don't replace floating point stores, they possibly won't be transformed to 8542 // stp because of the store pair suppress pass. 8543 if (VT.isFloatingPoint()) 8544 return SDValue(); 8545 8546 // Check for insert vector elements. 8547 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) 8548 return SDValue(); 8549 8550 // We can express a splat as store pair(s) for 2 or 4 elements. 8551 unsigned NumVecElts = VT.getVectorNumElements(); 8552 if (NumVecElts != 4 && NumVecElts != 2) 8553 return SDValue(); 8554 SDValue SplatVal = StVal.getOperand(1); 8555 unsigned RemainInsertElts = NumVecElts - 1; 8556 8557 // Check that this is a splat. 8558 while (--RemainInsertElts) { 8559 SDValue NextInsertElt = StVal.getOperand(0); 8560 if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) 8561 return SDValue(); 8562 if (NextInsertElt.getOperand(1) != SplatVal) 8563 return SDValue(); 8564 StVal = NextInsertElt; 8565 } 8566 unsigned OrigAlignment = St->getAlignment(); 8567 unsigned EltOffset = NumVecElts == 4 ? 4 : 8; 8568 unsigned Alignment = std::min(OrigAlignment, EltOffset); 8569 8570 // Create scalar stores. This is at least as good as the code sequence for a 8571 // split unaligned store which is a dup.s, ext.b, and two stores. 8572 // Most of the time the three stores should be replaced by store pair 8573 // instructions (stp). 
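// For example, a v4i32 splat of %v becomes four scalar stores of %v at offsets 0, 4, 8 and 12, which the load/store optimizer can then merge into two stp instructions.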
8574 SDLoc DL(St); 8575 SDValue BasePtr = St->getBasePtr(); 8576 SDValue NewST1 = 8577 DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), 8578 St->isVolatile(), St->isNonTemporal(), St->getAlignment()); 8579 8580 unsigned Offset = EltOffset; 8581 while (--NumVecElts) { 8582 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 8583 DAG.getConstant(Offset, DL, MVT::i64)); 8584 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, 8585 St->getPointerInfo(), St->isVolatile(), 8586 St->isNonTemporal(), Alignment); 8587 Offset += EltOffset; 8588 } 8589 return NewST1; 8590 } 8591 8592 static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 8593 SelectionDAG &DAG, 8594 const AArch64Subtarget *Subtarget) { 8595 if (!DCI.isBeforeLegalize()) 8596 return SDValue(); 8597 8598 StoreSDNode *S = cast<StoreSDNode>(N); 8599 if (S->isVolatile()) 8600 return SDValue(); 8601 8602 // FIXME: The logic for deciding if an unaligned store should be split should 8603 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be 8604 // a call to that function here. 8605 8606 // Cyclone has bad performance on unaligned 16B stores when crossing line and 8607 // page boundaries. We want to split such stores. 8608 if (!Subtarget->isCyclone()) 8609 return SDValue(); 8610 8611 // Don't split at -Oz. 8612 if (DAG.getMachineFunction().getFunction()->optForMinSize()) 8613 return SDValue(); 8614 8615 SDValue StVal = S->getValue(); 8616 EVT VT = StVal.getValueType(); 8617 8618 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting 8619 // those up regresses performance on micro-benchmarks and olden/bh. 8620 if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) 8621 return SDValue(); 8622 8623 // Split unaligned 16B stores. They are terrible for performance. 8624 // Don't split stores with alignment of 1 or 2. Code that uses clang vector 8625 // extensions can use this to mark that it does not want splitting to happen 8626 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of 8627 // eliminating alignment hazards is only 1 in 8 for alignment of 2. 8628 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || 8629 S->getAlignment() <= 2) 8630 return SDValue(); 8631 8632 // If we get a splat of a scalar convert this vector store to a store of 8633 // scalars. They will be merged into store pairs thereby removing two 8634 // instructions. 8635 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) 8636 return ReplacedSplat; 8637 8638 SDLoc DL(S); 8639 unsigned NumElts = VT.getVectorNumElements() / 2; 8640 // Split VT into two. 
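// (E.g. a v4i32 store becomes two v2i32 stores, 8 bytes apart.)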
8641 EVT HalfVT = 8642 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); 8643 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 8644 DAG.getConstant(0, DL, MVT::i64)); 8645 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 8646 DAG.getConstant(NumElts, DL, MVT::i64)); 8647 SDValue BasePtr = S->getBasePtr(); 8648 SDValue NewST1 = 8649 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), 8650 S->isVolatile(), S->isNonTemporal(), S->getAlignment()); 8651 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 8652 DAG.getConstant(8, DL, MVT::i64)); 8653 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, 8654 S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), 8655 S->getAlignment()); 8656 } 8657 8658 /// Target-specific DAG combine function for post-increment LD1 (lane) and 8659 /// post-increment LD1R. 8660 static SDValue performPostLD1Combine(SDNode *N, 8661 TargetLowering::DAGCombinerInfo &DCI, 8662 bool IsLaneOp) { 8663 if (DCI.isBeforeLegalizeOps()) 8664 return SDValue(); 8665 8666 SelectionDAG &DAG = DCI.DAG; 8667 EVT VT = N->getValueType(0); 8668 8669 unsigned LoadIdx = IsLaneOp ? 1 : 0; 8670 SDNode *LD = N->getOperand(LoadIdx).getNode(); 8671 // If it is not LOAD, can not do such combine. 8672 if (LD->getOpcode() != ISD::LOAD) 8673 return SDValue(); 8674 8675 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); 8676 EVT MemVT = LoadSDN->getMemoryVT(); 8677 // Check if memory operand is the same type as the vector element. 8678 if (MemVT != VT.getVectorElementType()) 8679 return SDValue(); 8680 8681 // Check if there are other uses. If so, do not combine as it will introduce 8682 // an extra load. 8683 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; 8684 ++UI) { 8685 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. 8686 continue; 8687 if (*UI != N) 8688 return SDValue(); 8689 } 8690 8691 SDValue Addr = LD->getOperand(1); 8692 SDValue Vector = N->getOperand(0); 8693 // Search for a use of the address operand that is an increment. 8694 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = 8695 Addr.getNode()->use_end(); UI != UE; ++UI) { 8696 SDNode *User = *UI; 8697 if (User->getOpcode() != ISD::ADD 8698 || UI.getUse().getResNo() != Addr.getResNo()) 8699 continue; 8700 8701 // Check that the add is independent of the load. Otherwise, folding it 8702 // would create a cycle. 8703 if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) 8704 continue; 8705 // Also check that add is not used in the vector operand. This would also 8706 // create a cycle. 8707 if (User->isPredecessorOf(Vector.getNode())) 8708 continue; 8709 8710 // If the increment is a constant, it must match the memory ref size. 8711 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 8712 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 8713 uint32_t IncVal = CInc->getZExtValue(); 8714 unsigned NumBytes = VT.getScalarSizeInBits() / 8; 8715 if (IncVal != NumBytes) 8716 continue; 8717 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 8718 } 8719 8720 // Finally, check that the vector doesn't depend on the load. 8721 // Again, this would create a cycle. 8722 // The load depending on the vector is fine, as that's the case for the 8723 // LD1*post we'll eventually generate anyway. 
8724 if (LoadSDN->isPredecessorOf(Vector.getNode())) 8725 continue; 8726 8727 SmallVector<SDValue, 8> Ops; 8728 Ops.push_back(LD->getOperand(0)); // Chain 8729 if (IsLaneOp) { 8730 Ops.push_back(Vector); // The vector to be inserted 8731 Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector 8732 } 8733 Ops.push_back(Addr); 8734 Ops.push_back(Inc); 8735 8736 EVT Tys[3] = { VT, MVT::i64, MVT::Other }; 8737 SDVTList SDTys = DAG.getVTList(Tys); 8738 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; 8739 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, 8740 MemVT, 8741 LoadSDN->getMemOperand()); 8742 8743 // Update the uses. 8744 SmallVector<SDValue, 2> NewResults; 8745 NewResults.push_back(SDValue(LD, 0)); // The result of load 8746 NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain 8747 DCI.CombineTo(LD, NewResults); 8748 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result 8749 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register 8750 8751 break; 8752 } 8753 return SDValue(); 8754 } 8755 8756 /// Simplify \Addr given that the top byte of it is ignored by HW during 8757 /// address translation. 8758 static bool performTBISimplification(SDValue Addr, 8759 TargetLowering::DAGCombinerInfo &DCI, 8760 SelectionDAG &DAG) { 8761 APInt DemandedMask = APInt::getLowBitsSet(64, 56); 8762 APInt KnownZero, KnownOne; 8763 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), 8764 DCI.isBeforeLegalizeOps()); 8765 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8766 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { 8767 DCI.CommitTargetLoweringOpt(TLO); 8768 return true; 8769 } 8770 return false; 8771 } 8772 8773 static SDValue performSTORECombine(SDNode *N, 8774 TargetLowering::DAGCombinerInfo &DCI, 8775 SelectionDAG &DAG, 8776 const AArch64Subtarget *Subtarget) { 8777 SDValue Split = split16BStores(N, DCI, DAG, Subtarget); 8778 if (Split.getNode()) 8779 return Split; 8780 8781 if (Subtarget->supportsAddressTopByteIgnored() && 8782 performTBISimplification(N->getOperand(2), DCI, DAG)) 8783 return SDValue(N, 0); 8784 8785 return SDValue(); 8786 } 8787 8788 /// This function handles the log2-shuffle pattern produced by the 8789 /// LoopVectorizer for the across vector reduction. It consists of 8790 /// log2(NumVectorElements) steps and, in each step, 2^(s) elements 8791 /// are reduced, where s is an induction variable from 0 to 8792 /// log2(NumVectorElements). 8793 static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, 8794 unsigned Op, 8795 SelectionDAG &DAG) { 8796 EVT VTy = OpV->getOperand(0).getValueType(); 8797 if (!VTy.isVector()) 8798 return SDValue(); 8799 8800 int NumVecElts = VTy.getVectorNumElements(); 8801 if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { 8802 if (NumVecElts != 4) 8803 return SDValue(); 8804 } else { 8805 if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) 8806 return SDValue(); 8807 } 8808 8809 int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); 8810 SDValue PreOp = OpV; 8811 // Iterate over each step of the across vector reduction. 8812 for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { 8813 SDValue CurOp = PreOp.getOperand(0); 8814 SDValue Shuffle = PreOp.getOperand(1); 8815 if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { 8816 // Try to swap the 1st and 2nd operand as add and min/max instructions 8817 // are commutative. 
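// (The vectorizer may emit either (op %cur, %shuffle) or (op %shuffle, %cur); both describe the same reduction step.)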
8818 CurOp = PreOp.getOperand(1); 8819 Shuffle = PreOp.getOperand(0); 8820 if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) 8821 return SDValue(); 8822 } 8823 8824 // Check if the input vector is fed by the operator we want to handle, 8825 // except the last step; the very first input vector is not necessarily 8826 // the same operator we are handling. 8827 if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) 8828 return SDValue(); 8829 8830 // Check if it forms one step of the across vector reduction. 8831 // E.g., 8832 // %cur = add %1, %0 8833 // %shuffle = vector_shuffle %cur, <2, 3, u, u> 8834 // %pre = add %cur, %shuffle 8835 if (Shuffle.getOperand(0) != CurOp) 8836 return SDValue(); 8837 8838 int NumMaskElts = 1 << CurStep; 8839 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); 8840 // Check mask values in each step. 8841 // We expect the shuffle mask in each step follows a specific pattern 8842 // denoted here by the <M, U> form, where M is a sequence of integers 8843 // starting from NumMaskElts, increasing by 1, and the number integers 8844 // in M should be NumMaskElts. U is a sequence of UNDEFs and the number 8845 // of undef in U should be NumVecElts - NumMaskElts. 8846 // E.g., for <8 x i16>, mask values in each step should be : 8847 // step 0 : <1,u,u,u,u,u,u,u> 8848 // step 1 : <2,3,u,u,u,u,u,u> 8849 // step 2 : <4,5,6,7,u,u,u,u> 8850 for (int i = 0; i < NumVecElts; ++i) 8851 if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || 8852 (i >= NumMaskElts && !(Mask[i] < 0))) 8853 return SDValue(); 8854 8855 PreOp = CurOp; 8856 } 8857 unsigned Opcode; 8858 bool IsIntrinsic = false; 8859 8860 switch (Op) { 8861 default: 8862 llvm_unreachable("Unexpected operator for across vector reduction"); 8863 case ISD::ADD: 8864 Opcode = AArch64ISD::UADDV; 8865 break; 8866 case ISD::SMAX: 8867 Opcode = AArch64ISD::SMAXV; 8868 break; 8869 case ISD::UMAX: 8870 Opcode = AArch64ISD::UMAXV; 8871 break; 8872 case ISD::SMIN: 8873 Opcode = AArch64ISD::SMINV; 8874 break; 8875 case ISD::UMIN: 8876 Opcode = AArch64ISD::UMINV; 8877 break; 8878 case ISD::FMAXNUM: 8879 Opcode = Intrinsic::aarch64_neon_fmaxnmv; 8880 IsIntrinsic = true; 8881 break; 8882 case ISD::FMINNUM: 8883 Opcode = Intrinsic::aarch64_neon_fminnmv; 8884 IsIntrinsic = true; 8885 break; 8886 } 8887 SDLoc DL(N); 8888 8889 return IsIntrinsic 8890 ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), 8891 DAG.getConstant(Opcode, DL, MVT::i32), PreOp) 8892 : DAG.getNode( 8893 ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), 8894 DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), 8895 DAG.getConstant(0, DL, MVT::i64)); 8896 } 8897 8898 /// Target-specific DAG combine for the across vector min/max reductions. 8899 /// This function specifically handles the final clean-up step of the vector 8900 /// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle 8901 /// pattern, which narrows down and finds the final min/max value from all 8902 /// elements of the vector. 
8903 /// For example, for a <16 x i8> vector :
8904 ///   %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
8905 ///   %smax0 = smax %0, %svn0
8906 ///   %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
8907 ///   %smax1 = smax %smax0, %svn1
8908 ///   %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
8909 ///   %smax2 = smax %smax1, %svn2
8910 ///   %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
8911 ///   %sc = setcc %smax2, %svn3, gt
8912 ///   %n0 = extract_vector_elt %sc, #0
8913 ///   %n1 = extract_vector_elt %smax2, #0
8914 ///   %n2 = extract_vector_elt %smax2, #1
8915 ///   %result = select %n0, %n1, %n2
8916 /// becomes :
8917 ///   %1 = smaxv %0
8918 ///   %result = extract_vector_elt %1, 0
8919 static SDValue
8920 performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
8921                                         const AArch64Subtarget *Subtarget) {
8922   if (!Subtarget->hasNEON())
8923     return SDValue();
8924
8925   SDValue N0 = N->getOperand(0);
8926   SDValue IfTrue = N->getOperand(1);
8927   SDValue IfFalse = N->getOperand(2);
8928
8929   // Check if the SELECT merges up the final result of the min/max
8930   // from a vector.
8931   if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8932       IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8933       IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8934     return SDValue();
8935
8936   // Expect N0 to be fed by a SETCC.
8937   SDValue SetCC = N0.getOperand(0);
8938   EVT SetCCVT = SetCC.getValueType();
8939   if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
8940       SetCCVT.getVectorElementType() != MVT::i1)
8941     return SDValue();
8942
8943   SDValue VectorOp = SetCC.getOperand(0);
8944   unsigned Op = VectorOp->getOpcode();
8945   // Check if the input vector is fed by the operator we want to handle.
8946   if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
8947       Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
8948     return SDValue();
8949
8950   EVT VTy = VectorOp.getValueType();
8951   if (!VTy.isVector())
8952     return SDValue();
8953
8954   if (VTy.getSizeInBits() < 64)
8955     return SDValue();
8956
8957   EVT EltTy = VTy.getVectorElementType();
8958   if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
8959     if (EltTy != MVT::f32)
8960       return SDValue();
8961   } else {
8962     if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
8963       return SDValue();
8964   }
8965
8966   // Check if extracting from the same vector.
8967   // For example,
8968   //   %sc = setcc %vector, %svn1, gt
8969   //   %n0 = extract_vector_elt %sc, #0
8970   //   %n1 = extract_vector_elt %vector, #0
8971   //   %n2 = extract_vector_elt %vector, #1
8972   if (!(VectorOp == IfTrue->getOperand(0) &&
8973         VectorOp == IfFalse->getOperand(0)))
8974     return SDValue();
8975
8976   // Check if the condition code is matched with the operator type.
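  // E.g. (illustrative): an SMAX reduction is expected to end with a signed
  // greater-than / greater-or-equal compare (SETGT/SETGE) selecting between
  // lanes 0 and 1; any other predicate means this is not the clean-up step
  // we recognize.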
8977 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); 8978 if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || 8979 (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || 8980 (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || 8981 (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || 8982 (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && 8983 CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && 8984 CC != ISD::SETGE) || 8985 (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && 8986 CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && 8987 CC != ISD::SETLE)) 8988 return SDValue(); 8989 8990 // Expect to check only lane 0 from the vector SETCC. 8991 if (!isNullConstant(N0.getOperand(1))) 8992 return SDValue(); 8993 8994 // Expect to extract the true value from lane 0. 8995 if (!isNullConstant(IfTrue.getOperand(1))) 8996 return SDValue(); 8997 8998 // Expect to extract the false value from lane 1. 8999 if (!isOneConstant(IfFalse.getOperand(1))) 9000 return SDValue(); 9001 9002 return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); 9003 } 9004 9005 /// Target-specific DAG combine for the across vector add reduction. 9006 /// This function specifically handles the final clean-up step of the vector 9007 /// add reduction produced by the LoopVectorizer. It is the log2-shuffle 9008 /// pattern, which adds all elements of a vector together. 9009 /// For example, for a <4 x i32> vector : 9010 /// %1 = vector_shuffle %0, <2,3,u,u> 9011 /// %2 = add %0, %1 9012 /// %3 = vector_shuffle %2, <1,u,u,u> 9013 /// %4 = add %2, %3 9014 /// %result = extract_vector_elt %4, 0 9015 /// becomes : 9016 /// %0 = uaddv %0 9017 /// %result = extract_vector_elt %0, 0 9018 static SDValue 9019 performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, 9020 const AArch64Subtarget *Subtarget) { 9021 if (!Subtarget->hasNEON()) 9022 return SDValue(); 9023 SDValue N0 = N->getOperand(0); 9024 SDValue N1 = N->getOperand(1); 9025 9026 // Check if the input vector is fed by the ADD. 9027 if (N0->getOpcode() != ISD::ADD) 9028 return SDValue(); 9029 9030 // The vector extract idx must constant zero because we only expect the final 9031 // result of the reduction is placed in lane 0. 9032 if (!isNullConstant(N1)) 9033 return SDValue(); 9034 9035 EVT VTy = N0.getValueType(); 9036 if (!VTy.isVector()) 9037 return SDValue(); 9038 9039 EVT EltTy = VTy.getVectorElementType(); 9040 if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) 9041 return SDValue(); 9042 9043 if (VTy.getSizeInBits() < 64) 9044 return SDValue(); 9045 9046 return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); 9047 } 9048 9049 /// Target-specific DAG combine function for NEON load/store intrinsics 9050 /// to merge base address updates. 9051 static SDValue performNEONPostLDSTCombine(SDNode *N, 9052 TargetLowering::DAGCombinerInfo &DCI, 9053 SelectionDAG &DAG) { 9054 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9055 return SDValue(); 9056 9057 unsigned AddrOpIdx = N->getNumOperands() - 1; 9058 SDValue Addr = N->getOperand(AddrOpIdx); 9059 9060 // Search for a use of the address operand that is an increment. 
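  // E.g. (illustrative): an aarch64_neon_ld2 of two <4 x i32> vectors reads
  // 32 bytes, so an "add %addr, 32" user of the address found below can be
  // folded into an LD2post that also produces the incremented address.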
9061 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 9062 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 9063 SDNode *User = *UI; 9064 if (User->getOpcode() != ISD::ADD || 9065 UI.getUse().getResNo() != Addr.getResNo()) 9066 continue; 9067 9068 // Check that the add is independent of the load/store. Otherwise, folding 9069 // it would create a cycle. 9070 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 9071 continue; 9072 9073 // Find the new opcode for the updating load/store. 9074 bool IsStore = false; 9075 bool IsLaneOp = false; 9076 bool IsDupOp = false; 9077 unsigned NewOpc = 0; 9078 unsigned NumVecs = 0; 9079 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 9080 switch (IntNo) { 9081 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 9082 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 9083 NumVecs = 2; break; 9084 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 9085 NumVecs = 3; break; 9086 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 9087 NumVecs = 4; break; 9088 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 9089 NumVecs = 2; IsStore = true; break; 9090 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 9091 NumVecs = 3; IsStore = true; break; 9092 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 9093 NumVecs = 4; IsStore = true; break; 9094 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 9095 NumVecs = 2; break; 9096 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 9097 NumVecs = 3; break; 9098 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 9099 NumVecs = 4; break; 9100 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 9101 NumVecs = 2; IsStore = true; break; 9102 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 9103 NumVecs = 3; IsStore = true; break; 9104 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 9105 NumVecs = 4; IsStore = true; break; 9106 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 9107 NumVecs = 2; IsDupOp = true; break; 9108 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 9109 NumVecs = 3; IsDupOp = true; break; 9110 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 9111 NumVecs = 4; IsDupOp = true; break; 9112 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 9113 NumVecs = 2; IsLaneOp = true; break; 9114 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 9115 NumVecs = 3; IsLaneOp = true; break; 9116 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 9117 NumVecs = 4; IsLaneOp = true; break; 9118 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 9119 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 9120 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 9121 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 9122 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 9123 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 9124 } 9125 9126 EVT VecTy; 9127 if (IsStore) 9128 VecTy = N->getOperand(2).getValueType(); 9129 else 9130 VecTy = N->getValueType(0); 9131 9132 // If the increment is a constant, it must match the memory ref size. 9133 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
1 : 0); 9134 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 9135 uint32_t IncVal = CInc->getZExtValue(); 9136 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 9137 if (IsLaneOp || IsDupOp) 9138 NumBytes /= VecTy.getVectorNumElements(); 9139 if (IncVal != NumBytes) 9140 continue; 9141 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 9142 } 9143 SmallVector<SDValue, 8> Ops; 9144 Ops.push_back(N->getOperand(0)); // Incoming chain 9145 // Load lane and store have vector list as input. 9146 if (IsLaneOp || IsStore) 9147 for (unsigned i = 2; i < AddrOpIdx; ++i) 9148 Ops.push_back(N->getOperand(i)); 9149 Ops.push_back(Addr); // Base register 9150 Ops.push_back(Inc); 9151 9152 // Return Types. 9153 EVT Tys[6]; 9154 unsigned NumResultVecs = (IsStore ? 0 : NumVecs); 9155 unsigned n; 9156 for (n = 0; n < NumResultVecs; ++n) 9157 Tys[n] = VecTy; 9158 Tys[n++] = MVT::i64; // Type of write back register 9159 Tys[n] = MVT::Other; // Type of the chain 9160 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 9161 9162 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 9163 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 9164 MemInt->getMemoryVT(), 9165 MemInt->getMemOperand()); 9166 9167 // Update the uses. 9168 std::vector<SDValue> NewResults; 9169 for (unsigned i = 0; i < NumResultVecs; ++i) { 9170 NewResults.push_back(SDValue(UpdN.getNode(), i)); 9171 } 9172 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 9173 DCI.CombineTo(N, NewResults); 9174 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 9175 9176 break; 9177 } 9178 return SDValue(); 9179 } 9180 9181 // Checks to see if the value is the prescribed width and returns information 9182 // about its extension mode. 9183 static 9184 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { 9185 ExtType = ISD::NON_EXTLOAD; 9186 switch(V.getNode()->getOpcode()) { 9187 default: 9188 return false; 9189 case ISD::LOAD: { 9190 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); 9191 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) 9192 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { 9193 ExtType = LoadNode->getExtensionType(); 9194 return true; 9195 } 9196 return false; 9197 } 9198 case ISD::AssertSext: { 9199 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 9200 if ((TypeNode->getVT() == MVT::i8 && width == 8) 9201 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 9202 ExtType = ISD::SEXTLOAD; 9203 return true; 9204 } 9205 return false; 9206 } 9207 case ISD::AssertZext: { 9208 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 9209 if ((TypeNode->getVT() == MVT::i8 && width == 8) 9210 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 9211 ExtType = ISD::ZEXTLOAD; 9212 return true; 9213 } 9214 return false; 9215 } 9216 case ISD::Constant: 9217 case ISD::TargetConstant: { 9218 if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 9219 1LL << (width - 1)) 9220 return true; 9221 return false; 9222 } 9223 } 9224 9225 return true; 9226 } 9227 9228 // This function does a whole lot of voodoo to determine if the tests are 9229 // equivalent without and with a mask. 
Essentially what happens is that given a
9230 // DAG resembling:
9231 //
9232 //  +-------------+ +-------------+ +-------------+ +-------------+
9233 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
9234 //  +-------------+ +-------------+ +-------------+ +-------------+
9235 //         |             |             |                 |
9236 //         V             V             |      +----------+
9237 //       +-------------+    +----+     |      |
9238 //       |     ADD     |    |0xff|     |      |
9239 //       +-------------+    +----+     |      |
9240 //              |              |       |      |
9241 //              V              V       |      |
9242 //            +-------------+          |      |
9243 //            |     AND     |          |      |
9244 //            +-------------+          |      |
9245 //                   |                 |      |
9246 //                   +-----+           |      |
9247 //                         |           |      |
9248 //                         V           V      V
9249 //                        +-------------+
9250 //                        |     CMP     |
9251 //                        +-------------+
9252 //
9253 // The AND node may be safely removed for some combinations of inputs. In
9254 // particular we need to take into account the extension type of the Input,
9255 // the exact values of AddConstant, CompConstant, and CC, along with the
9256 // nominal width of the input (this works for any input width; the graph
9257 // above is specific to 8 bits).
9258 //
9259 // The specific equations were worked out by generating output tables for each
9260 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
9261 // problem was simplified by working with 4 bit inputs, which means we only
9262 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
9263 // extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
9264 // patterns present in both extensions (0,7). For every distinct pair of
9265 // AddConstant and CompConstant bit patterns we can consider the masked and
9266 // unmasked versions to be equivalent if the result of this function is true
9267 // for all 16 distinct bit patterns of the current extension type of Input (w0).
9268 //
9269 //   sub      w8, w0, w1
9270 //   and      w10, w8, #0x0f
9271 //   cmp      w8, w2
9272 //   cset     w9, AArch64CC
9273 //   cmp      w10, w2
9274 //   cset     w11, AArch64CC
9275 //   cmp      w9, w11
9276 //   cset     w0, eq
9277 //   ret
9278 //
9279 // Since the sequence above shows when the two outputs are equivalent, it also
9280 // defines when it is safe to remove the AND. Unfortunately it only runs on
9281 // AArch64 and would be expensive to execute at compile time. The equations
9282 // below were written in a test harness that confirmed they give outputs
9283 // equivalent to the sequence above for all inputs, so they can be used to
9284 // determine whether the removal is legal instead.
9285 //
9286 // isEquivalentMaskless() is the code for testing if the AND can be removed,
9287 // factored out of the DAG recognition as the DAG can take several forms.
9288
9289 static
9290 bool isEquivalentMaskless(unsigned CC, unsigned width,
9291                          ISD::LoadExtType ExtType, signed AddConstant,
9292                          signed CompConstant) {
9293   // By being careful about our equations and only writing them in terms of
9294   // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
9295   // make them generally applicable to all bit widths.
9296   signed MaxUInt = (1 << width);
9297
9298   // For the purposes of these comparisons sign extending the type is
9299   // equivalent to zero extending the add and displacing it by half the integer
9300   // width. Provided we are careful and make sure our equations are valid over
9301   // the whole range we can just adjust the input and avoid writing equations
9302   // for sign extended inputs.
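  // E.g. (illustrative, width == 8): a sign-extended input with an
  // AddConstant of 5 is handled below as a zero-extended input with an
  // AddConstant of 5 - 128 == -123.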
9303 if (ExtType == ISD::SEXTLOAD) 9304 AddConstant -= (1 << (width-1)); 9305 9306 switch(CC) { 9307 case AArch64CC::LE: 9308 case AArch64CC::GT: { 9309 if ((AddConstant == 0) || 9310 (CompConstant == MaxUInt - 1 && AddConstant < 0) || 9311 (AddConstant >= 0 && CompConstant < 0) || 9312 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) 9313 return true; 9314 } break; 9315 case AArch64CC::LT: 9316 case AArch64CC::GE: { 9317 if ((AddConstant == 0) || 9318 (AddConstant >= 0 && CompConstant <= 0) || 9319 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) 9320 return true; 9321 } break; 9322 case AArch64CC::HI: 9323 case AArch64CC::LS: { 9324 if ((AddConstant >= 0 && CompConstant < 0) || 9325 (AddConstant <= 0 && CompConstant >= -1 && 9326 CompConstant < AddConstant + MaxUInt)) 9327 return true; 9328 } break; 9329 case AArch64CC::PL: 9330 case AArch64CC::MI: { 9331 if ((AddConstant == 0) || 9332 (AddConstant > 0 && CompConstant <= 0) || 9333 (AddConstant < 0 && CompConstant <= AddConstant)) 9334 return true; 9335 } break; 9336 case AArch64CC::LO: 9337 case AArch64CC::HS: { 9338 if ((AddConstant >= 0 && CompConstant <= 0) || 9339 (AddConstant <= 0 && CompConstant >= 0 && 9340 CompConstant <= AddConstant + MaxUInt)) 9341 return true; 9342 } break; 9343 case AArch64CC::EQ: 9344 case AArch64CC::NE: { 9345 if ((AddConstant > 0 && CompConstant < 0) || 9346 (AddConstant < 0 && CompConstant >= 0 && 9347 CompConstant < AddConstant + MaxUInt) || 9348 (AddConstant >= 0 && CompConstant >= 0 && 9349 CompConstant >= AddConstant) || 9350 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) 9351 9352 return true; 9353 } break; 9354 case AArch64CC::VS: 9355 case AArch64CC::VC: 9356 case AArch64CC::AL: 9357 case AArch64CC::NV: 9358 return true; 9359 case AArch64CC::Invalid: 9360 break; 9361 } 9362 9363 return false; 9364 } 9365 9366 static 9367 SDValue performCONDCombine(SDNode *N, 9368 TargetLowering::DAGCombinerInfo &DCI, 9369 SelectionDAG &DAG, unsigned CCIndex, 9370 unsigned CmpIndex) { 9371 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); 9372 SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); 9373 unsigned CondOpcode = SubsNode->getOpcode(); 9374 9375 if (CondOpcode != AArch64ISD::SUBS) 9376 return SDValue(); 9377 9378 // There is a SUBS feeding this condition. Is it fed by a mask we can 9379 // use? 9380 9381 SDNode *AndNode = SubsNode->getOperand(0).getNode(); 9382 unsigned MaskBits = 0; 9383 9384 if (AndNode->getOpcode() != ISD::AND) 9385 return SDValue(); 9386 9387 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { 9388 uint32_t CNV = CN->getZExtValue(); 9389 if (CNV == 255) 9390 MaskBits = 8; 9391 else if (CNV == 65535) 9392 MaskBits = 16; 9393 } 9394 9395 if (!MaskBits) 9396 return SDValue(); 9397 9398 SDValue AddValue = AndNode->getOperand(0); 9399 9400 if (AddValue.getOpcode() != ISD::ADD) 9401 return SDValue(); 9402 9403 // The basic dag structure is correct, grab the inputs and validate them. 9404 9405 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); 9406 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); 9407 SDValue SubsInputValue = SubsNode->getOperand(1); 9408 9409 // The mask is present and the provenance of all the values is a smaller type, 9410 // lets see if the mask is superfluous. 
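  // E.g. (illustrative): with an i8 value %x brought in by an extending load,
  //   SUBS (and (add %x, C1), 0xff), C2
  // may be rewritten below as
  //   SUBS (add %x, C1), C2
  // when isEquivalentMaskless() shows both forms set the tested flags
  // identically.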
9411 9412 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || 9413 !isa<ConstantSDNode>(SubsInputValue.getNode())) 9414 return SDValue(); 9415 9416 ISD::LoadExtType ExtType; 9417 9418 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || 9419 !checkValueWidth(AddInputValue2, MaskBits, ExtType) || 9420 !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) 9421 return SDValue(); 9422 9423 if(!isEquivalentMaskless(CC, MaskBits, ExtType, 9424 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), 9425 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) 9426 return SDValue(); 9427 9428 // The AND is not necessary, remove it. 9429 9430 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), 9431 SubsNode->getValueType(1)); 9432 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; 9433 9434 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); 9435 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); 9436 9437 return SDValue(N, 0); 9438 } 9439 9440 // Optimize compare with zero and branch. 9441 static SDValue performBRCONDCombine(SDNode *N, 9442 TargetLowering::DAGCombinerInfo &DCI, 9443 SelectionDAG &DAG) { 9444 SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); 9445 if (NV.getNode()) 9446 N = NV.getNode(); 9447 SDValue Chain = N->getOperand(0); 9448 SDValue Dest = N->getOperand(1); 9449 SDValue CCVal = N->getOperand(2); 9450 SDValue Cmp = N->getOperand(3); 9451 9452 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); 9453 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); 9454 if (CC != AArch64CC::EQ && CC != AArch64CC::NE) 9455 return SDValue(); 9456 9457 unsigned CmpOpc = Cmp.getOpcode(); 9458 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) 9459 return SDValue(); 9460 9461 // Only attempt folding if there is only one use of the flag and no use of the 9462 // value. 9463 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) 9464 return SDValue(); 9465 9466 SDValue LHS = Cmp.getOperand(0); 9467 SDValue RHS = Cmp.getOperand(1); 9468 9469 assert(LHS.getValueType() == RHS.getValueType() && 9470 "Expected the value type to be the same for both operands!"); 9471 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 9472 return SDValue(); 9473 9474 if (isNullConstant(LHS)) 9475 std::swap(LHS, RHS); 9476 9477 if (!isNullConstant(RHS)) 9478 return SDValue(); 9479 9480 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || 9481 LHS.getOpcode() == ISD::SRL) 9482 return SDValue(); 9483 9484 // Fold the compare into the branch instruction. 9485 SDValue BR; 9486 if (CC == AArch64CC::EQ) 9487 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 9488 else 9489 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 9490 9491 // Do not add new nodes to DAG combiner worklist. 9492 DCI.CombineTo(N, BR, false); 9493 9494 return SDValue(); 9495 } 9496 9497 // vselect (v1i1 setcc) -> 9498 // vselect (v1iXX setcc) (XX is the size of the compared operand type) 9499 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as 9500 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine 9501 // such VSELECT. 
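// E.g. (illustrative):
//   vselect (v1i1 (setcc v1i64 %a, %b, setlt)), %t, %f
// is rewritten as
//   vselect (v1i64 (setcc v1i64 %a, %b, setlt)), %t, %f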
9502 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { 9503 SDValue N0 = N->getOperand(0); 9504 EVT CCVT = N0.getValueType(); 9505 9506 if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || 9507 CCVT.getVectorElementType() != MVT::i1) 9508 return SDValue(); 9509 9510 EVT ResVT = N->getValueType(0); 9511 EVT CmpVT = N0.getOperand(0).getValueType(); 9512 // Only combine when the result type is of the same size as the compared 9513 // operands. 9514 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) 9515 return SDValue(); 9516 9517 SDValue IfTrue = N->getOperand(1); 9518 SDValue IfFalse = N->getOperand(2); 9519 SDValue SetCC = 9520 DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), 9521 N0.getOperand(0), N0.getOperand(1), 9522 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 9523 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, 9524 IfTrue, IfFalse); 9525 } 9526 9527 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with 9528 /// the compare-mask instructions rather than going via NZCV, even if LHS and 9529 /// RHS are really scalar. This replaces any scalar setcc in the above pattern 9530 /// with a vector one followed by a DUP shuffle on the result. 9531 static SDValue performSelectCombine(SDNode *N, 9532 TargetLowering::DAGCombinerInfo &DCI) { 9533 SelectionDAG &DAG = DCI.DAG; 9534 SDValue N0 = N->getOperand(0); 9535 EVT ResVT = N->getValueType(0); 9536 9537 if (N0.getOpcode() != ISD::SETCC) 9538 return SDValue(); 9539 9540 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered 9541 // scalar SetCCResultType. We also don't expect vectors, because we assume 9542 // that selects fed by vector SETCCs are canonicalized to VSELECT. 9543 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && 9544 "Scalar-SETCC feeding SELECT has unexpected result type!"); 9545 9546 // If NumMaskElts == 0, the comparison is larger than select result. The 9547 // largest real NEON comparison is 64-bits per lane, which means the result is 9548 // at most 32-bits and an illegal vector. Just bail out for now. 9549 EVT SrcVT = N0.getOperand(0).getValueType(); 9550 9551 // Don't try to do this optimization when the setcc itself has i1 operands. 9552 // There are no legal vectors of i1, so this would be pointless. 9553 if (SrcVT == MVT::i1) 9554 return SDValue(); 9555 9556 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); 9557 if (!ResVT.isVector() || NumMaskElts == 0) 9558 return SDValue(); 9559 9560 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); 9561 EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); 9562 9563 // Also bail out if the vector CCVT isn't the same size as ResVT. 9564 // This can happen if the SETCC operand size doesn't divide the ResVT size 9565 // (e.g., f64 vs v3f32). 9566 if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) 9567 return SDValue(); 9568 9569 // Make sure we didn't create illegal types, if we're not supposed to. 9570 assert(DCI.isBeforeLegalize() || 9571 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); 9572 9573 // First perform a vector comparison, where lane 0 is the one we're interested 9574 // in. 
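  // E.g. (illustrative): for an i32 compare feeding a select with a v4i32
  // result, LHS and RHS are moved into lane 0 of v4i32 vectors, the vector
  // SETCC computes the predicate in lane 0, and the DUP shuffle below
  // broadcasts that lane across the remaining lanes.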
9575 SDLoc DL(N0); 9576 SDValue LHS = 9577 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); 9578 SDValue RHS = 9579 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); 9580 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); 9581 9582 // Now duplicate the comparison mask we want across all other lanes. 9583 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); 9584 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); 9585 Mask = DAG.getNode(ISD::BITCAST, DL, 9586 ResVT.changeVectorElementTypeToInteger(), Mask); 9587 9588 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); 9589 } 9590 9591 /// Get rid of unnecessary NVCASTs (that don't change the type). 9592 static SDValue performNVCASTCombine(SDNode *N) { 9593 if (N->getValueType(0) == N->getOperand(0).getValueType()) 9594 return N->getOperand(0); 9595 9596 return SDValue(); 9597 } 9598 9599 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, 9600 DAGCombinerInfo &DCI) const { 9601 SelectionDAG &DAG = DCI.DAG; 9602 switch (N->getOpcode()) { 9603 default: 9604 break; 9605 case ISD::ADD: 9606 case ISD::SUB: 9607 return performAddSubLongCombine(N, DCI, DAG); 9608 case ISD::XOR: 9609 return performXorCombine(N, DAG, DCI, Subtarget); 9610 case ISD::MUL: 9611 return performMulCombine(N, DAG, DCI, Subtarget); 9612 case ISD::SINT_TO_FP: 9613 case ISD::UINT_TO_FP: 9614 return performIntToFpCombine(N, DAG, Subtarget); 9615 case ISD::FP_TO_SINT: 9616 case ISD::FP_TO_UINT: 9617 return performFpToIntCombine(N, DAG, Subtarget); 9618 case ISD::FDIV: 9619 return performFDivCombine(N, DAG, Subtarget); 9620 case ISD::OR: 9621 return performORCombine(N, DCI, Subtarget); 9622 case ISD::INTRINSIC_WO_CHAIN: 9623 return performIntrinsicCombine(N, DCI, Subtarget); 9624 case ISD::ANY_EXTEND: 9625 case ISD::ZERO_EXTEND: 9626 case ISD::SIGN_EXTEND: 9627 return performExtendCombine(N, DCI, DAG); 9628 case ISD::BITCAST: 9629 return performBitcastCombine(N, DCI, DAG); 9630 case ISD::CONCAT_VECTORS: 9631 return performConcatVectorsCombine(N, DCI, DAG); 9632 case ISD::SELECT: { 9633 SDValue RV = performSelectCombine(N, DCI); 9634 if (!RV.getNode()) 9635 RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); 9636 return RV; 9637 } 9638 case ISD::VSELECT: 9639 return performVSelectCombine(N, DCI.DAG); 9640 case ISD::LOAD: 9641 if (performTBISimplification(N->getOperand(1), DCI, DAG)) 9642 return SDValue(N, 0); 9643 break; 9644 case ISD::STORE: 9645 return performSTORECombine(N, DCI, DAG, Subtarget); 9646 case AArch64ISD::BRCOND: 9647 return performBRCONDCombine(N, DCI, DAG); 9648 case AArch64ISD::CSEL: 9649 return performCONDCombine(N, DCI, DAG, 2, 3); 9650 case AArch64ISD::DUP: 9651 return performPostLD1Combine(N, DCI, false); 9652 case AArch64ISD::NVCAST: 9653 return performNVCASTCombine(N); 9654 case ISD::INSERT_VECTOR_ELT: 9655 return performPostLD1Combine(N, DCI, true); 9656 case ISD::EXTRACT_VECTOR_ELT: 9657 return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); 9658 case ISD::INTRINSIC_VOID: 9659 case ISD::INTRINSIC_W_CHAIN: 9660 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 9661 case Intrinsic::aarch64_neon_ld2: 9662 case Intrinsic::aarch64_neon_ld3: 9663 case Intrinsic::aarch64_neon_ld4: 9664 case Intrinsic::aarch64_neon_ld1x2: 9665 case Intrinsic::aarch64_neon_ld1x3: 9666 case Intrinsic::aarch64_neon_ld1x4: 9667 case Intrinsic::aarch64_neon_ld2lane: 9668 case Intrinsic::aarch64_neon_ld3lane: 9669 case 
Intrinsic::aarch64_neon_ld4lane: 9670 case Intrinsic::aarch64_neon_ld2r: 9671 case Intrinsic::aarch64_neon_ld3r: 9672 case Intrinsic::aarch64_neon_ld4r: 9673 case Intrinsic::aarch64_neon_st2: 9674 case Intrinsic::aarch64_neon_st3: 9675 case Intrinsic::aarch64_neon_st4: 9676 case Intrinsic::aarch64_neon_st1x2: 9677 case Intrinsic::aarch64_neon_st1x3: 9678 case Intrinsic::aarch64_neon_st1x4: 9679 case Intrinsic::aarch64_neon_st2lane: 9680 case Intrinsic::aarch64_neon_st3lane: 9681 case Intrinsic::aarch64_neon_st4lane: 9682 return performNEONPostLDSTCombine(N, DCI, DAG); 9683 default: 9684 break; 9685 } 9686 } 9687 return SDValue(); 9688 } 9689 9690 // Check if the return value is used as only a return value, as otherwise 9691 // we can't perform a tail-call. In particular, we need to check for 9692 // target ISD nodes that are returns and any other "odd" constructs 9693 // that the generic analysis code won't necessarily catch. 9694 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, 9695 SDValue &Chain) const { 9696 if (N->getNumValues() != 1) 9697 return false; 9698 if (!N->hasNUsesOfValue(1, 0)) 9699 return false; 9700 9701 SDValue TCChain = Chain; 9702 SDNode *Copy = *N->use_begin(); 9703 if (Copy->getOpcode() == ISD::CopyToReg) { 9704 // If the copy has a glue operand, we conservatively assume it isn't safe to 9705 // perform a tail call. 9706 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == 9707 MVT::Glue) 9708 return false; 9709 TCChain = Copy->getOperand(0); 9710 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 9711 return false; 9712 9713 bool HasRet = false; 9714 for (SDNode *Node : Copy->uses()) { 9715 if (Node->getOpcode() != AArch64ISD::RET_FLAG) 9716 return false; 9717 HasRet = true; 9718 } 9719 9720 if (!HasRet) 9721 return false; 9722 9723 Chain = TCChain; 9724 return true; 9725 } 9726 9727 // Return whether the an instruction can potentially be optimized to a tail 9728 // call. This will cause the optimizers to attempt to move, or duplicate, 9729 // return instructions to help enable tail call optimizations for this 9730 // instruction. 9731 bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 9732 if (!CI->isTailCall()) 9733 return false; 9734 9735 return true; 9736 } 9737 9738 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, 9739 SDValue &Offset, 9740 ISD::MemIndexedMode &AM, 9741 bool &IsInc, 9742 SelectionDAG &DAG) const { 9743 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) 9744 return false; 9745 9746 Base = Op->getOperand(0); 9747 // All of the indexed addressing mode instructions take a signed 9748 // 9 bit immediate offset. 
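  // E.g. (illustrative): "ldr x1, [x0, #16]!" and "ldr x1, [x0], #-32" are
  // representable, while an offset of +/-512 is out of range and rejected
  // below.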
9749 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { 9750 int64_t RHSC = (int64_t)RHS->getZExtValue(); 9751 if (RHSC >= 256 || RHSC <= -256) 9752 return false; 9753 IsInc = (Op->getOpcode() == ISD::ADD); 9754 Offset = Op->getOperand(1); 9755 return true; 9756 } 9757 return false; 9758 } 9759 9760 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 9761 SDValue &Offset, 9762 ISD::MemIndexedMode &AM, 9763 SelectionDAG &DAG) const { 9764 EVT VT; 9765 SDValue Ptr; 9766 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 9767 VT = LD->getMemoryVT(); 9768 Ptr = LD->getBasePtr(); 9769 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 9770 VT = ST->getMemoryVT(); 9771 Ptr = ST->getBasePtr(); 9772 } else 9773 return false; 9774 9775 bool IsInc; 9776 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) 9777 return false; 9778 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; 9779 return true; 9780 } 9781 9782 bool AArch64TargetLowering::getPostIndexedAddressParts( 9783 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, 9784 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { 9785 EVT VT; 9786 SDValue Ptr; 9787 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 9788 VT = LD->getMemoryVT(); 9789 Ptr = LD->getBasePtr(); 9790 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 9791 VT = ST->getMemoryVT(); 9792 Ptr = ST->getBasePtr(); 9793 } else 9794 return false; 9795 9796 bool IsInc; 9797 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) 9798 return false; 9799 // Post-indexing updates the base, so it's not a valid transform 9800 // if that's not the same as the load's pointer. 9801 if (Ptr != Base) 9802 return false; 9803 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; 9804 return true; 9805 } 9806 9807 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, 9808 SelectionDAG &DAG) { 9809 SDLoc DL(N); 9810 SDValue Op = N->getOperand(0); 9811 9812 if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) 9813 return; 9814 9815 Op = SDValue( 9816 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 9817 DAG.getUNDEF(MVT::i32), Op, 9818 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 9819 0); 9820 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); 9821 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); 9822 } 9823 9824 static void ReplaceReductionResults(SDNode *N, 9825 SmallVectorImpl<SDValue> &Results, 9826 SelectionDAG &DAG, unsigned InterOp, 9827 unsigned AcrossOp) { 9828 EVT LoVT, HiVT; 9829 SDValue Lo, Hi; 9830 SDLoc dl(N); 9831 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); 9832 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 9833 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); 9834 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); 9835 Results.push_back(SplitVal); 9836 } 9837 9838 void AArch64TargetLowering::ReplaceNodeResults( 9839 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 9840 switch (N->getOpcode()) { 9841 default: 9842 llvm_unreachable("Don't know how to custom expand this"); 9843 case ISD::BITCAST: 9844 ReplaceBITCASTResults(N, Results, DAG); 9845 return; 9846 case AArch64ISD::SADDV: 9847 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); 9848 return; 9849 case AArch64ISD::UADDV: 9850 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); 9851 return; 9852 case AArch64ISD::SMINV: 9853 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, 
AArch64ISD::SMINV); 9854 return; 9855 case AArch64ISD::UMINV: 9856 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); 9857 return; 9858 case AArch64ISD::SMAXV: 9859 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); 9860 return; 9861 case AArch64ISD::UMAXV: 9862 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); 9863 return; 9864 case ISD::FP_TO_UINT: 9865 case ISD::FP_TO_SINT: 9866 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); 9867 // Let normal code take care of it by not adding anything to Results. 9868 return; 9869 } 9870 } 9871 9872 bool AArch64TargetLowering::useLoadStackGuardNode() const { 9873 return true; 9874 } 9875 9876 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { 9877 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 9878 // reciprocal if there are three or more FDIVs. 9879 return 3; 9880 } 9881 9882 TargetLoweringBase::LegalizeTypeAction 9883 AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { 9884 MVT SVT = VT.getSimpleVT(); 9885 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, 9886 // v4i16, v2i32 instead of to promote. 9887 if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 9888 || SVT == MVT::v1f32) 9889 return TypeWidenVector; 9890 9891 return TargetLoweringBase::getPreferredVectorAction(VT); 9892 } 9893 9894 // Loads and stores less than 128-bits are already atomic; ones above that 9895 // are doomed anyway, so defer to the default libcall and blame the OS when 9896 // things go wrong. 9897 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 9898 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 9899 return Size == 128; 9900 } 9901 9902 // Loads and stores less than 128-bits are already atomic; ones above that 9903 // are doomed anyway, so defer to the default libcall and blame the OS when 9904 // things go wrong. 9905 TargetLowering::AtomicExpansionKind 9906 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 9907 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 9908 return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; 9909 } 9910 9911 // For the real atomic operations, we have ldxr/stxr up to 128 bits, 9912 TargetLowering::AtomicExpansionKind 9913 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 9914 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 9915 return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; 9916 } 9917 9918 bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( 9919 AtomicCmpXchgInst *AI) const { 9920 return true; 9921 } 9922 9923 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 9924 AtomicOrdering Ord) const { 9925 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 9926 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 9927 bool IsAcquire = isAtLeastAcquire(Ord); 9928 9929 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd 9930 // intrinsic must return {i64, i64} and we have to recombine them into a 9931 // single i128 here. 9932 if (ValTy->getPrimitiveSizeInBits() == 128) { 9933 Intrinsic::ID Int = 9934 IsAcquire ? 
Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; 9935 Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); 9936 9937 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 9938 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); 9939 9940 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 9941 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 9942 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 9943 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 9944 return Builder.CreateOr( 9945 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); 9946 } 9947 9948 Type *Tys[] = { Addr->getType() }; 9949 Intrinsic::ID Int = 9950 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; 9951 Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); 9952 9953 return Builder.CreateTruncOrBitCast( 9954 Builder.CreateCall(Ldxr, Addr), 9955 cast<PointerType>(Addr->getType())->getElementType()); 9956 } 9957 9958 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 9959 IRBuilder<> &Builder) const { 9960 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 9961 Builder.CreateCall( 9962 llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); 9963 } 9964 9965 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, 9966 Value *Val, Value *Addr, 9967 AtomicOrdering Ord) const { 9968 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 9969 bool IsRelease = isAtLeastRelease(Ord); 9970 9971 // Since the intrinsics must have legal type, the i128 intrinsics take two 9972 // parameters: "i64, i64". We must marshal Val into the appropriate form 9973 // before the call. 9974 if (Val->getType()->getPrimitiveSizeInBits() == 128) { 9975 Intrinsic::ID Int = 9976 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; 9977 Function *Stxr = Intrinsic::getDeclaration(M, Int); 9978 Type *Int64Ty = Type::getInt64Ty(M->getContext()); 9979 9980 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); 9981 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); 9982 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 9983 return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); 9984 } 9985 9986 Intrinsic::ID Int = 9987 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; 9988 Type *Tys[] = { Addr->getType() }; 9989 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); 9990 9991 return Builder.CreateCall(Stxr, 9992 {Builder.CreateZExtOrBitCast( 9993 Val, Stxr->getFunctionType()->getParamType(0)), 9994 Addr}); 9995 } 9996 9997 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( 9998 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 9999 return Ty->isArrayTy(); 10000 } 10001 10002 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, 10003 EVT) const { 10004 return false; 10005 } 10006 10007 Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { 10008 if (!Subtarget->isTargetAndroid()) 10009 return TargetLowering::getSafeStackPointerLocation(IRB); 10010 10011 // Android provides a fixed TLS slot for the SafeStack pointer. 
See the
10012 // definition of TLS_SLOT_SAFESTACK in
10013 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
10014   const unsigned TlsOffset = 0x48;
10015   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
10016   Function *ThreadPointerFunc =
10017       Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer);
10018   return IRB.CreatePointerCast(
10019       IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
10020       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
10021 }
10022
10023 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
10024   // Update IsSplitCSR in AArch64FunctionInfo.
10025   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
10026   AFI->setIsSplitCSR(true);
10027 }
10028
10029 void AArch64TargetLowering::insertCopiesSplitCSR(
10030     MachineBasicBlock *Entry,
10031     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
10032   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10033   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
10034   if (!IStart)
10035     return;
10036
10037   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10038   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
10039   for (const MCPhysReg *I = IStart; *I; ++I) {
10040     const TargetRegisterClass *RC = nullptr;
10041     if (AArch64::GPR64RegClass.contains(*I))
10042       RC = &AArch64::GPR64RegClass;
10043     else if (AArch64::FPR64RegClass.contains(*I))
10044       RC = &AArch64::FPR64RegClass;
10045     else
10046       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10047
10048     unsigned NewVR = MRI->createVirtualRegister(RC);
10049     // Create copy from CSR to a virtual register.
10050     // FIXME: this currently does not emit CFI pseudo-instructions; it works
10051     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
10052     // nounwind. If we want to generalize this later, we may need to emit
10053     // CFI pseudo-instructions.
10054     assert(Entry->getParent()->getFunction()->hasFnAttribute(
10055                Attribute::NoUnwind) &&
10056            "Function should be nounwind in insertCopiesSplitCSR!");
10057     Entry->addLiveIn(*I);
10058     BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
10059             NewVR)
10060         .addReg(*I);
10061
10062     for (auto *Exit : Exits)
10063       BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
10064               *I)
10065           .addReg(NewVR);
10066   }
10067 }
10068