1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the AArch64TargetLowering class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AArch64ISelLowering.h" 15 #include "AArch64CallingConvention.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PerfectShuffle.h" 18 #include "AArch64Subtarget.h" 19 #include "AArch64TargetMachine.h" 20 #include "AArch64TargetObjectFile.h" 21 #include "MCTargetDesc/AArch64AddressingModes.h" 22 #include "llvm/ADT/Statistic.h" 23 #include "llvm/CodeGen/CallingConvLower.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineRegisterInfo.h" 27 #include "llvm/IR/Function.h" 28 #include "llvm/IR/GetElementPtrTypeIterator.h" 29 #include "llvm/IR/Intrinsics.h" 30 #include "llvm/IR/Type.h" 31 #include "llvm/Support/CommandLine.h" 32 #include "llvm/Support/Debug.h" 33 #include "llvm/Support/ErrorHandling.h" 34 #include "llvm/Support/raw_ostream.h" 35 #include "llvm/Target/TargetOptions.h" 36 using namespace llvm; 37 38 #define DEBUG_TYPE "aarch64-lower" 39 40 STATISTIC(NumTailCalls, "Number of tail calls"); 41 STATISTIC(NumShiftInserts, "Number of vector shift inserts"); 42 43 // Place holder until extr generation is tested fully. 44 static cl::opt<bool> 45 EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, 46 cl::desc("Allow AArch64 (or (shift)(shift))->extract"), 47 cl::init(true)); 48 49 static cl::opt<bool> 50 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, 51 cl::desc("Allow AArch64 SLI/SRI formation"), 52 cl::init(false)); 53 54 // FIXME: The necessary dtprel relocations don't seem to be supported 55 // well in the GNU bfd and gold linkers at the moment. Therefore, by 56 // default, for now, fall back to GeneralDynamic code generation. 57 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( 58 "aarch64-elf-ldtls-generation", cl::Hidden, 59 cl::desc("Allow AArch64 Local Dynamic TLS code generation"), 60 cl::init(false)); 61 62 /// Value type used for condition codes. 63 static const MVT MVT_CC = MVT::i32; 64 65 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 66 const AArch64Subtarget &STI) 67 : TargetLowering(TM), Subtarget(&STI) { 68 69 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so 70 // we have to make something up. Arbitrarily, choose ZeroOrOne. 71 setBooleanContents(ZeroOrOneBooleanContent); 72 // When comparing vectors the result sets the different elements in the 73 // vector to all-one or all-zero. 74 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 75 76 // Set up the register classes. 
77 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); 78 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); 79 80 if (Subtarget->hasFPARMv8()) { 81 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); 82 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); 83 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); 84 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); 85 } 86 87 if (Subtarget->hasNEON()) { 88 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); 89 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); 90 // Someone set us up the NEON. 91 addDRTypeForNEON(MVT::v2f32); 92 addDRTypeForNEON(MVT::v8i8); 93 addDRTypeForNEON(MVT::v4i16); 94 addDRTypeForNEON(MVT::v2i32); 95 addDRTypeForNEON(MVT::v1i64); 96 addDRTypeForNEON(MVT::v1f64); 97 addDRTypeForNEON(MVT::v4f16); 98 99 addQRTypeForNEON(MVT::v4f32); 100 addQRTypeForNEON(MVT::v2f64); 101 addQRTypeForNEON(MVT::v16i8); 102 addQRTypeForNEON(MVT::v8i16); 103 addQRTypeForNEON(MVT::v4i32); 104 addQRTypeForNEON(MVT::v2i64); 105 addQRTypeForNEON(MVT::v8f16); 106 } 107 108 // Compute derived properties from the register classes 109 computeRegisterProperties(Subtarget->getRegisterInfo()); 110 111 // Provide all sorts of operation actions 112 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 113 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 114 setOperationAction(ISD::SETCC, MVT::i32, Custom); 115 setOperationAction(ISD::SETCC, MVT::i64, Custom); 116 setOperationAction(ISD::SETCC, MVT::f32, Custom); 117 setOperationAction(ISD::SETCC, MVT::f64, Custom); 118 setOperationAction(ISD::BRCOND, MVT::Other, Expand); 119 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 120 setOperationAction(ISD::BR_CC, MVT::i64, Custom); 121 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 122 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 123 setOperationAction(ISD::SELECT, MVT::i32, Custom); 124 setOperationAction(ISD::SELECT, MVT::i64, Custom); 125 setOperationAction(ISD::SELECT, MVT::f32, Custom); 126 setOperationAction(ISD::SELECT, MVT::f64, Custom); 127 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 128 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); 129 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 130 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 131 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 132 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 133 134 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); 135 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); 136 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); 137 138 setOperationAction(ISD::FREM, MVT::f32, Expand); 139 setOperationAction(ISD::FREM, MVT::f64, Expand); 140 setOperationAction(ISD::FREM, MVT::f80, Expand); 141 142 // Custom lowering hooks are needed for XOR 143 // to fold it into CSINC/CSINV. 144 setOperationAction(ISD::XOR, MVT::i32, Custom); 145 setOperationAction(ISD::XOR, MVT::i64, Custom); 146 147 // Virtually no operation on f128 is legal, but LLVM can't expand them when 148 // there's a valid register class, so we need custom operations in most cases. 
149 setOperationAction(ISD::FABS, MVT::f128, Expand); 150 setOperationAction(ISD::FADD, MVT::f128, Custom); 151 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); 152 setOperationAction(ISD::FCOS, MVT::f128, Expand); 153 setOperationAction(ISD::FDIV, MVT::f128, Custom); 154 setOperationAction(ISD::FMA, MVT::f128, Expand); 155 setOperationAction(ISD::FMUL, MVT::f128, Custom); 156 setOperationAction(ISD::FNEG, MVT::f128, Expand); 157 setOperationAction(ISD::FPOW, MVT::f128, Expand); 158 setOperationAction(ISD::FREM, MVT::f128, Expand); 159 setOperationAction(ISD::FRINT, MVT::f128, Expand); 160 setOperationAction(ISD::FSIN, MVT::f128, Expand); 161 setOperationAction(ISD::FSINCOS, MVT::f128, Expand); 162 setOperationAction(ISD::FSQRT, MVT::f128, Expand); 163 setOperationAction(ISD::FSUB, MVT::f128, Custom); 164 setOperationAction(ISD::FTRUNC, MVT::f128, Expand); 165 setOperationAction(ISD::SETCC, MVT::f128, Custom); 166 setOperationAction(ISD::BR_CC, MVT::f128, Custom); 167 setOperationAction(ISD::SELECT, MVT::f128, Custom); 168 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); 169 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); 170 171 // Lowering for many of the conversions is actually specified by the non-f128 172 // type. The LowerXXX function will be trivial when f128 isn't involved. 173 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 174 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 175 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); 176 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 177 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 178 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); 179 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 180 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 181 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); 182 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 183 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 184 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); 185 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 186 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); 187 188 // Variable arguments. 189 setOperationAction(ISD::VASTART, MVT::Other, Custom); 190 setOperationAction(ISD::VAARG, MVT::Other, Custom); 191 setOperationAction(ISD::VACOPY, MVT::Other, Custom); 192 setOperationAction(ISD::VAEND, MVT::Other, Expand); 193 194 // Variable-sized objects. 195 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 196 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 197 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 198 199 // Constant pool entries 200 setOperationAction(ISD::ConstantPool, MVT::i64, Custom); 201 202 // BlockAddress 203 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 204 205 // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 206 setOperationAction(ISD::ADDC, MVT::i32, Custom); 207 setOperationAction(ISD::ADDE, MVT::i32, Custom); 208 setOperationAction(ISD::SUBC, MVT::i32, Custom); 209 setOperationAction(ISD::SUBE, MVT::i32, Custom); 210 setOperationAction(ISD::ADDC, MVT::i64, Custom); 211 setOperationAction(ISD::ADDE, MVT::i64, Custom); 212 setOperationAction(ISD::SUBC, MVT::i64, Custom); 213 setOperationAction(ISD::SUBE, MVT::i64, Custom); 214 215 // AArch64 lacks both left-rotate and popcount instructions. 
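  // Illustrative note (an assumption about the lowering, not stated in this
  // file): ROTL is therefore expanded into shifts/ORs, and the Custom scalar
  // CTPOP below is expected to go through the vector unit, roughly:
  //   fmov    d0, x0          // move the GPR into a SIMD register
  //   cnt     v0.8b, v0.8b    // per-byte popcount
  //   uaddlv  h0, v0.8b       // sum the byte counts
  //   fmov    w0, s0          // move the result back to a GPR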
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  // Expand the undefined-at-zero variants of cttz/ctlz to their
  // defined-at-zero counterparts, which AArch64 supports directly.
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

  // f16 is a storage-only type, always promote it to f32.
275 setOperationAction(ISD::SETCC, MVT::f16, Promote); 276 setOperationAction(ISD::BR_CC, MVT::f16, Promote); 277 setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); 278 setOperationAction(ISD::SELECT, MVT::f16, Promote); 279 setOperationAction(ISD::FADD, MVT::f16, Promote); 280 setOperationAction(ISD::FSUB, MVT::f16, Promote); 281 setOperationAction(ISD::FMUL, MVT::f16, Promote); 282 setOperationAction(ISD::FDIV, MVT::f16, Promote); 283 setOperationAction(ISD::FREM, MVT::f16, Promote); 284 setOperationAction(ISD::FMA, MVT::f16, Promote); 285 setOperationAction(ISD::FNEG, MVT::f16, Promote); 286 setOperationAction(ISD::FABS, MVT::f16, Promote); 287 setOperationAction(ISD::FCEIL, MVT::f16, Promote); 288 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); 289 setOperationAction(ISD::FCOS, MVT::f16, Promote); 290 setOperationAction(ISD::FFLOOR, MVT::f16, Promote); 291 setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); 292 setOperationAction(ISD::FPOW, MVT::f16, Promote); 293 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 294 setOperationAction(ISD::FRINT, MVT::f16, Promote); 295 setOperationAction(ISD::FSIN, MVT::f16, Promote); 296 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 297 setOperationAction(ISD::FSQRT, MVT::f16, Promote); 298 setOperationAction(ISD::FEXP, MVT::f16, Promote); 299 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 300 setOperationAction(ISD::FLOG, MVT::f16, Promote); 301 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 302 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 303 setOperationAction(ISD::FROUND, MVT::f16, Promote); 304 setOperationAction(ISD::FTRUNC, MVT::f16, Promote); 305 setOperationAction(ISD::FMINNUM, MVT::f16, Promote); 306 setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); 307 setOperationAction(ISD::FMINNAN, MVT::f16, Promote); 308 setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); 309 310 // v4f16 is also a storage-only type, so promote it to v4f32 when that is 311 // known to be safe. 312 setOperationAction(ISD::FADD, MVT::v4f16, Promote); 313 setOperationAction(ISD::FSUB, MVT::v4f16, Promote); 314 setOperationAction(ISD::FMUL, MVT::v4f16, Promote); 315 setOperationAction(ISD::FDIV, MVT::v4f16, Promote); 316 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); 317 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); 318 AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); 319 AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); 320 AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); 321 AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); 322 AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); 323 AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); 324 325 // Expand all other v4f16 operations. 
326 // FIXME: We could generate better code by promoting some operations to 327 // a pair of v4f32s 328 setOperationAction(ISD::FABS, MVT::v4f16, Expand); 329 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); 330 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); 331 setOperationAction(ISD::FCOS, MVT::v4f16, Expand); 332 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); 333 setOperationAction(ISD::FMA, MVT::v4f16, Expand); 334 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); 335 setOperationAction(ISD::FNEG, MVT::v4f16, Expand); 336 setOperationAction(ISD::FPOW, MVT::v4f16, Expand); 337 setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); 338 setOperationAction(ISD::FREM, MVT::v4f16, Expand); 339 setOperationAction(ISD::FROUND, MVT::v4f16, Expand); 340 setOperationAction(ISD::FRINT, MVT::v4f16, Expand); 341 setOperationAction(ISD::FSIN, MVT::v4f16, Expand); 342 setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); 343 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); 344 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); 345 setOperationAction(ISD::SETCC, MVT::v4f16, Expand); 346 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); 347 setOperationAction(ISD::SELECT, MVT::v4f16, Expand); 348 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); 349 setOperationAction(ISD::FEXP, MVT::v4f16, Expand); 350 setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); 351 setOperationAction(ISD::FLOG, MVT::v4f16, Expand); 352 setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); 353 setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); 354 355 356 // v8f16 is also a storage-only type, so expand it. 357 setOperationAction(ISD::FABS, MVT::v8f16, Expand); 358 setOperationAction(ISD::FADD, MVT::v8f16, Expand); 359 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); 360 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); 361 setOperationAction(ISD::FCOS, MVT::v8f16, Expand); 362 setOperationAction(ISD::FDIV, MVT::v8f16, Expand); 363 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); 364 setOperationAction(ISD::FMA, MVT::v8f16, Expand); 365 setOperationAction(ISD::FMUL, MVT::v8f16, Expand); 366 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); 367 setOperationAction(ISD::FNEG, MVT::v8f16, Expand); 368 setOperationAction(ISD::FPOW, MVT::v8f16, Expand); 369 setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); 370 setOperationAction(ISD::FREM, MVT::v8f16, Expand); 371 setOperationAction(ISD::FROUND, MVT::v8f16, Expand); 372 setOperationAction(ISD::FRINT, MVT::v8f16, Expand); 373 setOperationAction(ISD::FSIN, MVT::v8f16, Expand); 374 setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); 375 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); 376 setOperationAction(ISD::FSUB, MVT::v8f16, Expand); 377 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); 378 setOperationAction(ISD::SETCC, MVT::v8f16, Expand); 379 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); 380 setOperationAction(ISD::SELECT, MVT::v8f16, Expand); 381 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); 382 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); 383 setOperationAction(ISD::FEXP, MVT::v8f16, Expand); 384 setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); 385 setOperationAction(ISD::FLOG, MVT::v8f16, Expand); 386 setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); 387 setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); 388 389 // AArch64 has implementations of a lot of rounding-like FP operations. 
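  // Illustrative mapping (an assumption, not stated in this file): the nodes
  // marked Legal below are expected to select to single instructions, e.g.
  //   FFLOOR -> frintm, FCEIL -> frintp, FTRUNC -> frintz, FROUND -> frinta,
  //   FRINT -> frintx, FNEARBYINT -> frinti,
  //   FMINNUM/FMAXNUM -> fminnm/fmaxnm, FMINNAN/FMAXNAN -> fmin/fmax.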
  for (MVT Ty : {MVT::f32, MVT::f64}) {
    setOperationAction(ISD::FFLOOR, Ty, Legal);
    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    setOperationAction(ISD::FCEIL, Ty, Legal);
    setOperationAction(ISD::FRINT, Ty, Legal);
    setOperationAction(ISD::FTRUNC, Ty, Legal);
    setOperationAction(ISD::FROUND, Ty, Legal);
    setOperationAction(ISD::FMINNUM, Ty, Legal);
    setOperationAction(ISD::FMAXNUM, Ty, Legal);
    setOperationAction(ISD::FMINNAN, Ty, Legal);
    setOperationAction(ISD::FMAXNAN, Ty, Legal);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
  // This requires the Performance Monitors extension.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (Subtarget->isTargetMachO()) {
    // For iOS, we don't want the normal expansion of a libcall to
    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
    // traffic.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  // Make floating-point constants legal for the large code model, so they
  // don't become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
  }

  // Trap.
470 setOperationAction(ISD::TRAP, MVT::Other, Legal); 471 472 // We combine OR nodes for bitfield operations. 473 setTargetDAGCombine(ISD::OR); 474 475 // Vector add and sub nodes may conceal a high-half opportunity. 476 // Also, try to fold ADD into CSINC/CSINV.. 477 setTargetDAGCombine(ISD::ADD); 478 setTargetDAGCombine(ISD::SUB); 479 480 setTargetDAGCombine(ISD::XOR); 481 setTargetDAGCombine(ISD::SINT_TO_FP); 482 setTargetDAGCombine(ISD::UINT_TO_FP); 483 484 setTargetDAGCombine(ISD::FP_TO_SINT); 485 setTargetDAGCombine(ISD::FP_TO_UINT); 486 setTargetDAGCombine(ISD::FDIV); 487 488 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 489 490 setTargetDAGCombine(ISD::ANY_EXTEND); 491 setTargetDAGCombine(ISD::ZERO_EXTEND); 492 setTargetDAGCombine(ISD::SIGN_EXTEND); 493 setTargetDAGCombine(ISD::BITCAST); 494 setTargetDAGCombine(ISD::CONCAT_VECTORS); 495 setTargetDAGCombine(ISD::STORE); 496 if (Subtarget->supportsAddressTopByteIgnored()) 497 setTargetDAGCombine(ISD::LOAD); 498 499 setTargetDAGCombine(ISD::MUL); 500 501 setTargetDAGCombine(ISD::SELECT); 502 setTargetDAGCombine(ISD::VSELECT); 503 504 setTargetDAGCombine(ISD::INTRINSIC_VOID); 505 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 506 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 507 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 508 509 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; 510 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; 511 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; 512 513 setStackPointerRegisterToSaveRestore(AArch64::SP); 514 515 setSchedulingPreference(Sched::Hybrid); 516 517 // Enable TBZ/TBNZ 518 MaskAndBranchFoldingIsLegal = true; 519 EnableExtLdPromotion = true; 520 521 setMinFunctionAlignment(2); 522 523 setHasExtractBitsInsn(true); 524 525 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 526 527 if (Subtarget->hasNEON()) { 528 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to 529 // silliness like this: 530 setOperationAction(ISD::FABS, MVT::v1f64, Expand); 531 setOperationAction(ISD::FADD, MVT::v1f64, Expand); 532 setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); 533 setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); 534 setOperationAction(ISD::FCOS, MVT::v1f64, Expand); 535 setOperationAction(ISD::FDIV, MVT::v1f64, Expand); 536 setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); 537 setOperationAction(ISD::FMA, MVT::v1f64, Expand); 538 setOperationAction(ISD::FMUL, MVT::v1f64, Expand); 539 setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); 540 setOperationAction(ISD::FNEG, MVT::v1f64, Expand); 541 setOperationAction(ISD::FPOW, MVT::v1f64, Expand); 542 setOperationAction(ISD::FREM, MVT::v1f64, Expand); 543 setOperationAction(ISD::FROUND, MVT::v1f64, Expand); 544 setOperationAction(ISD::FRINT, MVT::v1f64, Expand); 545 setOperationAction(ISD::FSIN, MVT::v1f64, Expand); 546 setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); 547 setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); 548 setOperationAction(ISD::FSUB, MVT::v1f64, Expand); 549 setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); 550 setOperationAction(ISD::SETCC, MVT::v1f64, Expand); 551 setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); 552 setOperationAction(ISD::SELECT, MVT::v1f64, Expand); 553 setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); 554 setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); 555 556 setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); 557 setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); 558 setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, 
                       Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

    setOperationAction(ISD::MUL, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector -> f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
    // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
    // -> v8f16 conversions.
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
    // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
      setOperationAction(ISD::FRINT, Ty, Legal);
      setOperationAction(ISD::FTRUNC, Ty, Legal);
      setOperationAction(ISD::FROUND, Ty, Legal);
    }
  }

  // Prefer likely predicted branches to selects on out-of-order cores.
627 if (Subtarget->isCortexA57()) 628 PredictableSelectIsExpensive = true; 629 } 630 631 void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { 632 if (VT == MVT::v2f32 || VT == MVT::v4f16) { 633 setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); 634 AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); 635 636 setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); 637 AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); 638 } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { 639 setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); 640 AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); 641 642 setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); 643 AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); 644 } 645 646 // Mark vector float intrinsics as expand. 647 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { 648 setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); 649 setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); 650 setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); 651 setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); 652 setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); 653 setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); 654 setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); 655 setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); 656 setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); 657 658 // But we do support custom-lowering for FCOPYSIGN. 659 setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); 660 } 661 662 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); 663 setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); 664 setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); 665 setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); 666 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); 667 setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); 668 setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); 669 setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); 670 setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); 671 setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); 672 setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); 673 setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); 674 675 setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); 676 setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); 677 setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); 678 for (MVT InnerVT : MVT::all_valuetypes()) 679 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); 680 681 // CNT supports only B element sizes. 682 if (VT != MVT::v8i8 && VT != MVT::v16i8) 683 setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); 684 685 setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); 686 setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); 687 setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); 688 setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); 689 setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); 690 691 setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); 692 setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); 693 694 // [SU][MIN|MAX] are available for all NEON types apart from i64. 
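  // Likely rationale (an assumption, not stated here): the NEON SMIN/SMAX/
  // UMIN/UMAX instructions cover the 8B/16B/4H/8H/2S/4S arrangements but have
  // no 64-bit-element form, hence the v1i64/v2i64 exclusion below.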
  if (!VT.isFloatingPoint() &&
      VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT.getSimpleVT(), Legal);

  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
  if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
                            ISD::FMINNUM, ISD::FMAXNUM})
      setOperationAction(Opcode, VT.getSimpleVT(), Legal);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
    }
  }
}

void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT, MVT::v2i32);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT, MVT::v4i32);
}

EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                              EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
    const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::CSEL: {
    APInt KnownZero2, KnownOne2;
    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
    KnownZero &= KnownZero2;
    KnownOne &= KnownOne2;
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = KnownOne.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarType().getSizeInBits();
      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
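      // For illustration (an assumption about downstream combines): a umaxv of
      // a v16i8 vector yields an i32 whose bits [31:8] are necessarily zero,
      // so a following "and w0, w0, #0xff" can be recognised as redundant.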
777 MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); 778 unsigned BitWidth = KnownZero.getBitWidth(); 779 if (VT == MVT::v8i8 || VT == MVT::v16i8) { 780 assert(BitWidth >= 8 && "Unexpected width!"); 781 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); 782 KnownZero |= Mask; 783 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { 784 assert(BitWidth >= 16 && "Unexpected width!"); 785 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 786 KnownZero |= Mask; 787 } 788 break; 789 } break; 790 } 791 } 792 } 793 } 794 795 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, 796 EVT) const { 797 return MVT::i64; 798 } 799 800 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 801 unsigned AddrSpace, 802 unsigned Align, 803 bool *Fast) const { 804 if (Subtarget->requiresStrictAlign()) 805 return false; 806 807 // FIXME: This is mostly true for Cyclone, but not necessarily others. 808 if (Fast) { 809 // FIXME: Define an attribute for slow unaligned accesses instead of 810 // relying on the CPU type as a proxy. 811 // On Cyclone, unaligned 128-bit stores are slow. 812 *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || 813 // See comments in performSTORECombine() for more details about 814 // these conditions. 815 816 // Code that uses clang vector extensions can mark that it 817 // wants unaligned accesses to be treated as fast by 818 // underspecifying alignment to be 1 or 2. 819 Align <= 2 || 820 821 // Disregard v2i64. Memcpy lowering produces those and splitting 822 // them regresses performance on micro-benchmarks and olden/bh. 823 VT == MVT::v2i64; 824 } 825 return true; 826 } 827 828 FastISel * 829 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 830 const TargetLibraryInfo *libInfo) const { 831 return AArch64::createFastISel(funcInfo, libInfo); 832 } 833 834 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 835 switch ((AArch64ISD::NodeType)Opcode) { 836 case AArch64ISD::FIRST_NUMBER: break; 837 case AArch64ISD::CALL: return "AArch64ISD::CALL"; 838 case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; 839 case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; 840 case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; 841 case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; 842 case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; 843 case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; 844 case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; 845 case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; 846 case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; 847 case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; 848 case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; 849 case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; 850 case AArch64ISD::ADC: return "AArch64ISD::ADC"; 851 case AArch64ISD::SBC: return "AArch64ISD::SBC"; 852 case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; 853 case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; 854 case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; 855 case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; 856 case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; 857 case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; 858 case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; 859 case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; 860 case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; 861 case AArch64ISD::DUP: return "AArch64ISD::DUP"; 862 case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; 863 case 
AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; 864 case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; 865 case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; 866 case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; 867 case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; 868 case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; 869 case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; 870 case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; 871 case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; 872 case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; 873 case AArch64ISD::BICi: return "AArch64ISD::BICi"; 874 case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; 875 case AArch64ISD::BSL: return "AArch64ISD::BSL"; 876 case AArch64ISD::NEG: return "AArch64ISD::NEG"; 877 case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; 878 case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; 879 case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; 880 case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; 881 case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; 882 case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; 883 case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; 884 case AArch64ISD::REV16: return "AArch64ISD::REV16"; 885 case AArch64ISD::REV32: return "AArch64ISD::REV32"; 886 case AArch64ISD::REV64: return "AArch64ISD::REV64"; 887 case AArch64ISD::EXT: return "AArch64ISD::EXT"; 888 case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; 889 case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; 890 case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; 891 case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; 892 case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; 893 case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; 894 case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; 895 case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; 896 case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; 897 case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; 898 case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; 899 case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; 900 case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; 901 case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; 902 case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; 903 case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; 904 case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; 905 case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; 906 case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; 907 case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; 908 case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; 909 case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; 910 case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; 911 case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; 912 case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; 913 case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; 914 case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; 915 case AArch64ISD::NOT: return "AArch64ISD::NOT"; 916 case AArch64ISD::BIT: return "AArch64ISD::BIT"; 917 case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; 918 case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; 919 case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; 920 case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; 921 case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; 922 case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; 923 case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; 924 case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; 925 case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; 926 case 
AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; 927 case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; 928 case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; 929 case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; 930 case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; 931 case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; 932 case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; 933 case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; 934 case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; 935 case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; 936 case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; 937 case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; 938 case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; 939 case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; 940 case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; 941 case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; 942 case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; 943 case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; 944 case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; 945 case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; 946 case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; 947 case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; 948 case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; 949 case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; 950 case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; 951 case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; 952 case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; 953 case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; 954 case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; 955 case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; 956 case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; 957 } 958 return nullptr; 959 } 960 961 MachineBasicBlock * 962 AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, 963 MachineBasicBlock *MBB) const { 964 // We materialise the F128CSEL pseudo-instruction as some control flow and a 965 // phi node: 966 967 // OrigBB: 968 // [... previous instrs leading to comparison ...] 
969 // b.ne TrueBB 970 // b EndBB 971 // TrueBB: 972 // ; Fallthrough 973 // EndBB: 974 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] 975 976 MachineFunction *MF = MBB->getParent(); 977 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 978 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 979 DebugLoc DL = MI->getDebugLoc(); 980 MachineFunction::iterator It = ++MBB->getIterator(); 981 982 unsigned DestReg = MI->getOperand(0).getReg(); 983 unsigned IfTrueReg = MI->getOperand(1).getReg(); 984 unsigned IfFalseReg = MI->getOperand(2).getReg(); 985 unsigned CondCode = MI->getOperand(3).getImm(); 986 bool NZCVKilled = MI->getOperand(4).isKill(); 987 988 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 989 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 990 MF->insert(It, TrueBB); 991 MF->insert(It, EndBB); 992 993 // Transfer rest of current basic-block to EndBB 994 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), 995 MBB->end()); 996 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 997 998 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); 999 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); 1000 MBB->addSuccessor(TrueBB); 1001 MBB->addSuccessor(EndBB); 1002 1003 // TrueBB falls through to the end. 1004 TrueBB->addSuccessor(EndBB); 1005 1006 if (!NZCVKilled) { 1007 TrueBB->addLiveIn(AArch64::NZCV); 1008 EndBB->addLiveIn(AArch64::NZCV); 1009 } 1010 1011 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) 1012 .addReg(IfTrueReg) 1013 .addMBB(TrueBB) 1014 .addReg(IfFalseReg) 1015 .addMBB(MBB); 1016 1017 MI->eraseFromParent(); 1018 return EndBB; 1019 } 1020 1021 MachineBasicBlock * 1022 AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 1023 MachineBasicBlock *BB) const { 1024 switch (MI->getOpcode()) { 1025 default: 1026 #ifndef NDEBUG 1027 MI->dump(); 1028 #endif 1029 llvm_unreachable("Unexpected instruction for custom inserter!"); 1030 1031 case AArch64::F128CSEL: 1032 return EmitF128CSEL(MI, BB); 1033 1034 case TargetOpcode::STACKMAP: 1035 case TargetOpcode::PATCHPOINT: 1036 return emitPatchPoint(MI, BB); 1037 } 1038 } 1039 1040 //===----------------------------------------------------------------------===// 1041 // AArch64 Lowering private implementation. 1042 //===----------------------------------------------------------------------===// 1043 1044 //===----------------------------------------------------------------------===// 1045 // Lowering Code 1046 //===----------------------------------------------------------------------===// 1047 1048 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 1049 /// CC 1050 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { 1051 switch (CC) { 1052 default: 1053 llvm_unreachable("Unknown condition code!"); 1054 case ISD::SETNE: 1055 return AArch64CC::NE; 1056 case ISD::SETEQ: 1057 return AArch64CC::EQ; 1058 case ISD::SETGT: 1059 return AArch64CC::GT; 1060 case ISD::SETGE: 1061 return AArch64CC::GE; 1062 case ISD::SETLT: 1063 return AArch64CC::LT; 1064 case ISD::SETLE: 1065 return AArch64CC::LE; 1066 case ISD::SETUGT: 1067 return AArch64CC::HI; 1068 case ISD::SETUGE: 1069 return AArch64CC::HS; 1070 case ISD::SETULT: 1071 return AArch64CC::LO; 1072 case ISD::SETULE: 1073 return AArch64CC::LS; 1074 } 1075 } 1076 1077 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 
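/// Some FP conditions cannot be tested with a single AArch64 condition code,
/// so a second code is returned in CondCode2 (AArch64CC::AL there means it is
/// unused); for example SETONE is tested as "MI or GT" and SETUEQ as "EQ or
/// VS", as the switch below shows.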
1078 static void changeFPCCToAArch64CC(ISD::CondCode CC, 1079 AArch64CC::CondCode &CondCode, 1080 AArch64CC::CondCode &CondCode2) { 1081 CondCode2 = AArch64CC::AL; 1082 switch (CC) { 1083 default: 1084 llvm_unreachable("Unknown FP condition!"); 1085 case ISD::SETEQ: 1086 case ISD::SETOEQ: 1087 CondCode = AArch64CC::EQ; 1088 break; 1089 case ISD::SETGT: 1090 case ISD::SETOGT: 1091 CondCode = AArch64CC::GT; 1092 break; 1093 case ISD::SETGE: 1094 case ISD::SETOGE: 1095 CondCode = AArch64CC::GE; 1096 break; 1097 case ISD::SETOLT: 1098 CondCode = AArch64CC::MI; 1099 break; 1100 case ISD::SETOLE: 1101 CondCode = AArch64CC::LS; 1102 break; 1103 case ISD::SETONE: 1104 CondCode = AArch64CC::MI; 1105 CondCode2 = AArch64CC::GT; 1106 break; 1107 case ISD::SETO: 1108 CondCode = AArch64CC::VC; 1109 break; 1110 case ISD::SETUO: 1111 CondCode = AArch64CC::VS; 1112 break; 1113 case ISD::SETUEQ: 1114 CondCode = AArch64CC::EQ; 1115 CondCode2 = AArch64CC::VS; 1116 break; 1117 case ISD::SETUGT: 1118 CondCode = AArch64CC::HI; 1119 break; 1120 case ISD::SETUGE: 1121 CondCode = AArch64CC::PL; 1122 break; 1123 case ISD::SETLT: 1124 case ISD::SETULT: 1125 CondCode = AArch64CC::LT; 1126 break; 1127 case ISD::SETLE: 1128 case ISD::SETULE: 1129 CondCode = AArch64CC::LE; 1130 break; 1131 case ISD::SETNE: 1132 case ISD::SETUNE: 1133 CondCode = AArch64CC::NE; 1134 break; 1135 } 1136 } 1137 1138 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 1139 /// CC usable with the vector instructions. Fewer operations are available 1140 /// without a real NZCV register, so we have to use less efficient combinations 1141 /// to get the same effect. 1142 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, 1143 AArch64CC::CondCode &CondCode, 1144 AArch64CC::CondCode &CondCode2, 1145 bool &Invert) { 1146 Invert = false; 1147 switch (CC) { 1148 default: 1149 // Mostly the scalar mappings work fine. 1150 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1151 break; 1152 case ISD::SETUO: 1153 Invert = true; // Fallthrough 1154 case ISD::SETO: 1155 CondCode = AArch64CC::MI; 1156 CondCode2 = AArch64CC::GE; 1157 break; 1158 case ISD::SETUEQ: 1159 case ISD::SETULT: 1160 case ISD::SETULE: 1161 case ISD::SETUGT: 1162 case ISD::SETUGE: 1163 // All of the compare-mask comparisons are ordered, but we can switch 1164 // between the two by a double inversion. E.g. ULE == !OGT. 1165 Invert = true; 1166 changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); 1167 break; 1168 } 1169 } 1170 1171 static bool isLegalArithImmed(uint64_t C) { 1172 // Matches AArch64DAGToDAGISel::SelectArithImmed(). 1173 return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); 1174 } 1175 1176 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1177 SDLoc dl, SelectionDAG &DAG) { 1178 EVT VT = LHS.getValueType(); 1179 1180 if (VT.isFloatingPoint()) 1181 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); 1182 1183 // The CMP instruction is just an alias for SUBS, and representing it as 1184 // SUBS means that it's possible to get CSE with subtract operations. 1185 // A later phase can perform the optimization of setting the destination 1186 // register to WZR/XZR if it ends up being unused. 1187 unsigned Opcode = AArch64ISD::SUBS; 1188 1189 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && 1190 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 1191 // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on 1192 // the grounds that "op1 - (-op2) == op1 + op2". 
    // However, the C and V flags
    // can be set differently by this operation. It comes down to whether
    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
    // everything is fine. If not then the optimization is wrong. Thus general
    // comparisons are only valid if op2 != 0.

    // So, finally, the only LLVM-native comparisons that don't mention C and V
    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
    // the absence of information about op2.
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
             !isUnsignedIntSetCC(CC)) {
    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
    // of the signed comparisons.
    Opcode = AArch64ISD::ANDS;
    RHS = LHS.getOperand(1);
    LHS = LHS.getOperand(0);
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
      .getValue(1);
}

/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows expressing arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))", expressed as:
///   cmp A
///   ccmp B, inv(CB), CA
///   check for CB flags
///
/// In general we can create code for arbitrary "... (and (and A B) C)"
/// sequences. We can also implement some "or" expressions, because "(or A B)"
/// is equivalent to "not (and (not A) (not B))", and we can implement some
/// negation operations:
/// We can negate the results of a single comparison by inverting the flags
/// used when the predicate fails and inverting the flags tested in the next
/// instruction; we can also negate the results of the whole previous
/// conditional compare sequence by inverting the flags tested in the next
/// instruction. However, there is no way to negate the result of a partial
/// sequence.
///
/// Therefore, on encountering an "or" expression we can negate the subtree on
/// one side and have to be able to push the negate to the leaves of the
/// subtree on the other side (see also the comments in the code). As a
/// complete example:
///   "or (or (setCA (cmp A)) (setCB (cmp B)))
///       (and (setCC (cmp C)) (setCD (cmp D)))"
/// is transformed to
///   "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
///             (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
/// and implemented as:
///   cmp C
///   ccmp D, inv(CD), CC
///   ccmp A, CA, inv(CD)
///   ccmp B, CB, inv(CA)
///   check for CB flags
/// A counterexample is "or (and A B) (and C D)", which cannot be implemented
/// by conditional compare sequences.
/// @{

/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
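/// The NZCV immediate is the flag state that the instruction installs when
/// @p Condition does not hold; the callers pick it (via
/// AArch64CC::getNZCVToSatisfyCondCode) so that a failed earlier comparison
/// also fails the condition tested after this one.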
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                         ISD::CondCode CC, SDValue CCOp,
                                         SDValue Condition, unsigned NZCV,
                                         SDLoc DL, SelectionDAG &DAG) {
  unsigned Opcode = 0;
  if (LHS.getValueType().isFloatingPoint())
    Opcode = AArch64ISD::FCCMP;
  else if (RHS.getOpcode() == ISD::SUB) {
    SDValue SubOp0 = RHS.getOperand(0);
    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      // See emitComparison() on why we can only do this for SETEQ and SETNE.
      Opcode = AArch64ISD::CCMN;
      RHS = RHS.getOperand(1);
    }
  }
  if (Opcode == 0)
    Opcode = AArch64ISD::CCMP;

  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}

/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
/// CanPushNegate is set to true if we can push a negate operation through the
/// tree in a way that we are left with AND operations and negate operations at
/// the leaves only, i.e. "not (or (or x y) z)" can be changed to
/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
/// brought into such a form.
static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
                                         unsigned Depth = 0) {
  if (!Val.hasOneUse())
    return false;
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    CanPushNegate = true;
    return true;
  }
  // Protect against stack overflow.
  if (Depth > 15)
    return false;
  if (Opcode == ISD::AND || Opcode == ISD::OR) {
    SDValue O0 = Val->getOperand(0);
    SDValue O1 = Val->getOperand(1);
    bool CanPushNegateL;
    if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
      return false;
    bool CanPushNegateR;
    if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
      return false;
    // We cannot push a negate through an AND operation (it would become an
    // OR), but we can change a (not (or x y)) to (and (not x) (not y)) if we
    // can push the negate through the x/y subtrees.
    CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
    return true;
  }
  return false;
}

/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested or
/// returns SDValue() if the transformation was not possible.
/// On recursive invocations @p PushNegate may be set to true to have negation
/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
/// for the comparisons in the current subtree; @p Depth limits the search
/// depth to avoid stack overflow.
static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
    AArch64CC::CondCode &OutCC, bool PushNegate = false,
    SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
    unsigned Depth = 0) {
  // We're at a tree leaf, produce a conditional comparison operation.
1332 unsigned Opcode = Val->getOpcode();
1333 if (Opcode == ISD::SETCC) {
1334 SDValue LHS = Val->getOperand(0);
1335 SDValue RHS = Val->getOperand(1);
1336 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1337 bool isInteger = LHS.getValueType().isInteger();
1338 if (PushNegate)
1339 CC = getSetCCInverse(CC, isInteger);
1340 SDLoc DL(Val);
1341 // Determine OutCC and handle the FP special case.
1342 if (isInteger) {
1343 OutCC = changeIntCCToAArch64CC(CC);
1344 } else {
1345 assert(LHS.getValueType().isFloatingPoint());
1346 AArch64CC::CondCode ExtraCC;
1347 changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
1348 // Surprisingly some floating point conditions can't be tested with a
1349 // single condition code. Construct an additional comparison in this case.
1350 // See comment below on how we deal with OR conditions.
1351 if (ExtraCC != AArch64CC::AL) {
1352 SDValue ExtraCmp;
1353 if (!CCOp.getNode())
1354 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1355 else {
1356 SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
1357 // Note that we want the inverse of ExtraCC, so NZCV is not inverted.
1358 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
1359 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
1360 NZCV, DL, DAG);
1361 }
1362 CCOp = ExtraCmp;
1363 Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
1364 OutCC = AArch64CC::getInvertedCondCode(OutCC);
1365 }
1366 }
1367
1368 // Produce a normal comparison if we are first in the chain
1369 if (!CCOp.getNode())
1370 return emitComparison(LHS, RHS, CC, DL, DAG);
1371 // Otherwise produce a ccmp.
1372 SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
1373 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1374 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1375 return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
1376 DAG);
1377 } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse())
1378 return SDValue();
1379
1380 assert((Opcode == ISD::OR || !PushNegate)
1381 && "Can only push negate through OR operation");
1382
1383 // Check if both sides can be transformed.
1384 SDValue LHS = Val->getOperand(0);
1385 SDValue RHS = Val->getOperand(1);
1386 bool CanPushNegateL;
1387 if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
1388 return SDValue();
1389 bool CanPushNegateR;
1390 if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
1391 return SDValue();
1392
1393 // Do we need to negate our operands?
1394 bool NegateOperands = Opcode == ISD::OR;
1395 // We can negate the results of all previous operations by inverting the
1396 // predicate flags giving us a free negation for one side. For the other side
1397 // we need to be able to push the negation to the leaves of the tree.
1398 if (NegateOperands) {
1399 if (!CanPushNegateL && !CanPushNegateR)
1400 return SDValue();
1401 // Order the side where we can push the negate through to LHS.
1402 if (!CanPushNegateL && CanPushNegateR)
1403 std::swap(LHS, RHS);
1404 } else {
1405 bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
1406 bool NeedsNegOutR = RHS->getOpcode() == ISD::OR;
1407 if (NeedsNegOutL && NeedsNegOutR)
1408 return SDValue();
1409 // Order the side where we need to negate the output flags to RHS so it
1410 // gets emitted first.
1411 if (NeedsNegOutL)
1412 std::swap(LHS, RHS);
1413 }
1414
1415 // Emit RHS.
If we want to negate the tree we only need to push a negate 1416 // through if we are already in a PushNegate case, otherwise we can negate 1417 // the "flags to test" afterwards. 1418 AArch64CC::CondCode RHSCC; 1419 SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, 1420 CCOp, Predicate, Depth+1); 1421 if (NegateOperands && !PushNegate) 1422 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 1423 // Emit LHS. We must push the negate through if we need to negate it. 1424 SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, 1425 CmpR, RHSCC, Depth+1); 1426 // If we transformed an OR to and AND then we have to negate the result 1427 // (or absorb a PushNegate resulting in a double negation). 1428 if (Opcode == ISD::OR && !PushNegate) 1429 OutCC = AArch64CC::getInvertedCondCode(OutCC); 1430 return CmpL; 1431 } 1432 1433 /// @} 1434 1435 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1436 SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { 1437 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1438 EVT VT = RHS.getValueType(); 1439 uint64_t C = RHSC->getZExtValue(); 1440 if (!isLegalArithImmed(C)) { 1441 // Constant does not fit, try adjusting it by one? 1442 switch (CC) { 1443 default: 1444 break; 1445 case ISD::SETLT: 1446 case ISD::SETGE: 1447 if ((VT == MVT::i32 && C != 0x80000000 && 1448 isLegalArithImmed((uint32_t)(C - 1))) || 1449 (VT == MVT::i64 && C != 0x80000000ULL && 1450 isLegalArithImmed(C - 1ULL))) { 1451 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1452 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1453 RHS = DAG.getConstant(C, dl, VT); 1454 } 1455 break; 1456 case ISD::SETULT: 1457 case ISD::SETUGE: 1458 if ((VT == MVT::i32 && C != 0 && 1459 isLegalArithImmed((uint32_t)(C - 1))) || 1460 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 1461 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1462 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1463 RHS = DAG.getConstant(C, dl, VT); 1464 } 1465 break; 1466 case ISD::SETLE: 1467 case ISD::SETGT: 1468 if ((VT == MVT::i32 && C != INT32_MAX && 1469 isLegalArithImmed((uint32_t)(C + 1))) || 1470 (VT == MVT::i64 && C != INT64_MAX && 1471 isLegalArithImmed(C + 1ULL))) { 1472 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1473 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1474 RHS = DAG.getConstant(C, dl, VT); 1475 } 1476 break; 1477 case ISD::SETULE: 1478 case ISD::SETUGT: 1479 if ((VT == MVT::i32 && C != UINT32_MAX && 1480 isLegalArithImmed((uint32_t)(C + 1))) || 1481 (VT == MVT::i64 && C != UINT64_MAX && 1482 isLegalArithImmed(C + 1ULL))) { 1483 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1484 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1485 RHS = DAG.getConstant(C, dl, VT); 1486 } 1487 break; 1488 } 1489 } 1490 } 1491 SDValue Cmp; 1492 AArch64CC::CondCode AArch64CC; 1493 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 1494 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); 1495 1496 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 1497 // For the i8 operand, the largest immediate is 255, so this can be easily 1498 // encoded in the compare instruction. For the i16 operand, however, the 1499 // largest immediate cannot be encoded in the compare. 1500 // Therefore, use a sign extending load and cmn to avoid materializing the 1501 // -1 constant. 
For example, 1502 // movz w1, #65535 1503 // ldrh w0, [x0, #0] 1504 // cmp w0, w1 1505 // > 1506 // ldrsh w0, [x0, #0] 1507 // cmn w0, #1 1508 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 1509 // if and only if (sext LHS) == (sext RHS). The checks are in place to 1510 // ensure both the LHS and RHS are truly zero extended and to make sure the 1511 // transformation is profitable. 1512 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 1513 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 1514 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 1515 LHS.getNode()->hasNUsesOfValue(1, 0)) { 1516 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 1517 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 1518 SDValue SExt = 1519 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 1520 DAG.getValueType(MVT::i16)); 1521 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 1522 RHS.getValueType()), 1523 CC, dl, DAG); 1524 AArch64CC = changeIntCCToAArch64CC(CC); 1525 } 1526 } 1527 1528 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { 1529 if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { 1530 if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) 1531 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 1532 } 1533 } 1534 } 1535 1536 if (!Cmp) { 1537 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 1538 AArch64CC = changeIntCCToAArch64CC(CC); 1539 } 1540 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 1541 return Cmp; 1542 } 1543 1544 static std::pair<SDValue, SDValue> 1545 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 1546 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 1547 "Unsupported value type"); 1548 SDValue Value, Overflow; 1549 SDLoc DL(Op); 1550 SDValue LHS = Op.getOperand(0); 1551 SDValue RHS = Op.getOperand(1); 1552 unsigned Opc = 0; 1553 switch (Op.getOpcode()) { 1554 default: 1555 llvm_unreachable("Unknown overflow instruction!"); 1556 case ISD::SADDO: 1557 Opc = AArch64ISD::ADDS; 1558 CC = AArch64CC::VS; 1559 break; 1560 case ISD::UADDO: 1561 Opc = AArch64ISD::ADDS; 1562 CC = AArch64CC::HS; 1563 break; 1564 case ISD::SSUBO: 1565 Opc = AArch64ISD::SUBS; 1566 CC = AArch64CC::VS; 1567 break; 1568 case ISD::USUBO: 1569 Opc = AArch64ISD::SUBS; 1570 CC = AArch64CC::LO; 1571 break; 1572 // Multiply needs a little bit extra work. 1573 case ISD::SMULO: 1574 case ISD::UMULO: { 1575 CC = AArch64CC::NE; 1576 bool IsSigned = Op.getOpcode() == ISD::SMULO; 1577 if (Op.getValueType() == MVT::i32) { 1578 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1579 // For a 32 bit multiply with overflow check we want the instruction 1580 // selector to generate a widening multiply (SMADDL/UMADDL). For that we 1581 // need to generate the following pattern: 1582 // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) 1583 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 1584 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 1585 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1586 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, 1587 DAG.getConstant(0, DL, MVT::i64)); 1588 // On AArch64 the upper 32 bits are always zero extended for a 32 bit 1589 // operation. We need to clear out the upper 32 bits, because we used a 1590 // widening multiply that wrote all 64 bits. In the end this should be a 1591 // noop. 
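// Sketch of the nodes built below for the signed i32 case (the unsigned case
// only has to test the upper 32 bits):
//   Value     = (i32 trunc Add)
//   UpperBits = (i32 trunc (i64 srl Add, 32))
//   LowerBits = (i32 sra Value, 31)
//   Overflow  = flag result of (AArch64ISD::SUBS UpperBits, LowerBits),
//               tested with the NE condition chosen above.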
1592 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); 1593 if (IsSigned) { 1594 // The signed overflow check requires more than just a simple check for 1595 // any bit set in the upper 32 bits of the result. These bits could be 1596 // just the sign bits of a negative number. To perform the overflow 1597 // check we have to arithmetic shift right the 32nd bit of the result by 1598 // 31 bits. Then we compare the result to the upper 32 bits. 1599 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, 1600 DAG.getConstant(32, DL, MVT::i64)); 1601 UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); 1602 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, 1603 DAG.getConstant(31, DL, MVT::i64)); 1604 // It is important that LowerBits is last, otherwise the arithmetic 1605 // shift will not be folded into the compare (SUBS). 1606 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); 1607 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1608 .getValue(1); 1609 } else { 1610 // The overflow check for unsigned multiply is easy. We only need to 1611 // check if any of the upper 32 bits are set. This can be done with a 1612 // CMP (shifted register). For that we need to generate the following 1613 // pattern: 1614 // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) 1615 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, 1616 DAG.getConstant(32, DL, MVT::i64)); 1617 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1618 Overflow = 1619 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 1620 DAG.getConstant(0, DL, MVT::i64), 1621 UpperBits).getValue(1); 1622 } 1623 break; 1624 } 1625 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 1626 // For the 64 bit multiply 1627 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1628 if (IsSigned) { 1629 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 1630 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 1631 DAG.getConstant(63, DL, MVT::i64)); 1632 // It is important that LowerBits is last, otherwise the arithmetic 1633 // shift will not be folded into the compare (SUBS). 1634 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1635 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1636 .getValue(1); 1637 } else { 1638 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 1639 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1640 Overflow = 1641 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 1642 DAG.getConstant(0, DL, MVT::i64), 1643 UpperBits).getValue(1); 1644 } 1645 break; 1646 } 1647 } // switch (...) 1648 1649 if (Opc) { 1650 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 1651 1652 // Emit the AArch64 operation with overflow check. 1653 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 1654 Overflow = Value.getValue(1); 1655 } 1656 return std::make_pair(Value, Overflow); 1657 } 1658 1659 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, 1660 RTLIB::Libcall Call) const { 1661 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 1662 return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; 1663 } 1664 1665 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { 1666 SDValue Sel = Op.getOperand(0); 1667 SDValue Other = Op.getOperand(1); 1668 1669 // If neither operand is a SELECT_CC, give up. 
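// XOR is commutative, so if the first operand is not the SELECT_CC, try the
// second one before bailing out.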
1670 if (Sel.getOpcode() != ISD::SELECT_CC) 1671 std::swap(Sel, Other); 1672 if (Sel.getOpcode() != ISD::SELECT_CC) 1673 return Op; 1674 1675 // The folding we want to perform is: 1676 // (xor x, (select_cc a, b, cc, 0, -1) ) 1677 // --> 1678 // (csel x, (xor x, -1), cc ...) 1679 // 1680 // The latter will get matched to a CSINV instruction. 1681 1682 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 1683 SDValue LHS = Sel.getOperand(0); 1684 SDValue RHS = Sel.getOperand(1); 1685 SDValue TVal = Sel.getOperand(2); 1686 SDValue FVal = Sel.getOperand(3); 1687 SDLoc dl(Sel); 1688 1689 // FIXME: This could be generalized to non-integer comparisons. 1690 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 1691 return Op; 1692 1693 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 1694 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 1695 1696 // The values aren't constants, this isn't the pattern we're looking for. 1697 if (!CFVal || !CTVal) 1698 return Op; 1699 1700 // We can commute the SELECT_CC by inverting the condition. This 1701 // might be needed to make this fit into a CSINV pattern. 1702 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { 1703 std::swap(TVal, FVal); 1704 std::swap(CTVal, CFVal); 1705 CC = ISD::getSetCCInverse(CC, true); 1706 } 1707 1708 // If the constants line up, perform the transform! 1709 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { 1710 SDValue CCVal; 1711 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 1712 1713 FVal = Other; 1714 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 1715 DAG.getConstant(-1ULL, dl, Other.getValueType())); 1716 1717 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 1718 CCVal, Cmp); 1719 } 1720 1721 return Op; 1722 } 1723 1724 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 1725 EVT VT = Op.getValueType(); 1726 1727 // Let legalize expand this if it isn't a legal type yet. 1728 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 1729 return SDValue(); 1730 1731 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 1732 1733 unsigned Opc; 1734 bool ExtraOp = false; 1735 switch (Op.getOpcode()) { 1736 default: 1737 llvm_unreachable("Invalid code"); 1738 case ISD::ADDC: 1739 Opc = AArch64ISD::ADDS; 1740 break; 1741 case ISD::SUBC: 1742 Opc = AArch64ISD::SUBS; 1743 break; 1744 case ISD::ADDE: 1745 Opc = AArch64ISD::ADCS; 1746 ExtraOp = true; 1747 break; 1748 case ISD::SUBE: 1749 Opc = AArch64ISD::SBCS; 1750 ExtraOp = true; 1751 break; 1752 } 1753 1754 if (!ExtraOp) 1755 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); 1756 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), 1757 Op.getOperand(2)); 1758 } 1759 1760 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 1761 // Let legalize expand this if it isn't a legal type yet. 1762 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 1763 return SDValue(); 1764 1765 SDLoc dl(Op); 1766 AArch64CC::CondCode CC; 1767 // The actual operation that sets the overflow or carry flag. 1768 SDValue Value, Overflow; 1769 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 1770 1771 // We use 0 and 1 as false and true values. 1772 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 1773 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 1774 1775 // We use an inverted condition, because the conditional select is inverted 1776 // too. 
This will allow it to be selected to a single instruction: 1777 // CSINC Wd, WZR, WZR, invert(cond). 1778 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 1779 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 1780 CCVal, Overflow); 1781 1782 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 1783 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 1784 } 1785 1786 // Prefetch operands are: 1787 // 1: Address to prefetch 1788 // 2: bool isWrite 1789 // 3: int locality (0 = no locality ... 3 = extreme locality) 1790 // 4: bool isDataCache 1791 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 1792 SDLoc DL(Op); 1793 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 1794 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 1795 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 1796 1797 bool IsStream = !Locality; 1798 // When the locality number is set 1799 if (Locality) { 1800 // The front-end should have filtered out the out-of-range values 1801 assert(Locality <= 3 && "Prefetch locality out-of-range"); 1802 // The locality degree is the opposite of the cache speed. 1803 // Put the number the other way around. 1804 // The encoding starts at 0 for level 1 1805 Locality = 3 - Locality; 1806 } 1807 1808 // built the mask value encoding the expected behavior. 1809 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 1810 (!IsData << 3) | // IsDataCache bit 1811 (Locality << 1) | // Cache level bits 1812 (unsigned)IsStream; // Stream bit 1813 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 1814 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); 1815 } 1816 1817 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 1818 SelectionDAG &DAG) const { 1819 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1820 1821 RTLIB::Libcall LC; 1822 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1823 1824 return LowerF128Call(Op, DAG, LC); 1825 } 1826 1827 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 1828 SelectionDAG &DAG) const { 1829 if (Op.getOperand(0).getValueType() != MVT::f128) { 1830 // It's legal except when f128 is involved 1831 return Op; 1832 } 1833 1834 RTLIB::Libcall LC; 1835 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1836 1837 // FP_ROUND node has a second operand indicating whether it is known to be 1838 // precise. That doesn't take part in the LibCall so we can't directly use 1839 // LowerF128Call. 1840 SDValue SrcVal = Op.getOperand(0); 1841 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 1842 SDLoc(Op)).first; 1843 } 1844 1845 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 1846 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1847 // Any additional optimization in this function should be recorded 1848 // in the cost tables. 1849 EVT InVT = Op.getOperand(0).getValueType(); 1850 EVT VT = Op.getValueType(); 1851 unsigned NumElts = InVT.getVectorNumElements(); 1852 1853 // f16 vectors are promoted to f32 before a conversion. 
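// E.g. (v4i16 fp_to_sint (v4f16 x)) is rewritten below as
// (v4i16 fp_to_sint (v4f32 fp_extend x)) and then lowered from there.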
1854 if (InVT.getVectorElementType() == MVT::f16) { 1855 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); 1856 SDLoc dl(Op); 1857 return DAG.getNode( 1858 Op.getOpcode(), dl, Op.getValueType(), 1859 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); 1860 } 1861 1862 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1863 SDLoc dl(Op); 1864 SDValue Cv = 1865 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 1866 Op.getOperand(0)); 1867 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 1868 } 1869 1870 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1871 SDLoc dl(Op); 1872 MVT ExtVT = 1873 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 1874 VT.getVectorNumElements()); 1875 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 1876 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 1877 } 1878 1879 // Type changing conversions are illegal. 1880 return Op; 1881 } 1882 1883 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 1884 SelectionDAG &DAG) const { 1885 if (Op.getOperand(0).getValueType().isVector()) 1886 return LowerVectorFP_TO_INT(Op, DAG); 1887 1888 // f16 conversions are promoted to f32. 1889 if (Op.getOperand(0).getValueType() == MVT::f16) { 1890 SDLoc dl(Op); 1891 return DAG.getNode( 1892 Op.getOpcode(), dl, Op.getValueType(), 1893 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); 1894 } 1895 1896 if (Op.getOperand(0).getValueType() != MVT::f128) { 1897 // It's legal except when f128 is involved 1898 return Op; 1899 } 1900 1901 RTLIB::Libcall LC; 1902 if (Op.getOpcode() == ISD::FP_TO_SINT) 1903 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1904 else 1905 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1906 1907 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 1908 return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; 1909 } 1910 1911 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 1912 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1913 // Any additional optimization in this function should be recorded 1914 // in the cost tables. 1915 EVT VT = Op.getValueType(); 1916 SDLoc dl(Op); 1917 SDValue In = Op.getOperand(0); 1918 EVT InVT = In.getValueType(); 1919 1920 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1921 MVT CastVT = 1922 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 1923 InVT.getVectorNumElements()); 1924 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); 1925 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); 1926 } 1927 1928 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1929 unsigned CastOpc = 1930 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1931 EVT CastVT = VT.changeVectorElementTypeToInteger(); 1932 In = DAG.getNode(CastOpc, dl, CastVT, In); 1933 return DAG.getNode(Op.getOpcode(), dl, VT, In); 1934 } 1935 1936 return Op; 1937 } 1938 1939 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 1940 SelectionDAG &DAG) const { 1941 if (Op.getValueType().isVector()) 1942 return LowerVectorINT_TO_FP(Op, DAG); 1943 1944 // f16 conversions are promoted to f32. 1945 if (Op.getValueType() == MVT::f16) { 1946 SDLoc dl(Op); 1947 return DAG.getNode( 1948 ISD::FP_ROUND, dl, MVT::f16, 1949 DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), 1950 DAG.getIntPtrConstant(0, dl)); 1951 } 1952 1953 // i128 conversions are libcalls. 
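// Returning SDValue() lets the generic legalizer expand these; e.g.
// (f64 sint_to_fp i128) typically becomes a call to the compiler-rt helper
// __floattidf.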
1954 if (Op.getOperand(0).getValueType() == MVT::i128) 1955 return SDValue(); 1956 1957 // Other conversions are legal, unless it's to the completely software-based 1958 // fp128. 1959 if (Op.getValueType() != MVT::f128) 1960 return Op; 1961 1962 RTLIB::Libcall LC; 1963 if (Op.getOpcode() == ISD::SINT_TO_FP) 1964 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 1965 else 1966 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 1967 1968 return LowerF128Call(Op, DAG, LC); 1969 } 1970 1971 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 1972 SelectionDAG &DAG) const { 1973 // For iOS, we want to call an alternative entry point: __sincos_stret, 1974 // which returns the values in two S / D registers. 1975 SDLoc dl(Op); 1976 SDValue Arg = Op.getOperand(0); 1977 EVT ArgVT = Arg.getValueType(); 1978 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 1979 1980 ArgListTy Args; 1981 ArgListEntry Entry; 1982 1983 Entry.Node = Arg; 1984 Entry.Ty = ArgTy; 1985 Entry.isSExt = false; 1986 Entry.isZExt = false; 1987 Args.push_back(Entry); 1988 1989 const char *LibcallName = 1990 (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; 1991 SDValue Callee = 1992 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); 1993 1994 StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 1995 TargetLowering::CallLoweringInfo CLI(DAG); 1996 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 1997 .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); 1998 1999 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2000 return CallResult.first; 2001 } 2002 2003 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 2004 if (Op.getValueType() != MVT::f16) 2005 return SDValue(); 2006 2007 assert(Op.getOperand(0).getValueType() == MVT::i16); 2008 SDLoc DL(Op); 2009 2010 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 2011 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 2012 return SDValue( 2013 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, 2014 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 2015 0); 2016 } 2017 2018 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 2019 if (OrigVT.getSizeInBits() >= 64) 2020 return OrigVT; 2021 2022 assert(OrigVT.isSimple() && "Expecting a simple value type"); 2023 2024 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 2025 switch (OrigSimpleTy) { 2026 default: llvm_unreachable("Unexpected Vector Type"); 2027 case MVT::v2i8: 2028 case MVT::v2i16: 2029 return MVT::v2i32; 2030 case MVT::v4i8: 2031 return MVT::v4i16; 2032 } 2033 } 2034 2035 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 2036 const EVT &OrigTy, 2037 const EVT &ExtTy, 2038 unsigned ExtOpcode) { 2039 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 2040 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 2041 // 64-bits we need to insert a new extension so that it will be 64-bits. 2042 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 2043 if (OrigTy.getSizeInBits() >= 64) 2044 return N; 2045 2046 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
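// E.g. an extension from v4i8 is re-extended through v4i16 (see
// getExtensionTo64Bits above) so the operand fed to the widening multiply is a
// 64-bit vector.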
2047 EVT NewVT = getExtensionTo64Bits(OrigTy); 2048 2049 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 2050 } 2051 2052 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 2053 bool isSigned) { 2054 EVT VT = N->getValueType(0); 2055 2056 if (N->getOpcode() != ISD::BUILD_VECTOR) 2057 return false; 2058 2059 for (const SDValue &Elt : N->op_values()) { 2060 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 2061 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 2062 unsigned HalfSize = EltSize / 2; 2063 if (isSigned) { 2064 if (!isIntN(HalfSize, C->getSExtValue())) 2065 return false; 2066 } else { 2067 if (!isUIntN(HalfSize, C->getZExtValue())) 2068 return false; 2069 } 2070 continue; 2071 } 2072 return false; 2073 } 2074 2075 return true; 2076 } 2077 2078 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 2079 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 2080 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 2081 N->getOperand(0)->getValueType(0), 2082 N->getValueType(0), 2083 N->getOpcode()); 2084 2085 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 2086 EVT VT = N->getValueType(0); 2087 SDLoc dl(N); 2088 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 2089 unsigned NumElts = VT.getVectorNumElements(); 2090 MVT TruncVT = MVT::getIntegerVT(EltSize); 2091 SmallVector<SDValue, 8> Ops; 2092 for (unsigned i = 0; i != NumElts; ++i) { 2093 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 2094 const APInt &CInt = C->getAPIntValue(); 2095 // Element types smaller than 32 bits are not legal, so use i32 elements. 2096 // The values are implicitly truncated so sext vs. zext doesn't matter. 2097 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 2098 } 2099 return DAG.getNode(ISD::BUILD_VECTOR, dl, 2100 MVT::getVectorVT(TruncVT, NumElts), Ops); 2101 } 2102 2103 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 2104 if (N->getOpcode() == ISD::SIGN_EXTEND) 2105 return true; 2106 if (isExtendedBUILD_VECTOR(N, DAG, true)) 2107 return true; 2108 return false; 2109 } 2110 2111 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 2112 if (N->getOpcode() == ISD::ZERO_EXTEND) 2113 return true; 2114 if (isExtendedBUILD_VECTOR(N, DAG, false)) 2115 return true; 2116 return false; 2117 } 2118 2119 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 2120 unsigned Opcode = N->getOpcode(); 2121 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2122 SDNode *N0 = N->getOperand(0).getNode(); 2123 SDNode *N1 = N->getOperand(1).getNode(); 2124 return N0->hasOneUse() && N1->hasOneUse() && 2125 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 2126 } 2127 return false; 2128 } 2129 2130 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 2131 unsigned Opcode = N->getOpcode(); 2132 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2133 SDNode *N0 = N->getOperand(0).getNode(); 2134 SDNode *N1 = N->getOperand(1).getNode(); 2135 return N0->hasOneUse() && N1->hasOneUse() && 2136 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 2137 } 2138 return false; 2139 } 2140 2141 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 2142 // Multiplications are only custom-lowered for 128-bit vectors so that 2143 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
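// E.g. (v2i64 mul (sext v2i32 a), (sext v2i32 b)) is matched here and should
// end up as a single "smull v0.2d, v1.2s, v2.2s" rather than being scalarized.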
2144 EVT VT = Op.getValueType(); 2145 assert(VT.is128BitVector() && VT.isInteger() && 2146 "unexpected type for custom-lowering ISD::MUL"); 2147 SDNode *N0 = Op.getOperand(0).getNode(); 2148 SDNode *N1 = Op.getOperand(1).getNode(); 2149 unsigned NewOpc = 0; 2150 bool isMLA = false; 2151 bool isN0SExt = isSignExtended(N0, DAG); 2152 bool isN1SExt = isSignExtended(N1, DAG); 2153 if (isN0SExt && isN1SExt) 2154 NewOpc = AArch64ISD::SMULL; 2155 else { 2156 bool isN0ZExt = isZeroExtended(N0, DAG); 2157 bool isN1ZExt = isZeroExtended(N1, DAG); 2158 if (isN0ZExt && isN1ZExt) 2159 NewOpc = AArch64ISD::UMULL; 2160 else if (isN1SExt || isN1ZExt) { 2161 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 2162 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 2163 if (isN1SExt && isAddSubSExt(N0, DAG)) { 2164 NewOpc = AArch64ISD::SMULL; 2165 isMLA = true; 2166 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 2167 NewOpc = AArch64ISD::UMULL; 2168 isMLA = true; 2169 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 2170 std::swap(N0, N1); 2171 NewOpc = AArch64ISD::UMULL; 2172 isMLA = true; 2173 } 2174 } 2175 2176 if (!NewOpc) { 2177 if (VT == MVT::v2i64) 2178 // Fall through to expand this. It is not legal. 2179 return SDValue(); 2180 else 2181 // Other vector multiplications are legal. 2182 return Op; 2183 } 2184 } 2185 2186 // Legalize to a S/UMULL instruction 2187 SDLoc DL(Op); 2188 SDValue Op0; 2189 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 2190 if (!isMLA) { 2191 Op0 = skipExtensionForVectorMULL(N0, DAG); 2192 assert(Op0.getValueType().is64BitVector() && 2193 Op1.getValueType().is64BitVector() && 2194 "unexpected types for extended operands to VMULL"); 2195 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 2196 } 2197 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 2198 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 2199 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 2200 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 2201 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 2202 EVT Op1VT = Op1.getValueType(); 2203 return DAG.getNode(N0->getOpcode(), DL, VT, 2204 DAG.getNode(NewOpc, DL, VT, 2205 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 2206 DAG.getNode(NewOpc, DL, VT, 2207 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 2208 } 2209 2210 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 2211 SelectionDAG &DAG) const { 2212 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2213 SDLoc dl(Op); 2214 switch (IntNo) { 2215 default: return SDValue(); // Don't custom lower most intrinsics. 
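// Reading the thread pointer maps to the THREAD_POINTER node, which is later
// selected to an MRS of TPIDR_EL0.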
2216 case Intrinsic::aarch64_thread_pointer: { 2217 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2218 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); 2219 } 2220 case Intrinsic::aarch64_neon_smax: 2221 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), 2222 Op.getOperand(1), Op.getOperand(2)); 2223 case Intrinsic::aarch64_neon_umax: 2224 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), 2225 Op.getOperand(1), Op.getOperand(2)); 2226 case Intrinsic::aarch64_neon_smin: 2227 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), 2228 Op.getOperand(1), Op.getOperand(2)); 2229 case Intrinsic::aarch64_neon_umin: 2230 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), 2231 Op.getOperand(1), Op.getOperand(2)); 2232 } 2233 } 2234 2235 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 2236 SelectionDAG &DAG) const { 2237 switch (Op.getOpcode()) { 2238 default: 2239 llvm_unreachable("unimplemented operand"); 2240 return SDValue(); 2241 case ISD::BITCAST: 2242 return LowerBITCAST(Op, DAG); 2243 case ISD::GlobalAddress: 2244 return LowerGlobalAddress(Op, DAG); 2245 case ISD::GlobalTLSAddress: 2246 return LowerGlobalTLSAddress(Op, DAG); 2247 case ISD::SETCC: 2248 return LowerSETCC(Op, DAG); 2249 case ISD::BR_CC: 2250 return LowerBR_CC(Op, DAG); 2251 case ISD::SELECT: 2252 return LowerSELECT(Op, DAG); 2253 case ISD::SELECT_CC: 2254 return LowerSELECT_CC(Op, DAG); 2255 case ISD::JumpTable: 2256 return LowerJumpTable(Op, DAG); 2257 case ISD::ConstantPool: 2258 return LowerConstantPool(Op, DAG); 2259 case ISD::BlockAddress: 2260 return LowerBlockAddress(Op, DAG); 2261 case ISD::VASTART: 2262 return LowerVASTART(Op, DAG); 2263 case ISD::VACOPY: 2264 return LowerVACOPY(Op, DAG); 2265 case ISD::VAARG: 2266 return LowerVAARG(Op, DAG); 2267 case ISD::ADDC: 2268 case ISD::ADDE: 2269 case ISD::SUBC: 2270 case ISD::SUBE: 2271 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 2272 case ISD::SADDO: 2273 case ISD::UADDO: 2274 case ISD::SSUBO: 2275 case ISD::USUBO: 2276 case ISD::SMULO: 2277 case ISD::UMULO: 2278 return LowerXALUO(Op, DAG); 2279 case ISD::FADD: 2280 return LowerF128Call(Op, DAG, RTLIB::ADD_F128); 2281 case ISD::FSUB: 2282 return LowerF128Call(Op, DAG, RTLIB::SUB_F128); 2283 case ISD::FMUL: 2284 return LowerF128Call(Op, DAG, RTLIB::MUL_F128); 2285 case ISD::FDIV: 2286 return LowerF128Call(Op, DAG, RTLIB::DIV_F128); 2287 case ISD::FP_ROUND: 2288 return LowerFP_ROUND(Op, DAG); 2289 case ISD::FP_EXTEND: 2290 return LowerFP_EXTEND(Op, DAG); 2291 case ISD::FRAMEADDR: 2292 return LowerFRAMEADDR(Op, DAG); 2293 case ISD::RETURNADDR: 2294 return LowerRETURNADDR(Op, DAG); 2295 case ISD::INSERT_VECTOR_ELT: 2296 return LowerINSERT_VECTOR_ELT(Op, DAG); 2297 case ISD::EXTRACT_VECTOR_ELT: 2298 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2299 case ISD::BUILD_VECTOR: 2300 return LowerBUILD_VECTOR(Op, DAG); 2301 case ISD::VECTOR_SHUFFLE: 2302 return LowerVECTOR_SHUFFLE(Op, DAG); 2303 case ISD::EXTRACT_SUBVECTOR: 2304 return LowerEXTRACT_SUBVECTOR(Op, DAG); 2305 case ISD::SRA: 2306 case ISD::SRL: 2307 case ISD::SHL: 2308 return LowerVectorSRA_SRL_SHL(Op, DAG); 2309 case ISD::SHL_PARTS: 2310 return LowerShiftLeftParts(Op, DAG); 2311 case ISD::SRL_PARTS: 2312 case ISD::SRA_PARTS: 2313 return LowerShiftRightParts(Op, DAG); 2314 case ISD::CTPOP: 2315 return LowerCTPOP(Op, DAG); 2316 case ISD::FCOPYSIGN: 2317 return LowerFCOPYSIGN(Op, DAG); 2318 case ISD::AND: 2319 return LowerVectorAND(Op, DAG); 2320 case ISD::OR: 2321 return LowerVectorOR(Op, DAG); 2322 case ISD::XOR: 2323 return LowerXOR(Op, DAG); 2324 case 
ISD::PREFETCH: 2325 return LowerPREFETCH(Op, DAG); 2326 case ISD::SINT_TO_FP: 2327 case ISD::UINT_TO_FP: 2328 return LowerINT_TO_FP(Op, DAG); 2329 case ISD::FP_TO_SINT: 2330 case ISD::FP_TO_UINT: 2331 return LowerFP_TO_INT(Op, DAG); 2332 case ISD::FSINCOS: 2333 return LowerFSINCOS(Op, DAG); 2334 case ISD::MUL: 2335 return LowerMUL(Op, DAG); 2336 case ISD::INTRINSIC_WO_CHAIN: 2337 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 2338 } 2339 } 2340 2341 //===----------------------------------------------------------------------===// 2342 // Calling Convention Implementation 2343 //===----------------------------------------------------------------------===// 2344 2345 #include "AArch64GenCallingConv.inc" 2346 2347 /// Selects the correct CCAssignFn for a given CallingConvention value. 2348 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 2349 bool IsVarArg) const { 2350 switch (CC) { 2351 default: 2352 llvm_unreachable("Unsupported calling convention."); 2353 case CallingConv::WebKit_JS: 2354 return CC_AArch64_WebKit_JS; 2355 case CallingConv::GHC: 2356 return CC_AArch64_GHC; 2357 case CallingConv::C: 2358 case CallingConv::Fast: 2359 if (!Subtarget->isTargetDarwin()) 2360 return CC_AArch64_AAPCS; 2361 return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; 2362 } 2363 } 2364 2365 SDValue AArch64TargetLowering::LowerFormalArguments( 2366 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2367 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 2368 SmallVectorImpl<SDValue> &InVals) const { 2369 MachineFunction &MF = DAG.getMachineFunction(); 2370 MachineFrameInfo *MFI = MF.getFrameInfo(); 2371 2372 // Assign locations to all of the incoming arguments. 2373 SmallVector<CCValAssign, 16> ArgLocs; 2374 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2375 *DAG.getContext()); 2376 2377 // At this point, Ins[].VT may already be promoted to i32. To correctly 2378 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2379 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2380 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 2381 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 2382 // LocVT. 2383 unsigned NumArgs = Ins.size(); 2384 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 2385 unsigned CurArgIdx = 0; 2386 for (unsigned i = 0; i != NumArgs; ++i) { 2387 MVT ValVT = Ins[i].VT; 2388 if (Ins[i].isOrigArg()) { 2389 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); 2390 CurArgIdx = Ins[i].getOrigArgIndex(); 2391 2392 // Get type of the original argument. 2393 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), 2394 /*AllowUnknown*/ true); 2395 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; 2396 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
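// E.g. an i8 argument is analysed as i8 here so that a stack-passed i8 keeps
// its byte-sized slot (this matters in particular for the Darwin PCS, where
// small stack arguments are not widened to 8 bytes).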
2397 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 2398 ValVT = MVT::i8; 2399 else if (ActualMVT == MVT::i16) 2400 ValVT = MVT::i16; 2401 } 2402 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2403 bool Res = 2404 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 2405 assert(!Res && "Call operand has unhandled type"); 2406 (void)Res; 2407 } 2408 assert(ArgLocs.size() == Ins.size()); 2409 SmallVector<SDValue, 16> ArgValues; 2410 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2411 CCValAssign &VA = ArgLocs[i]; 2412 2413 if (Ins[i].Flags.isByVal()) { 2414 // Byval is used for HFAs in the PCS, but the system should work in a 2415 // non-compliant manner for larger structs. 2416 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2417 int Size = Ins[i].Flags.getByValSize(); 2418 unsigned NumRegs = (Size + 7) / 8; 2419 2420 // FIXME: This works on big-endian for composite byvals, which are the common 2421 // case. It should also work for fundamental types too. 2422 unsigned FrameIdx = 2423 MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 2424 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); 2425 InVals.push_back(FrameIdxN); 2426 2427 continue; 2428 } 2429 2430 if (VA.isRegLoc()) { 2431 // Arguments stored in registers. 2432 EVT RegVT = VA.getLocVT(); 2433 2434 SDValue ArgValue; 2435 const TargetRegisterClass *RC; 2436 2437 if (RegVT == MVT::i32) 2438 RC = &AArch64::GPR32RegClass; 2439 else if (RegVT == MVT::i64) 2440 RC = &AArch64::GPR64RegClass; 2441 else if (RegVT == MVT::f16) 2442 RC = &AArch64::FPR16RegClass; 2443 else if (RegVT == MVT::f32) 2444 RC = &AArch64::FPR32RegClass; 2445 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 2446 RC = &AArch64::FPR64RegClass; 2447 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 2448 RC = &AArch64::FPR128RegClass; 2449 else 2450 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 2451 2452 // Transform the arguments in physical registers into virtual ones. 2453 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2454 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 2455 2456 // If this is an 8, 16 or 32-bit value, it is really passed promoted 2457 // to 64 bits. Insert an assert[sz]ext to capture this, then 2458 // truncate to the right size. 2459 switch (VA.getLocInfo()) { 2460 default: 2461 llvm_unreachable("Unknown loc info!"); 2462 case CCValAssign::Full: 2463 break; 2464 case CCValAssign::BCvt: 2465 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 2466 break; 2467 case CCValAssign::AExt: 2468 case CCValAssign::SExt: 2469 case CCValAssign::ZExt: 2470 // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt 2471 // nodes after our lowering. 2472 assert(RegVT == Ins[i].VT && "incorrect register location selected"); 2473 break; 2474 } 2475 2476 InVals.push_back(ArgValue); 2477 2478 } else { // VA.isRegLoc() 2479 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 2480 unsigned ArgOffset = VA.getLocMemOffset(); 2481 unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; 2482 2483 uint32_t BEAlign = 0; 2484 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 2485 !Ins[i].Flags.isInConsecutiveRegs()) 2486 BEAlign = 8 - ArgSize; 2487 2488 int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 2489 2490 // Create load nodes to retrieve arguments from the stack. 
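// The extension type chosen below must match how the value was promoted by the
// caller, e.g. a sign-extended i8 argument is read back with a SEXTLOAD of its
// stack slot.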
2491 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 2492 SDValue ArgValue; 2493 2494 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 2495 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 2496 MVT MemVT = VA.getValVT(); 2497 2498 switch (VA.getLocInfo()) { 2499 default: 2500 break; 2501 case CCValAssign::BCvt: 2502 MemVT = VA.getLocVT(); 2503 break; 2504 case CCValAssign::SExt: 2505 ExtType = ISD::SEXTLOAD; 2506 break; 2507 case CCValAssign::ZExt: 2508 ExtType = ISD::ZEXTLOAD; 2509 break; 2510 case CCValAssign::AExt: 2511 ExtType = ISD::EXTLOAD; 2512 break; 2513 } 2514 2515 ArgValue = DAG.getExtLoad( 2516 ExtType, DL, VA.getLocVT(), Chain, FIN, 2517 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 2518 MemVT, false, false, false, 0); 2519 2520 InVals.push_back(ArgValue); 2521 } 2522 } 2523 2524 // varargs 2525 if (isVarArg) { 2526 if (!Subtarget->isTargetDarwin()) { 2527 // The AAPCS variadic function ABI is identical to the non-variadic 2528 // one. As a result there may be more arguments in registers and we should 2529 // save them for future reference. 2530 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 2531 } 2532 2533 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 2534 // This will point to the next argument passed via stack. 2535 unsigned StackOffset = CCInfo.getNextStackOffset(); 2536 // We currently pass all varargs at 8-byte alignment. 2537 StackOffset = ((StackOffset + 7) & ~7); 2538 AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); 2539 } 2540 2541 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2542 unsigned StackArgSize = CCInfo.getNextStackOffset(); 2543 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2544 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 2545 // This is a non-standard ABI so by fiat I say we're allowed to make full 2546 // use of the stack area to be popped, which must be aligned to 16 bytes in 2547 // any case: 2548 StackArgSize = RoundUpToAlignment(StackArgSize, 16); 2549 2550 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 2551 // a multiple of 16. 2552 FuncInfo->setArgumentStackToRestore(StackArgSize); 2553 2554 // This realignment carries over to the available bytes below. Our own 2555 // callers will guarantee the space is free by giving an aligned value to 2556 // CALLSEQ_START. 2557 } 2558 // Even if we're not expected to free up the space, it's useful to know how 2559 // much is there while considering tail calls (because we can reuse it). 
2560 FuncInfo->setBytesInStackArgArea(StackArgSize); 2561 2562 return Chain; 2563 } 2564 2565 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 2566 SelectionDAG &DAG, SDLoc DL, 2567 SDValue &Chain) const { 2568 MachineFunction &MF = DAG.getMachineFunction(); 2569 MachineFrameInfo *MFI = MF.getFrameInfo(); 2570 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2571 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2572 2573 SmallVector<SDValue, 8> MemOps; 2574 2575 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 2576 AArch64::X3, AArch64::X4, AArch64::X5, 2577 AArch64::X6, AArch64::X7 }; 2578 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); 2579 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); 2580 2581 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 2582 int GPRIdx = 0; 2583 if (GPRSaveSize != 0) { 2584 GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); 2585 2586 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); 2587 2588 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 2589 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 2590 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 2591 SDValue Store = DAG.getStore( 2592 Val.getValue(1), DL, Val, FIN, 2593 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, 2594 false, 0); 2595 MemOps.push_back(Store); 2596 FIN = 2597 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); 2598 } 2599 } 2600 FuncInfo->setVarArgsGPRIndex(GPRIdx); 2601 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 2602 2603 if (Subtarget->hasFPARMv8()) { 2604 static const MCPhysReg FPRArgRegs[] = { 2605 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 2606 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 2607 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); 2608 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); 2609 2610 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 2611 int FPRIdx = 0; 2612 if (FPRSaveSize != 0) { 2613 FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); 2614 2615 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); 2616 2617 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 2618 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 2619 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 2620 2621 SDValue Store = DAG.getStore( 2622 Val.getValue(1), DL, Val, FIN, 2623 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), 2624 false, false, 0); 2625 MemOps.push_back(Store); 2626 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, 2627 DAG.getConstant(16, DL, PtrVT)); 2628 } 2629 } 2630 FuncInfo->setVarArgsFPRIndex(FPRIdx); 2631 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 2632 } 2633 2634 if (!MemOps.empty()) { 2635 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 2636 } 2637 } 2638 2639 /// LowerCallResult - Lower the result values of a call into the 2640 /// appropriate copies out of appropriate physical registers. 2641 SDValue AArch64TargetLowering::LowerCallResult( 2642 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2643 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 2644 SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2645 SDValue ThisVal) const { 2646 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 2647 ? 
RetCC_AArch64_WebKit_JS 2648 : RetCC_AArch64_AAPCS; 2649 // Assign locations to each value returned by this call. 2650 SmallVector<CCValAssign, 16> RVLocs; 2651 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2652 *DAG.getContext()); 2653 CCInfo.AnalyzeCallResult(Ins, RetCC); 2654 2655 // Copy all of the result registers out of their specified physreg. 2656 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2657 CCValAssign VA = RVLocs[i]; 2658 2659 // Pass 'this' value directly from the argument to return value, to avoid 2660 // reg unit interference 2661 if (i == 0 && isThisReturn) { 2662 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 2663 "unexpected return calling convention register assignment"); 2664 InVals.push_back(ThisVal); 2665 continue; 2666 } 2667 2668 SDValue Val = 2669 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 2670 Chain = Val.getValue(1); 2671 InFlag = Val.getValue(2); 2672 2673 switch (VA.getLocInfo()) { 2674 default: 2675 llvm_unreachable("Unknown loc info!"); 2676 case CCValAssign::Full: 2677 break; 2678 case CCValAssign::BCvt: 2679 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 2680 break; 2681 } 2682 2683 InVals.push_back(Val); 2684 } 2685 2686 return Chain; 2687 } 2688 2689 bool AArch64TargetLowering::isEligibleForTailCallOptimization( 2690 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2691 bool isCalleeStructRet, bool isCallerStructRet, 2692 const SmallVectorImpl<ISD::OutputArg> &Outs, 2693 const SmallVectorImpl<SDValue> &OutVals, 2694 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 2695 // For CallingConv::C this function knows whether the ABI needs 2696 // changing. That's not true for other conventions so they will have to opt in 2697 // manually. 2698 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 2699 return false; 2700 2701 const MachineFunction &MF = DAG.getMachineFunction(); 2702 const Function *CallerF = MF.getFunction(); 2703 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2704 bool CCMatch = CallerCC == CalleeCC; 2705 2706 // Byval parameters hand the function a pointer directly into the stack area 2707 // we want to reuse during a tail call. Working around this *is* possible (see 2708 // X86) but less efficient and uglier in LowerCall. 2709 for (Function::const_arg_iterator i = CallerF->arg_begin(), 2710 e = CallerF->arg_end(); 2711 i != e; ++i) 2712 if (i->hasByValAttr()) 2713 return false; 2714 2715 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2716 if (IsTailCallConvention(CalleeCC) && CCMatch) 2717 return true; 2718 return false; 2719 } 2720 2721 // Externally-defined functions with weak linkage should not be 2722 // tail-called on AArch64 when the OS does not support dynamic 2723 // pre-emption of symbols, as the AAELF spec requires normal calls 2724 // to undefined weak functions to be replaced with a NOP or jump to the 2725 // next instruction. The behaviour of branch instructions in this 2726 // situation (as used for tail calls) is implementation-defined, so we 2727 // cannot rely on the linker replacing the tail call with a return. 
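// Concretely: the linker may rewrite a "bl weak_fn" to an undefined weak
// symbol into a NOP, but a tail-call "b weak_fn" has no architecturally
// defined replacement, so we refuse to tail-call external weak symbols here.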
2728 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2729 const GlobalValue *GV = G->getGlobal(); 2730 const Triple &TT = getTargetMachine().getTargetTriple(); 2731 if (GV->hasExternalWeakLinkage() && 2732 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2733 return false; 2734 } 2735 2736 // Now we search for cases where we can use a tail call without changing the 2737 // ABI. Sibcall is used in some places (particularly gcc) to refer to this 2738 // concept. 2739 2740 // I want anyone implementing a new calling convention to think long and hard 2741 // about this assert. 2742 assert((!isVarArg || CalleeCC == CallingConv::C) && 2743 "Unexpected variadic calling convention"); 2744 2745 if (isVarArg && !Outs.empty()) { 2746 // At least two cases here: if caller is fastcc then we can't have any 2747 // memory arguments (we'd be expected to clean up the stack afterwards). If 2748 // caller is C then we could potentially use its argument area. 2749 2750 // FIXME: for now we take the most conservative of these in both cases: 2751 // disallow all variadic memory operands. 2752 SmallVector<CCValAssign, 16> ArgLocs; 2753 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 2754 *DAG.getContext()); 2755 2756 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); 2757 for (const CCValAssign &ArgLoc : ArgLocs) 2758 if (!ArgLoc.isRegLoc()) 2759 return false; 2760 } 2761 2762 // If the calling conventions do not match, then we'd better make sure the 2763 // results are returned in the same way as what the caller expects. 2764 if (!CCMatch) { 2765 SmallVector<CCValAssign, 16> RVLocs1; 2766 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, 2767 *DAG.getContext()); 2768 CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); 2769 2770 SmallVector<CCValAssign, 16> RVLocs2; 2771 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, 2772 *DAG.getContext()); 2773 CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); 2774 2775 if (RVLocs1.size() != RVLocs2.size()) 2776 return false; 2777 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2778 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2779 return false; 2780 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2781 return false; 2782 if (RVLocs1[i].isRegLoc()) { 2783 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2784 return false; 2785 } else { 2786 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2787 return false; 2788 } 2789 } 2790 } 2791 2792 // Nothing more to check if the callee is taking no arguments 2793 if (Outs.empty()) 2794 return true; 2795 2796 SmallVector<CCValAssign, 16> ArgLocs; 2797 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 2798 *DAG.getContext()); 2799 2800 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2801 2802 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2803 2804 // If the stack arguments for this call would fit into our own save area then 2805 // the call can be made tail. 
2806 return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); 2807 } 2808 2809 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 2810 SelectionDAG &DAG, 2811 MachineFrameInfo *MFI, 2812 int ClobberedFI) const { 2813 SmallVector<SDValue, 8> ArgChains; 2814 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); 2815 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; 2816 2817 // Include the original chain at the beginning of the list. When this is 2818 // used by target LowerCall hooks, this helps legalize find the 2819 // CALLSEQ_BEGIN node. 2820 ArgChains.push_back(Chain); 2821 2822 // Add a chain value for each stack argument corresponding 2823 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 2824 UE = DAG.getEntryNode().getNode()->use_end(); 2825 U != UE; ++U) 2826 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 2827 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 2828 if (FI->getIndex() < 0) { 2829 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); 2830 int64_t InLastByte = InFirstByte; 2831 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; 2832 2833 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 2834 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 2835 ArgChains.push_back(SDValue(L, 1)); 2836 } 2837 2838 // Build a tokenfactor for all the chains. 2839 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 2840 } 2841 2842 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 2843 bool TailCallOpt) const { 2844 return CallCC == CallingConv::Fast && TailCallOpt; 2845 } 2846 2847 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { 2848 return CallCC == CallingConv::Fast; 2849 } 2850 2851 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, 2852 /// and add input and output parameter nodes. 2853 SDValue 2854 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 2855 SmallVectorImpl<SDValue> &InVals) const { 2856 SelectionDAG &DAG = CLI.DAG; 2857 SDLoc &DL = CLI.DL; 2858 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 2859 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 2860 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 2861 SDValue Chain = CLI.Chain; 2862 SDValue Callee = CLI.Callee; 2863 bool &IsTailCall = CLI.IsTailCall; 2864 CallingConv::ID CallConv = CLI.CallConv; 2865 bool IsVarArg = CLI.IsVarArg; 2866 2867 MachineFunction &MF = DAG.getMachineFunction(); 2868 bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 2869 bool IsThisReturn = false; 2870 2871 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2872 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2873 bool IsSibCall = false; 2874 2875 if (IsTailCall) { 2876 // Check if it's really possible to do a tail call. 
2877 IsTailCall = isEligibleForTailCallOptimization( 2878 Callee, CallConv, IsVarArg, IsStructRet, 2879 MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); 2880 if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) 2881 report_fatal_error("failed to perform tail call elimination on a call " 2882 "site marked musttail"); 2883 2884 // A sibling call is one where we're under the usual C ABI and not planning 2885 // to change that but can still do a tail call: 2886 if (!TailCallOpt && IsTailCall) 2887 IsSibCall = true; 2888 2889 if (IsTailCall) 2890 ++NumTailCalls; 2891 } 2892 2893 // Analyze operands of the call, assigning locations to each operand. 2894 SmallVector<CCValAssign, 16> ArgLocs; 2895 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, 2896 *DAG.getContext()); 2897 2898 if (IsVarArg) { 2899 // Handle fixed and variable vector arguments differently. 2900 // Variable vector arguments always go into memory. 2901 unsigned NumArgs = Outs.size(); 2902 2903 for (unsigned i = 0; i != NumArgs; ++i) { 2904 MVT ArgVT = Outs[i].VT; 2905 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 2906 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, 2907 /*IsVarArg=*/ !Outs[i].IsFixed); 2908 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); 2909 assert(!Res && "Call operand has unhandled type"); 2910 (void)Res; 2911 } 2912 } else { 2913 // At this point, Outs[].VT may already be promoted to i32. To correctly 2914 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2915 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2916 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here 2917 // we use a special version of AnalyzeCallOperands to pass in ValVT and 2918 // LocVT. 2919 unsigned NumArgs = Outs.size(); 2920 for (unsigned i = 0; i != NumArgs; ++i) { 2921 MVT ValVT = Outs[i].VT; 2922 // Get type of the original argument. 2923 EVT ActualVT = getValueType(DAG.getDataLayout(), 2924 CLI.getArgs()[Outs[i].OrigArgIndex].Ty, 2925 /*AllowUnknown*/ true); 2926 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; 2927 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 2928 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 2929 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 2930 ValVT = MVT::i8; 2931 else if (ActualMVT == MVT::i16) 2932 ValVT = MVT::i16; 2933 2934 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2935 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); 2936 assert(!Res && "Call operand has unhandled type"); 2937 (void)Res; 2938 } 2939 } 2940 2941 // Get a count of how many bytes are to be pushed on the stack. 2942 unsigned NumBytes = CCInfo.getNextStackOffset(); 2943 2944 if (IsSibCall) { 2945 // Since we're not changing the ABI to make this a tail call, the memory 2946 // operands are already available in the caller's incoming argument space. 2947 NumBytes = 0; 2948 } 2949 2950 // FPDiff is the byte offset of the call's argument area from the callee's. 2951 // Stores to callee stack arguments will be placed in FixedStackSlots offset 2952 // by this amount for a tail call. In a sibling call it must be 0 because the 2953 // caller will deallocate the entire stack and the callee still expects its 2954 // arguments to begin at SP+0. Completely unused for non-tail calls. 
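// Worked example (hypothetical numbers, for illustration only): if the caller
// was entered with NumReusableBytes == 32 bytes of stack arguments and this
// tail call needs NumBytes == 48 bytes (NumBytes is rounded up to a 16-byte
// multiple below), then FPDiff = 32 - 48 = -16, i.e. the callee's argument
// area starts 16 bytes below where ours did; with NumBytes == 16 we would
// instead get FPDiff = +16 and could shrink the stack.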
2955 int FPDiff = 0; 2956 2957 if (IsTailCall && !IsSibCall) { 2958 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 2959 2960 // Since callee will pop argument stack as a tail call, we must keep the 2961 // popped size 16-byte aligned. 2962 NumBytes = RoundUpToAlignment(NumBytes, 16); 2963 2964 // FPDiff will be negative if this tail call requires more space than we 2965 // would automatically have in our incoming argument space. Positive if we 2966 // can actually shrink the stack. 2967 FPDiff = NumReusableBytes - NumBytes; 2968 2969 // The stack pointer must be 16-byte aligned at all times it's used for a 2970 // memory operation, which in practice means at *all* times and in 2971 // particular across call boundaries. Therefore our own arguments started at 2972 // a 16-byte aligned SP and the delta applied for the tail call should 2973 // satisfy the same constraint. 2974 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 2975 } 2976 2977 // Adjust the stack pointer for the new arguments... 2978 // These operations are automatically eliminated by the prolog/epilog pass 2979 if (!IsSibCall) 2980 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, 2981 true), 2982 DL); 2983 2984 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 2985 getPointerTy(DAG.getDataLayout())); 2986 2987 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2988 SmallVector<SDValue, 8> MemOpChains; 2989 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2990 2991 // Walk the register/memloc assignments, inserting copies/loads. 2992 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; 2993 ++i, ++realArgIdx) { 2994 CCValAssign &VA = ArgLocs[i]; 2995 SDValue Arg = OutVals[realArgIdx]; 2996 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2997 2998 // Promote the value if needed. 2999 switch (VA.getLocInfo()) { 3000 default: 3001 llvm_unreachable("Unknown loc info!"); 3002 case CCValAssign::Full: 3003 break; 3004 case CCValAssign::SExt: 3005 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3006 break; 3007 case CCValAssign::ZExt: 3008 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3009 break; 3010 case CCValAssign::AExt: 3011 if (Outs[realArgIdx].ArgVT == MVT::i1) { 3012 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 3013 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 3014 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 3015 } 3016 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3017 break; 3018 case CCValAssign::BCvt: 3019 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3020 break; 3021 case CCValAssign::FPExt: 3022 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 3023 break; 3024 } 3025 3026 if (VA.isRegLoc()) { 3027 if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { 3028 assert(VA.getLocVT() == MVT::i64 && 3029 "unexpected calling convention register assignment"); 3030 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 3031 "unexpected use of 'returned'"); 3032 IsThisReturn = true; 3033 } 3034 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 3035 } else { 3036 assert(VA.isMemLoc()); 3037 3038 SDValue DstAddr; 3039 MachinePointerInfo DstInfo; 3040 3041 // FIXME: This works on big-endian for composite byvals, which are the 3042 // common case. It should also work for fundamental types too. 3043 uint32_t BEAlign = 0; 3044 unsigned OpSize = Flags.isByVal() ? 
Flags.getByValSize() * 8 3045 : VA.getValVT().getSizeInBits(); 3046 OpSize = (OpSize + 7) / 8; 3047 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 3048 !Flags.isInConsecutiveRegs()) { 3049 if (OpSize < 8) 3050 BEAlign = 8 - OpSize; 3051 } 3052 unsigned LocMemOffset = VA.getLocMemOffset(); 3053 int32_t Offset = LocMemOffset + BEAlign; 3054 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3055 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3056 3057 if (IsTailCall) { 3058 Offset = Offset + FPDiff; 3059 int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 3060 3061 DstAddr = DAG.getFrameIndex(FI, PtrVT); 3062 DstInfo = 3063 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 3064 3065 // Make sure any stack arguments overlapping with where we're storing 3066 // are loaded before this eventual operation. Otherwise they'll be 3067 // clobbered. 3068 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 3069 } else { 3070 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3071 3072 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3073 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), 3074 LocMemOffset); 3075 } 3076 3077 if (Outs[i].Flags.isByVal()) { 3078 SDValue SizeNode = 3079 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 3080 SDValue Cpy = DAG.getMemcpy( 3081 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), 3082 /*isVol = */ false, /*AlwaysInline = */ false, 3083 /*isTailCall = */ false, 3084 DstInfo, MachinePointerInfo()); 3085 3086 MemOpChains.push_back(Cpy); 3087 } else { 3088 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 3089 // promoted to a legal register type i32, we should truncate Arg back to 3090 // i1/i8/i16. 3091 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 3092 VA.getValVT() == MVT::i16) 3093 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 3094 3095 SDValue Store = 3096 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); 3097 MemOpChains.push_back(Store); 3098 } 3099 } 3100 } 3101 3102 if (!MemOpChains.empty()) 3103 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 3104 3105 // Build a sequence of copy-to-reg nodes chained together with token chain 3106 // and flag operands which copy the outgoing args into the appropriate regs. 3107 SDValue InFlag; 3108 for (auto &RegToPass : RegsToPass) { 3109 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 3110 RegToPass.second, InFlag); 3111 InFlag = Chain.getValue(1); 3112 } 3113 3114 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 3115 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 3116 // node so that legalize doesn't hack it. 
3117 if (getTargetMachine().getCodeModel() == CodeModel::Large && 3118 Subtarget->isTargetMachO()) { 3119 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3120 const GlobalValue *GV = G->getGlobal(); 3121 bool InternalLinkage = GV->hasInternalLinkage(); 3122 if (InternalLinkage) 3123 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 3124 else { 3125 Callee = 3126 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); 3127 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 3128 } 3129 } else if (ExternalSymbolSDNode *S = 3130 dyn_cast<ExternalSymbolSDNode>(Callee)) { 3131 const char *Sym = S->getSymbol(); 3132 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); 3133 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 3134 } 3135 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3136 const GlobalValue *GV = G->getGlobal(); 3137 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 3138 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 3139 const char *Sym = S->getSymbol(); 3140 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); 3141 } 3142 3143 // We don't usually want to end the call-sequence here because we would tidy 3144 // the frame up *after* the call; however, in the ABI-changing tail-call case 3145 // we've carefully laid out the parameters so that when sp is reset they'll be 3146 // in the correct location. 3147 if (IsTailCall && !IsSibCall) { 3148 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), 3149 DAG.getIntPtrConstant(0, DL, true), InFlag, DL); 3150 InFlag = Chain.getValue(1); 3151 } 3152 3153 std::vector<SDValue> Ops; 3154 Ops.push_back(Chain); 3155 Ops.push_back(Callee); 3156 3157 if (IsTailCall) { 3158 // Each tail call may have to adjust the stack by a different amount, so 3159 // this information must travel along with the operation for eventual 3160 // consumption by emitEpilogue. 3161 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); 3162 } 3163 3164 // Add argument registers to the end of the list so that they are known live 3165 // into the call. 3166 for (auto &RegToPass : RegsToPass) 3167 Ops.push_back(DAG.getRegister(RegToPass.first, 3168 RegToPass.second.getValueType())); 3169 3170 // Add a register mask operand representing the call-preserved registers. 3171 const uint32_t *Mask; 3172 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 3173 if (IsThisReturn) { 3174 // For 'this' returns, use the X0-preserving mask if applicable 3175 Mask = TRI->getThisReturnPreservedMask(MF, CallConv); 3176 if (!Mask) { 3177 IsThisReturn = false; 3178 Mask = TRI->getCallPreservedMask(MF, CallConv); 3179 } 3180 } else 3181 Mask = TRI->getCallPreservedMask(MF, CallConv); 3182 3183 assert(Mask && "Missing call preserved mask for calling convention"); 3184 Ops.push_back(DAG.getRegisterMask(Mask)); 3185 3186 if (InFlag.getNode()) 3187 Ops.push_back(InFlag); 3188 3189 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3190 3191 // If we're doing a tail call, use a TC_RETURN here rather than an 3192 // actual call instruction. 3193 if (IsTailCall) { 3194 MF.getFrameInfo()->setHasTailCall(); 3195 return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); 3196 } 3197 3198 // Returns a chain and a flag for retval copy to use.
3199 Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); 3200 InFlag = Chain.getValue(1); 3201 3202 uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) 3203 ? RoundUpToAlignment(NumBytes, 16) 3204 : 0; 3205 3206 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), 3207 DAG.getIntPtrConstant(CalleePopBytes, DL, true), 3208 InFlag, DL); 3209 if (!Ins.empty()) 3210 InFlag = Chain.getValue(1); 3211 3212 // Handle result values, copying them out of physregs into vregs that we 3213 // return. 3214 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, 3215 InVals, IsThisReturn, 3216 IsThisReturn ? OutVals[0] : SDValue()); 3217 } 3218 3219 bool AArch64TargetLowering::CanLowerReturn( 3220 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, 3221 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { 3222 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 3223 ? RetCC_AArch64_WebKit_JS 3224 : RetCC_AArch64_AAPCS; 3225 SmallVector<CCValAssign, 16> RVLocs; 3226 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 3227 return CCInfo.CheckReturn(Outs, RetCC); 3228 } 3229 3230 SDValue 3231 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3232 bool isVarArg, 3233 const SmallVectorImpl<ISD::OutputArg> &Outs, 3234 const SmallVectorImpl<SDValue> &OutVals, 3235 SDLoc DL, SelectionDAG &DAG) const { 3236 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 3237 ? RetCC_AArch64_WebKit_JS 3238 : RetCC_AArch64_AAPCS; 3239 SmallVector<CCValAssign, 16> RVLocs; 3240 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3241 *DAG.getContext()); 3242 CCInfo.AnalyzeReturn(Outs, RetCC); 3243 3244 // Copy the result values into the output registers. 3245 SDValue Flag; 3246 SmallVector<SDValue, 4> RetOps(1, Chain); 3247 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); 3248 ++i, ++realRVLocIdx) { 3249 CCValAssign &VA = RVLocs[i]; 3250 assert(VA.isRegLoc() && "Can only return in registers!"); 3251 SDValue Arg = OutVals[realRVLocIdx]; 3252 3253 switch (VA.getLocInfo()) { 3254 default: 3255 llvm_unreachable("Unknown loc info!"); 3256 case CCValAssign::Full: 3257 if (Outs[i].ArgVT == MVT::i1) { 3258 // AAPCS requires i1 to be zero-extended to i8 by the producer of the 3259 // value. This is strictly redundant on Darwin (which uses "zeroext 3260 // i1"), but will be optimised out before ISel. 3261 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 3262 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3263 } 3264 break; 3265 case CCValAssign::BCvt: 3266 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3267 break; 3268 } 3269 3270 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); 3271 Flag = Chain.getValue(1); 3272 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3273 } 3274 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 3275 const MCPhysReg *I = 3276 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3277 if (I) { 3278 for (; *I; ++I) { 3279 if (AArch64::GPR64RegClass.contains(*I)) 3280 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 3281 else if (AArch64::FPR64RegClass.contains(*I)) 3282 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 3283 else 3284 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3285 } 3286 } 3287 3288 RetOps[0] = Chain; // Update chain. 3289 3290 // Add the flag if we have it. 
3291 if (Flag.getNode()) 3292 RetOps.push_back(Flag); 3293 3294 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); 3295 } 3296 3297 //===----------------------------------------------------------------------===// 3298 // Other Lowering Code 3299 //===----------------------------------------------------------------------===// 3300 3301 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, 3302 SelectionDAG &DAG) const { 3303 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3304 SDLoc DL(Op); 3305 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 3306 const GlobalValue *GV = GN->getGlobal(); 3307 unsigned char OpFlags = 3308 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 3309 3310 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && 3311 "unexpected offset in global node"); 3312 3313 // This also catches the large code model case for Darwin. 3314 if ((OpFlags & AArch64II::MO_GOT) != 0) { 3315 SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); 3316 // FIXME: Once remat is capable of dealing with instructions with register 3317 // operands, expand this into two nodes instead of using a wrapper node. 3318 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); 3319 } 3320 3321 if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { 3322 assert(getTargetMachine().getCodeModel() == CodeModel::Small && 3323 "use of MO_CONSTPOOL only supported on small model"); 3324 SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); 3325 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 3326 unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; 3327 SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); 3328 SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 3329 SDValue GlobalAddr = DAG.getLoad( 3330 PtrVT, DL, DAG.getEntryNode(), PoolAddr, 3331 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 3332 /*isVolatile=*/false, 3333 /*isNonTemporal=*/true, 3334 /*isInvariant=*/true, 8); 3335 if (GN->getOffset() != 0) 3336 return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, 3337 DAG.getConstant(GN->getOffset(), DL, PtrVT)); 3338 return GlobalAddr; 3339 } 3340 3341 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 3342 const unsigned char MO_NC = AArch64II::MO_NC; 3343 return DAG.getNode( 3344 AArch64ISD::WrapperLarge, DL, PtrVT, 3345 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), 3346 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), 3347 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), 3348 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); 3349 } else { 3350 // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and 3351 // the only correct model on Darwin. 3352 SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 3353 OpFlags | AArch64II::MO_PAGE); 3354 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; 3355 SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); 3356 3357 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 3358 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 3359 } 3360 } 3361 3362 /// \brief Convert a TLS address reference into the correct sequence of loads 3363 /// and calls to compute the variable's address (for Darwin, currently) and 3364 /// return an SDValue containing the final node.
3365 3366 /// Darwin only has one TLS scheme which must be capable of dealing with the 3367 /// fully general situation, in the worst case. This means: 3368 /// + "extern __thread" declaration. 3369 /// + Defined in a possibly unknown dynamic library. 3370 /// 3371 /// The general system is that each __thread variable has a [3 x i64] descriptor 3372 /// which contains information used by the runtime to calculate the address. The 3373 /// only part of this the compiler needs to know about is the first xword, which 3374 /// contains a function pointer that must be called with the address of the 3375 /// entire descriptor in "x0". 3376 /// 3377 /// Since this descriptor may be in a different unit, in general even the 3378 /// descriptor must be accessed via an indirect load. The "ideal" code sequence 3379 /// is: 3380 /// adrp x0, _var@TLVPPAGE 3381 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor 3382 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, 3383 /// ; the function pointer 3384 /// blr x1 ; Uses descriptor address in x0 3385 /// ; Address of _var is now in x0. 3386 /// 3387 /// If the address of _var's descriptor *is* known to the linker, then it can 3388 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 3389 /// a slight efficiency gain. 3390 SDValue 3391 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 3392 SelectionDAG &DAG) const { 3393 assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); 3394 3395 SDLoc DL(Op); 3396 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 3397 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3398 3399 SDValue TLVPAddr = 3400 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3401 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 3402 3403 // The first entry in the descriptor is a function pointer that we must call 3404 // to obtain the address of the variable. 3405 SDValue Chain = DAG.getEntryNode(); 3406 SDValue FuncTLVGet = 3407 DAG.getLoad(MVT::i64, DL, Chain, DescAddr, 3408 MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, 3409 true, true, 8); 3410 Chain = FuncTLVGet.getValue(1); 3411 3412 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3413 MFI->setAdjustsStack(true); 3414 3415 // TLS calls preserve all registers except those that absolutely must be 3416 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3417 // silly). 3418 const uint32_t *Mask = 3419 Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); 3420 3421 // Finally, we can make the call. This is just a degenerate version of a 3422 // normal AArch64 call node: x0 takes the address of the descriptor, and 3423 // returns the address of the variable in this thread. 3424 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 3425 Chain = 3426 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3427 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 3428 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3429 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 3430 } 3431 3432 /// When accessing thread-local variables under either the general-dynamic or 3433 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will 3434 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 3435 /// is a function pointer to carry out the resolution. 
3436 /// 3437 /// The sequence is: 3438 /// adrp x0, :tlsdesc:var 3439 /// ldr x1, [x0, #:tlsdesc_lo12:var] 3440 /// add x0, x0, #:tlsdesc_lo12:var 3441 /// .tlsdesccall var 3442 /// blr x1 3443 /// (TPIDR_EL0 offset now in x0) 3444 /// 3445 /// The above sequence must be produced unscheduled, to enable the linker to 3446 /// optimize/relax this sequence. 3447 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 3448 /// above sequence, and expanded really late in the compilation flow, to ensure 3449 /// the sequence is produced as per above. 3450 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, 3451 SelectionDAG &DAG) const { 3452 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3453 3454 SDValue Chain = DAG.getEntryNode(); 3455 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3456 3457 SmallVector<SDValue, 2> Ops; 3458 Ops.push_back(Chain); 3459 Ops.push_back(SymAddr); 3460 3461 Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); 3462 SDValue Glue = Chain.getValue(1); 3463 3464 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 3465 } 3466 3467 SDValue 3468 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, 3469 SelectionDAG &DAG) const { 3470 assert(Subtarget->isTargetELF() && "This function expects an ELF target"); 3471 assert(getTargetMachine().getCodeModel() == CodeModel::Small && 3472 "ELF TLS only supported in small memory model"); 3473 // Different choices can be made for the maximum size of the TLS area for a 3474 // module. For the small address model, the default TLS size is 16MiB and the 3475 // maximum TLS size is 4GiB. 3476 // FIXME: add -mtls-size command line option and make it control the 16MiB 3477 // vs. 4GiB code sequence generation. 3478 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3479 3480 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 3481 3482 if (DAG.getTarget().Options.EmulatedTLS) 3483 return LowerToTLSEmulatedModel(GA, DAG); 3484 3485 if (!EnableAArch64ELFLocalDynamicTLSGeneration) { 3486 if (Model == TLSModel::LocalDynamic) 3487 Model = TLSModel::GeneralDynamic; 3488 } 3489 3490 SDValue TPOff; 3491 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3492 SDLoc DL(Op); 3493 const GlobalValue *GV = GA->getGlobal(); 3494 3495 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 3496 3497 if (Model == TLSModel::LocalExec) { 3498 SDValue HiVar = DAG.getTargetGlobalAddress( 3499 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 3500 SDValue LoVar = DAG.getTargetGlobalAddress( 3501 GV, DL, PtrVT, 0, 3502 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3503 3504 SDValue TPWithOff_lo = 3505 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 3506 HiVar, 3507 DAG.getTargetConstant(0, DL, MVT::i32)), 3508 0); 3509 SDValue TPWithOff = 3510 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, 3511 LoVar, 3512 DAG.getTargetConstant(0, DL, MVT::i32)), 3513 0); 3514 return TPWithOff; 3515 } else if (Model == TLSModel::InitialExec) { 3516 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3517 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); 3518 } else if (Model == TLSModel::LocalDynamic) { 3519 // Local-dynamic accesses proceed in two phases. 
A general-dynamic TLS 3520 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 3521 // the beginning of the module's TLS region, followed by a DTPREL offset 3522 // calculation. 3523 3524 // These accesses will need deduplicating if there's more than one. 3525 AArch64FunctionInfo *MFI = 3526 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 3527 MFI->incNumLocalDynamicTLSAccesses(); 3528 3529 // The call needs a relocation too for linker relaxation. It doesn't make 3530 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 3531 // the address. 3532 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 3533 AArch64II::MO_TLS); 3534 3535 // Now we can calculate the offset from TPIDR_EL0 to this module's 3536 // thread-local area. 3537 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 3538 3539 // Now use :dtprel_whatever: operations to calculate this variable's offset 3540 // in its thread-storage area. 3541 SDValue HiVar = DAG.getTargetGlobalAddress( 3542 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 3543 SDValue LoVar = DAG.getTargetGlobalAddress( 3544 GV, DL, MVT::i64, 0, 3545 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3546 3547 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, 3548 DAG.getTargetConstant(0, DL, MVT::i32)), 3549 0); 3550 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, 3551 DAG.getTargetConstant(0, DL, MVT::i32)), 3552 0); 3553 } else if (Model == TLSModel::GeneralDynamic) { 3554 // The call needs a relocation too for linker relaxation. It doesn't make 3555 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 3556 // the address. 3557 SDValue SymAddr = 3558 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3559 3560 // Finally we can make a call to calculate the offset from tpidr_el0. 3561 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 3562 } else 3563 llvm_unreachable("Unsupported ELF TLS access model"); 3564 3565 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 3566 } 3567 3568 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 3569 SelectionDAG &DAG) const { 3570 if (Subtarget->isTargetDarwin()) 3571 return LowerDarwinGlobalTLSAddress(Op, DAG); 3572 else if (Subtarget->isTargetELF()) 3573 return LowerELFGlobalTLSAddress(Op, DAG); 3574 3575 llvm_unreachable("Unexpected platform trying to use TLS"); 3576 } 3577 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 3578 SDValue Chain = Op.getOperand(0); 3579 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 3580 SDValue LHS = Op.getOperand(2); 3581 SDValue RHS = Op.getOperand(3); 3582 SDValue Dest = Op.getOperand(4); 3583 SDLoc dl(Op); 3584 3585 // Handle f128 first, since lowering it will result in comparing the return 3586 // value of a libcall against zero, which is just what the rest of LowerBR_CC 3587 // is expecting to deal with. 3588 if (LHS.getValueType() == MVT::f128) { 3589 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3590 3591 // If softenSetCCOperands returned a scalar, we need to compare the result 3592 // against zero to select between true and false values. 3593 if (!RHS.getNode()) { 3594 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 3595 CC = ISD::SETNE; 3596 } 3597 } 3598 3599 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 3600 // instruction. 
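// Rough sketch of the pattern handled here (the IR and assembly are
// illustrative, not taken from this source):
//   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %o = extractvalue { i32, i1 } %s, 1
//   br i1 %o, label %overflow, label %cont
// can be lowered to an ADDS that sets NZCV followed by a single B.VS to
// %overflow, rather than materializing the overflow bit and branching on a
// separate compare.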
3601 unsigned Opc = LHS.getOpcode(); 3602 if (LHS.getResNo() == 1 && isOneConstant(RHS) && 3603 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 3604 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 3605 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && 3606 "Unexpected condition code."); 3607 // Only lower legal XALUO ops. 3608 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 3609 return SDValue(); 3610 3611 // The actual operation with overflow check. 3612 AArch64CC::CondCode OFCC; 3613 SDValue Value, Overflow; 3614 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); 3615 3616 if (CC == ISD::SETNE) 3617 OFCC = getInvertedCondCode(OFCC); 3618 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); 3619 3620 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 3621 Overflow); 3622 } 3623 3624 if (LHS.getValueType().isInteger()) { 3625 assert((LHS.getValueType() == RHS.getValueType()) && 3626 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 3627 3628 // If the RHS of the comparison is zero, we can potentially fold this 3629 // to a specialized branch. 3630 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); 3631 if (RHSC && RHSC->getZExtValue() == 0) { 3632 if (CC == ISD::SETEQ) { 3633 // See if we can use a TBZ to fold in an AND as well. 3634 // TBZ has a smaller branch displacement than CBZ. If the offset is 3635 // out of bounds, a late MI-layer pass rewrites branches. 3636 // 403.gcc is an example that hits this case. 3637 if (LHS.getOpcode() == ISD::AND && 3638 isa<ConstantSDNode>(LHS.getOperand(1)) && 3639 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 3640 SDValue Test = LHS.getOperand(0); 3641 uint64_t Mask = LHS.getConstantOperandVal(1); 3642 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, 3643 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 3644 Dest); 3645 } 3646 3647 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); 3648 } else if (CC == ISD::SETNE) { 3649 // See if we can use a TBZ to fold in an AND as well. 3650 // TBZ has a smaller branch displacement than CBZ. If the offset is 3651 // out of bounds, a late MI-layer pass rewrites branches. 3652 // 403.gcc is an example that hits this case. 3653 if (LHS.getOpcode() == ISD::AND && 3654 isa<ConstantSDNode>(LHS.getOperand(1)) && 3655 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 3656 SDValue Test = LHS.getOperand(0); 3657 uint64_t Mask = LHS.getConstantOperandVal(1); 3658 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, 3659 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 3660 Dest); 3661 } 3662 3663 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); 3664 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { 3665 // Don't combine AND since emitComparison converts the AND to an ANDS 3666 // (a.k.a. TST) and the test in the test bit and branch instruction 3667 // becomes redundant. This would also increase register pressure. 3668 uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; 3669 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, 3670 DAG.getConstant(Mask, dl, MVT::i64), Dest); 3671 } 3672 } 3673 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && 3674 LHS.getOpcode() != ISD::AND) { 3675 // Don't combine AND since emitComparison converts the AND to an ANDS 3676 // (a.k.a. TST) and the test in the test bit and branch instruction 3677 // becomes redundant. This would also increase register pressure. 
3678 uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; 3679 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, 3680 DAG.getConstant(Mask, dl, MVT::i64), Dest); 3681 } 3682 3683 SDValue CCVal; 3684 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 3685 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 3686 Cmp); 3687 } 3688 3689 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 3690 3691 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 3692 // clean. Some of them require two branches to implement. 3693 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 3694 AArch64CC::CondCode CC1, CC2; 3695 changeFPCCToAArch64CC(CC, CC1, CC2); 3696 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 3697 SDValue BR1 = 3698 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); 3699 if (CC2 != AArch64CC::AL) { 3700 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 3701 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, 3702 Cmp); 3703 } 3704 3705 return BR1; 3706 } 3707 3708 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, 3709 SelectionDAG &DAG) const { 3710 EVT VT = Op.getValueType(); 3711 SDLoc DL(Op); 3712 3713 SDValue In1 = Op.getOperand(0); 3714 SDValue In2 = Op.getOperand(1); 3715 EVT SrcVT = In2.getValueType(); 3716 3717 if (SrcVT.bitsLT(VT)) 3718 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); 3719 else if (SrcVT.bitsGT(VT)) 3720 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); 3721 3722 EVT VecVT; 3723 EVT EltVT; 3724 uint64_t EltMask; 3725 SDValue VecVal1, VecVal2; 3726 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { 3727 EltVT = MVT::i32; 3728 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); 3729 EltMask = 0x80000000ULL; 3730 3731 if (!VT.isVector()) { 3732 VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, 3733 DAG.getUNDEF(VecVT), In1); 3734 VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, 3735 DAG.getUNDEF(VecVT), In2); 3736 } else { 3737 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 3738 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 3739 } 3740 } else if (VT == MVT::f64 || VT == MVT::v2f64) { 3741 EltVT = MVT::i64; 3742 VecVT = MVT::v2i64; 3743 3744 // We want to materialize a mask with the high bit set, but the AdvSIMD 3745 // immediate moves cannot materialize that in a single instruction for 3746 // 64-bit elements. Instead, materialize zero and then negate it. 3747 EltMask = 0; 3748 3749 if (!VT.isVector()) { 3750 VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, 3751 DAG.getUNDEF(VecVT), In1); 3752 VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, 3753 DAG.getUNDEF(VecVT), In2); 3754 } else { 3755 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 3756 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 3757 } 3758 } else { 3759 llvm_unreachable("Invalid type for copysign!"); 3760 } 3761 3762 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); 3763 3764 // If we couldn't materialize the mask above, then the mask vector will be 3765 // the zero vector, and we need to negate it here. 
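// Illustrative note on the trick used below: FNEG of +0.0 flips only the sign
// bit, so negating the all-zero vector as f64 lanes produces
// 0x8000000000000000 in every lane, which is exactly the sign-bit mask that a
// single AdvSIMD immediate move could not have materialized.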
3766 if (VT == MVT::f64 || VT == MVT::v2f64) { 3767 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); 3768 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); 3769 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); 3770 } 3771 3772 SDValue Sel = 3773 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); 3774 3775 if (VT == MVT::f32) 3776 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); 3777 else if (VT == MVT::f64) 3778 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); 3779 else 3780 return DAG.getNode(ISD::BITCAST, DL, VT, Sel); 3781 } 3782 3783 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { 3784 if (DAG.getMachineFunction().getFunction()->hasFnAttribute( 3785 Attribute::NoImplicitFloat)) 3786 return SDValue(); 3787 3788 if (!Subtarget->hasNEON()) 3789 return SDValue(); 3790 3791 // While there is no integer popcount instruction, it can 3792 // be more efficiently lowered to the following sequence that uses 3793 // AdvSIMD registers/instructions as long as the copies to/from 3794 // the AdvSIMD registers are cheap. 3795 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 3796 // CNT V0.8B, V0.8B // 8xbyte pop-counts 3797 // ADDV B0, V0.8B // sum 8xbyte pop-counts 3798 // UMOV X0, V0.B[0] // copy byte result back to integer reg 3799 SDValue Val = Op.getOperand(0); 3800 SDLoc DL(Op); 3801 EVT VT = Op.getValueType(); 3802 3803 if (VT == MVT::i32) 3804 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 3805 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 3806 3807 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 3808 SDValue UaddLV = DAG.getNode( 3809 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 3810 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 3811 3812 if (VT == MVT::i64) 3813 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 3814 return UaddLV; 3815 } 3816 3817 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 3818 3819 if (Op.getValueType().isVector()) 3820 return LowerVSETCC(Op, DAG); 3821 3822 SDValue LHS = Op.getOperand(0); 3823 SDValue RHS = Op.getOperand(1); 3824 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 3825 SDLoc dl(Op); 3826 3827 // We chose ZeroOrOneBooleanContents, so use zero and one. 3828 EVT VT = Op.getValueType(); 3829 SDValue TVal = DAG.getConstant(1, dl, VT); 3830 SDValue FVal = DAG.getConstant(0, dl, VT); 3831 3832 // Handle f128 first, since one possible outcome is a normal integer 3833 // comparison which gets picked up by the next if statement. 3834 if (LHS.getValueType() == MVT::f128) { 3835 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3836 3837 // If softenSetCCOperands returned a scalar, use it. 3838 if (!RHS.getNode()) { 3839 assert(LHS.getValueType() == Op.getValueType() && 3840 "Unexpected setcc expansion!"); 3841 return LHS; 3842 } 3843 } 3844 3845 if (LHS.getValueType().isInteger()) { 3846 SDValue CCVal; 3847 SDValue Cmp = 3848 getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); 3849 3850 // Note that we inverted the condition above, so we reverse the order of 3851 // the true and false operands here. This will allow the setcc to be 3852 // matched to a single CSINC instruction. 3853 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); 3854 } 3855 3856 // Now we know we're dealing with FP values. 
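// For illustration (one concrete case, not an exhaustive list): an ordered
// not-equal comparison (SETONE) has no single AArch64 condition code, so
// changeFPCCToAArch64CC hands back two codes and the lowering below emits two
// CSELs whose results are effectively OR'd together.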
3857 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 3858 3859 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead 3860 // and do the comparison. 3861 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 3862 3863 AArch64CC::CondCode CC1, CC2; 3864 changeFPCCToAArch64CC(CC, CC1, CC2); 3865 if (CC2 == AArch64CC::AL) { 3866 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); 3867 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 3868 3869 // Note that we inverted the condition above, so we reverse the order of 3870 // the true and false operands here. This will allow the setcc to be 3871 // matched to a single CSINC instruction. 3872 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); 3873 } else { 3874 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 3875 // totally clean. Some of them require two CSELs to implement. When that is 3876 // the case, we emit the first CSEL and then emit a second using the output 3877 // of the first as the RHS. We're effectively OR'ing the two CC's together. 3878 3879 // FIXME: It would be nice if we could match the two CSELs to two CSINCs. 3880 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 3881 SDValue CS1 = 3882 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 3883 3884 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 3885 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 3886 } 3887 } 3888 3889 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, 3890 SDValue RHS, SDValue TVal, 3891 SDValue FVal, SDLoc dl, 3892 SelectionDAG &DAG) const { 3893 // Handle f128 first, because it will result in a comparison of some RTLIB 3894 // call result against zero. 3895 if (LHS.getValueType() == MVT::f128) { 3896 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 3897 3898 // If softenSetCCOperands returned a scalar, we need to compare the result 3899 // against zero to select between true and false values. 3900 if (!RHS.getNode()) { 3901 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 3902 CC = ISD::SETNE; 3903 } 3904 } 3905 3906 // Also handle f16, for which we need to do an f32 comparison. 3907 if (LHS.getValueType() == MVT::f16) { 3908 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); 3909 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); 3910 } 3911 3912 // Next, handle integers. 3913 if (LHS.getValueType().isInteger()) { 3914 assert((LHS.getValueType() == RHS.getValueType()) && 3915 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 3916 3917 unsigned Opcode = AArch64ISD::CSEL; 3918 3919 // If both the TVal and the FVal are constants, see if we can swap them in 3920 // order to form a CSINV or CSINC out of them. 3921 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 3922 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 3923 3924 if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { 3925 std::swap(TVal, FVal); 3926 std::swap(CTVal, CFVal); 3927 CC = ISD::getSetCCInverse(CC, true); 3928 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { 3929 std::swap(TVal, FVal); 3930 std::swap(CTVal, CFVal); 3931 CC = ISD::getSetCCInverse(CC, true); 3932 } else if (TVal.getOpcode() == ISD::XOR) { 3933 // If TVal is a NOT we want to swap TVal and FVal so that we can match 3934 // with a CSINV rather than a CSEL.
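// Sketch of the rewrite (operand names are illustrative): select(c, ~x, y) is
// turned into select(!c, y, ~x), so the NOT ends up in the false operand where
// the csinv pattern (csinv Xd, Xn, Xm, cond computes cond ? Xn : ~Xm) can
// absorb it.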
3935 if (isAllOnesConstant(TVal.getOperand(1))) { 3936 std::swap(TVal, FVal); 3937 std::swap(CTVal, CFVal); 3938 CC = ISD::getSetCCInverse(CC, true); 3939 } 3940 } else if (TVal.getOpcode() == ISD::SUB) { 3941 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so 3942 // that we can match with a CSNEG rather than a CSEL. 3943 if (isNullConstant(TVal.getOperand(0))) { 3944 std::swap(TVal, FVal); 3945 std::swap(CTVal, CFVal); 3946 CC = ISD::getSetCCInverse(CC, true); 3947 } 3948 } else if (CTVal && CFVal) { 3949 const int64_t TrueVal = CTVal->getSExtValue(); 3950 const int64_t FalseVal = CFVal->getSExtValue(); 3951 bool Swap = false; 3952 3953 // If both TVal and FVal are constants, see if FVal is the 3954 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC 3955 // instead of a CSEL in that case. 3956 if (TrueVal == ~FalseVal) { 3957 Opcode = AArch64ISD::CSINV; 3958 } else if (TrueVal == -FalseVal) { 3959 Opcode = AArch64ISD::CSNEG; 3960 } else if (TVal.getValueType() == MVT::i32) { 3961 // If our operands are only 32-bit wide, make sure we use 32-bit 3962 // arithmetic for the check whether we can use CSINC. This ensures that 3963 // the addition in the check will wrap around properly in case there is 3964 // an overflow (which would not be the case if we do the check with 3965 // 64-bit arithmetic). 3966 const uint32_t TrueVal32 = CTVal->getZExtValue(); 3967 const uint32_t FalseVal32 = CFVal->getZExtValue(); 3968 3969 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { 3970 Opcode = AArch64ISD::CSINC; 3971 3972 if (TrueVal32 > FalseVal32) { 3973 Swap = true; 3974 } 3975 } 3976 // 64-bit check whether we can use CSINC. 3977 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { 3978 Opcode = AArch64ISD::CSINC; 3979 3980 if (TrueVal > FalseVal) { 3981 Swap = true; 3982 } 3983 } 3984 3985 // Swap TVal and FVal if necessary. 3986 if (Swap) { 3987 std::swap(TVal, FVal); 3988 std::swap(CTVal, CFVal); 3989 CC = ISD::getSetCCInverse(CC, true); 3990 } 3991 3992 if (Opcode != AArch64ISD::CSEL) { 3993 // Drop FVal since we can get its value by simply inverting/negating 3994 // TVal. 3995 FVal = TVal; 3996 } 3997 } 3998 3999 SDValue CCVal; 4000 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 4001 4002 EVT VT = TVal.getValueType(); 4003 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); 4004 } 4005 4006 // Now we know we're dealing with FP values. 4007 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 4008 assert(LHS.getValueType() == RHS.getValueType()); 4009 EVT VT = TVal.getValueType(); 4010 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 4011 4012 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 4013 // clean. Some of them require two CSELs to implement. 4014 AArch64CC::CondCode CC1, CC2; 4015 changeFPCCToAArch64CC(CC, CC1, CC2); 4016 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 4017 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 4018 4019 // If we need a second CSEL, emit it, using the output of the first as the 4020 // RHS. We're effectively OR'ing the two CC's together. 4021 if (CC2 != AArch64CC::AL) { 4022 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 4023 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 4024 } 4025 4026 // Otherwise, return the output of the first CSEL. 
4027 return CS1; 4028 } 4029 4030 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, 4031 SelectionDAG &DAG) const { 4032 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4033 SDValue LHS = Op.getOperand(0); 4034 SDValue RHS = Op.getOperand(1); 4035 SDValue TVal = Op.getOperand(2); 4036 SDValue FVal = Op.getOperand(3); 4037 SDLoc DL(Op); 4038 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4039 } 4040 4041 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, 4042 SelectionDAG &DAG) const { 4043 SDValue CCVal = Op->getOperand(0); 4044 SDValue TVal = Op->getOperand(1); 4045 SDValue FVal = Op->getOperand(2); 4046 SDLoc DL(Op); 4047 4048 unsigned Opc = CCVal.getOpcode(); 4049 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select 4050 // instruction. 4051 if (CCVal.getResNo() == 1 && 4052 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4053 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 4054 // Only lower legal XALUO ops. 4055 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) 4056 return SDValue(); 4057 4058 AArch64CC::CondCode OFCC; 4059 SDValue Value, Overflow; 4060 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); 4061 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); 4062 4063 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, 4064 CCVal, Overflow); 4065 } 4066 4067 // Lower it the same way as we would lower a SELECT_CC node. 4068 ISD::CondCode CC; 4069 SDValue LHS, RHS; 4070 if (CCVal.getOpcode() == ISD::SETCC) { 4071 LHS = CCVal.getOperand(0); 4072 RHS = CCVal.getOperand(1); 4073 CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get(); 4074 } else { 4075 LHS = CCVal; 4076 RHS = DAG.getConstant(0, DL, CCVal.getValueType()); 4077 CC = ISD::SETNE; 4078 } 4079 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4080 } 4081 4082 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, 4083 SelectionDAG &DAG) const { 4084 // Jump table entries as PC relative offsets. No additional tweaking 4085 // is necessary here. Just get the address of the jump table. 4086 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4087 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4088 SDLoc DL(Op); 4089 4090 if (getTargetMachine().getCodeModel() == CodeModel::Large && 4091 !Subtarget->isTargetMachO()) { 4092 const unsigned char MO_NC = AArch64II::MO_NC; 4093 return DAG.getNode( 4094 AArch64ISD::WrapperLarge, DL, PtrVT, 4095 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), 4096 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), 4097 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), 4098 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 4099 AArch64II::MO_G0 | MO_NC)); 4100 } 4101 4102 SDValue Hi = 4103 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); 4104 SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 4105 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4106 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4107 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4108 } 4109 4110 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, 4111 SelectionDAG &DAG) const { 4112 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4113 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4114 SDLoc DL(Op); 4115 4116 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 4117 // Use the GOT for the large code model on iOS. 
4118 if (Subtarget->isTargetMachO()) { 4119 SDValue GotAddr = DAG.getTargetConstantPool( 4120 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 4121 AArch64II::MO_GOT); 4122 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); 4123 } 4124 4125 const unsigned char MO_NC = AArch64II::MO_NC; 4126 return DAG.getNode( 4127 AArch64ISD::WrapperLarge, DL, PtrVT, 4128 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4129 CP->getOffset(), AArch64II::MO_G3), 4130 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4131 CP->getOffset(), AArch64II::MO_G2 | MO_NC), 4132 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4133 CP->getOffset(), AArch64II::MO_G1 | MO_NC), 4134 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4135 CP->getOffset(), AArch64II::MO_G0 | MO_NC)); 4136 } else { 4137 // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on 4138 // ELF, the only valid one on Darwin. 4139 SDValue Hi = 4140 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 4141 CP->getOffset(), AArch64II::MO_PAGE); 4142 SDValue Lo = DAG.getTargetConstantPool( 4143 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 4144 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4145 4146 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4147 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4148 } 4149 } 4150 4151 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 4152 SelectionDAG &DAG) const { 4153 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 4154 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4155 SDLoc DL(Op); 4156 if (getTargetMachine().getCodeModel() == CodeModel::Large && 4157 !Subtarget->isTargetMachO()) { 4158 const unsigned char MO_NC = AArch64II::MO_NC; 4159 return DAG.getNode( 4160 AArch64ISD::WrapperLarge, DL, PtrVT, 4161 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), 4162 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), 4163 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), 4164 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); 4165 } else { 4166 SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); 4167 SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | 4168 AArch64II::MO_NC); 4169 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 4170 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 4171 } 4172 } 4173 4174 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 4175 SelectionDAG &DAG) const { 4176 AArch64FunctionInfo *FuncInfo = 4177 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 4178 4179 SDLoc DL(Op); 4180 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), 4181 getPointerTy(DAG.getDataLayout())); 4182 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4183 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 4184 MachinePointerInfo(SV), false, false, 0); 4185 } 4186 4187 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 4188 SelectionDAG &DAG) const { 4189 // The layout of the va_list struct is specified in the AArch64 Procedure Call 4190 // Standard, section B.3. 
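// For reference, the AAPCS64 va_list layout that the stores below populate;
// the C spelling is only an illustration, but the field names and offsets
// follow section B.3 of the ABI document and the offsets used in this code:
//   typedef struct {
//     void *__stack;   // offset 0: next stacked argument
//     void *__gr_top;  // offset 8: byte past the saved general registers
//     void *__vr_top;  // offset 16: byte past the saved FP/SIMD registers
//     int __gr_offs;   // offset 24: negative offset from __gr_top (GP regs left)
//     int __vr_offs;   // offset 28: negative offset from __vr_top (FP regs left)
//   } va_list;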
4191 MachineFunction &MF = DAG.getMachineFunction(); 4192 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 4193 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4194 SDLoc DL(Op); 4195 4196 SDValue Chain = Op.getOperand(0); 4197 SDValue VAList = Op.getOperand(1); 4198 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4199 SmallVector<SDValue, 4> MemOps; 4200 4201 // void *__stack at offset 0 4202 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 4203 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 4204 MachinePointerInfo(SV), false, false, 8)); 4205 4206 // void *__gr_top at offset 8 4207 int GPRSize = FuncInfo->getVarArgsGPRSize(); 4208 if (GPRSize > 0) { 4209 SDValue GRTop, GRTopAddr; 4210 4211 GRTopAddr = 4212 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); 4213 4214 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 4215 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 4216 DAG.getConstant(GPRSize, DL, PtrVT)); 4217 4218 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 4219 MachinePointerInfo(SV, 8), false, false, 8)); 4220 } 4221 4222 // void *__vr_top at offset 16 4223 int FPRSize = FuncInfo->getVarArgsFPRSize(); 4224 if (FPRSize > 0) { 4225 SDValue VRTop, VRTopAddr; 4226 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4227 DAG.getConstant(16, DL, PtrVT)); 4228 4229 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 4230 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 4231 DAG.getConstant(FPRSize, DL, PtrVT)); 4232 4233 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 4234 MachinePointerInfo(SV, 16), false, false, 8)); 4235 } 4236 4237 // int __gr_offs at offset 24 4238 SDValue GROffsAddr = 4239 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); 4240 MemOps.push_back(DAG.getStore(Chain, DL, 4241 DAG.getConstant(-GPRSize, DL, MVT::i32), 4242 GROffsAddr, MachinePointerInfo(SV, 24), false, 4243 false, 4)); 4244 4245 // int __vr_offs at offset 28 4246 SDValue VROffsAddr = 4247 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); 4248 MemOps.push_back(DAG.getStore(Chain, DL, 4249 DAG.getConstant(-FPRSize, DL, MVT::i32), 4250 VROffsAddr, MachinePointerInfo(SV, 28), false, 4251 false, 4)); 4252 4253 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 4254 } 4255 4256 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 4257 SelectionDAG &DAG) const { 4258 return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) 4259 : LowerAAPCS_VASTART(Op, DAG); 4260 } 4261 4262 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 4263 SelectionDAG &DAG) const { 4264 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 4265 // pointer. 4266 SDLoc DL(Op); 4267 unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; 4268 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 4269 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 4270 4271 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), 4272 Op.getOperand(2), 4273 DAG.getConstant(VaListSize, DL, MVT::i32), 4274 8, false, false, false, MachinePointerInfo(DestSV), 4275 MachinePointerInfo(SrcSV)); 4276 } 4277 4278 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 4279 assert(Subtarget->isTargetDarwin() && 4280 "automatic va_arg instruction only works on Darwin"); 4281 4282 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4283 EVT VT = Op.getValueType(); 4284 SDLoc DL(Op); 4285 SDValue Chain = Op.getOperand(0); 4286 SDValue Addr = Op.getOperand(1); 4287 unsigned Align = Op.getConstantOperandVal(3); 4288 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4289 4290 SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), 4291 false, false, false, 0); 4292 Chain = VAList.getValue(1); 4293 4294 if (Align > 8) { 4295 assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); 4296 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4297 DAG.getConstant(Align - 1, DL, PtrVT)); 4298 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 4299 DAG.getConstant(-(int64_t)Align, DL, PtrVT)); 4300 } 4301 4302 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 4303 uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 4304 4305 // Scalar integer and FP values smaller than 64 bits are implicitly extended 4306 // up to 64 bits. At the very least, we have to increase the striding of the 4307 // vaargs list to match this, and for FP values we need to introduce 4308 // FP_ROUND nodes as well. 4309 if (VT.isInteger() && !VT.isVector()) 4310 ArgSize = 8; 4311 bool NeedFPTrunc = false; 4312 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 4313 ArgSize = 8; 4314 NeedFPTrunc = true; 4315 } 4316 4317 // Increment the pointer, VAList, to the next vaarg 4318 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 4319 DAG.getConstant(ArgSize, DL, PtrVT)); 4320 // Store the incremented VAList to the legalized pointer 4321 SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), 4322 false, false, 0); 4323 4324 // Load the actual argument out of the pointer VAList 4325 if (NeedFPTrunc) { 4326 // Load the value as an f64. 4327 SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, 4328 MachinePointerInfo(), false, false, false, 0); 4329 // Round the value down to an f32. 4330 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 4331 DAG.getIntPtrConstant(1, DL)); 4332 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 4333 // Merge the rounded value with the chain output of the load. 
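// (The MERGE_VALUES node built below just packages {rounded value, chain} so
// the user of the VAARG node still sees the usual value/chain pair. A
// hypothetical "float f = va_arg(ap, float);" takes this path: the 8-byte
// slot is loaded as f64 and narrowed back to f32 here.)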
4334 return DAG.getMergeValues(Ops, DL); 4335 } 4336 4337 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, 4338 false, false, 0); 4339 } 4340 4341 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 4342 SelectionDAG &DAG) const { 4343 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4344 MFI->setFrameAddressIsTaken(true); 4345 4346 EVT VT = Op.getValueType(); 4347 SDLoc DL(Op); 4348 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4349 SDValue FrameAddr = 4350 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 4351 while (Depth--) 4352 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 4353 MachinePointerInfo(), false, false, false, 0); 4354 return FrameAddr; 4355 } 4356 4357 // FIXME? Maybe this could be a TableGen attribute on some registers and 4358 // this table could be generated automatically from RegInfo. 4359 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, 4360 SelectionDAG &DAG) const { 4361 unsigned Reg = StringSwitch<unsigned>(RegName) 4362 .Case("sp", AArch64::SP) 4363 .Default(0); 4364 if (Reg) 4365 return Reg; 4366 report_fatal_error(Twine("Invalid register name \"" 4367 + StringRef(RegName) + "\".")); 4368 } 4369 4370 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 4371 SelectionDAG &DAG) const { 4372 MachineFunction &MF = DAG.getMachineFunction(); 4373 MachineFrameInfo *MFI = MF.getFrameInfo(); 4374 MFI->setReturnAddressIsTaken(true); 4375 4376 EVT VT = Op.getValueType(); 4377 SDLoc DL(Op); 4378 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4379 if (Depth) { 4380 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 4381 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 4382 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 4383 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), 4384 MachinePointerInfo(), false, false, false, 0); 4385 } 4386 4387 // Return LR, which contains the return address. Mark it an implicit live-in. 4388 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); 4389 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 4390 } 4391 4392 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 4393 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 4394 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, 4395 SelectionDAG &DAG) const { 4396 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4397 EVT VT = Op.getValueType(); 4398 unsigned VTBits = VT.getSizeInBits(); 4399 SDLoc dl(Op); 4400 SDValue ShOpLo = Op.getOperand(0); 4401 SDValue ShOpHi = Op.getOperand(1); 4402 SDValue ShAmt = Op.getOperand(2); 4403 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 4404 4405 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 4406 4407 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 4408 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 4409 SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 4410 4411 // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which 4412 // is "undef". We wanted 0, so CSEL it directly. 
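// For reference, the pair being assembled is, in C-like terms (a sketch
// only; the real result is built from the comparison/CSEL chain below):
//   Lo = (Amt >= 64) ? (ShOpHi >> (Amt - 64))   // SRA or SRL as appropriate
//                    : ((uint64_t)ShOpLo >> Amt) |
//                      (Amt ? (ShOpHi << (64 - Amt)) : 0);
//   Hi = (Amt >= 64) ? (IsSRA ? (ShOpHi >> 63) : 0) : (ShOpHi >> Amt);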
4413 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), 4414 ISD::SETEQ, dl, DAG); 4415 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); 4416 HiBitsForLo = 4417 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), 4418 HiBitsForLo, CCVal, Cmp); 4419 4420 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 4421 DAG.getConstant(VTBits, dl, MVT::i64)); 4422 4423 SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 4424 SDValue LoForNormalShift = 4425 DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); 4426 4427 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, 4428 dl, DAG); 4429 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); 4430 SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 4431 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 4432 LoForNormalShift, CCVal, Cmp); 4433 4434 // AArch64 shifts larger than the register width are wrapped rather than 4435 // clamped, so we can't just emit "hi >> x". 4436 SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 4437 SDValue HiForBigShift = 4438 Opc == ISD::SRA 4439 ? DAG.getNode(Opc, dl, VT, ShOpHi, 4440 DAG.getConstant(VTBits - 1, dl, MVT::i64)) 4441 : DAG.getConstant(0, dl, VT); 4442 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, 4443 HiForNormalShift, CCVal, Cmp); 4444 4445 SDValue Ops[2] = { Lo, Hi }; 4446 return DAG.getMergeValues(Ops, dl); 4447 } 4448 4449 4450 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 4451 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 4452 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, 4453 SelectionDAG &DAG) const { 4454 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4455 EVT VT = Op.getValueType(); 4456 unsigned VTBits = VT.getSizeInBits(); 4457 SDLoc dl(Op); 4458 SDValue ShOpLo = Op.getOperand(0); 4459 SDValue ShOpHi = Op.getOperand(1); 4460 SDValue ShAmt = Op.getOperand(2); 4461 4462 assert(Op.getOpcode() == ISD::SHL_PARTS); 4463 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 4464 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 4465 SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 4466 4467 // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which 4468 // is "undef". We wanted 0, so CSEL it directly. 4469 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), 4470 ISD::SETEQ, dl, DAG); 4471 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); 4472 LoBitsForHi = 4473 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), 4474 LoBitsForHi, CCVal, Cmp); 4475 4476 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 4477 DAG.getConstant(VTBits, dl, MVT::i64)); 4478 SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 4479 SDValue HiForNormalShift = 4480 DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); 4481 4482 SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 4483 4484 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, 4485 dl, DAG); 4486 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); 4487 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, 4488 HiForNormalShift, CCVal, Cmp); 4489 4490 // AArch64 shifts of larger than register sizes are wrapped rather than 4491 // clamped, so we can't just emit "lo << a" if a is too big. 
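// Sketch of the intended result in C-like terms (illustrative only):
//   Lo = (Amt >= 64) ? 0 : (ShOpLo << Amt);
//   Hi = (Amt >= 64) ? (ShOpLo << (Amt - 64))
//                    : (ShOpHi << Amt) |
//                      (Amt ? ((uint64_t)ShOpLo >> (64 - Amt)) : 0);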
4492 SDValue LoForBigShift = DAG.getConstant(0, dl, VT); 4493 SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4494 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 4495 LoForNormalShift, CCVal, Cmp); 4496 4497 SDValue Ops[2] = { Lo, Hi }; 4498 return DAG.getMergeValues(Ops, dl); 4499 } 4500 4501 bool AArch64TargetLowering::isOffsetFoldingLegal( 4502 const GlobalAddressSDNode *GA) const { 4503 // The AArch64 target doesn't support folding offsets into global addresses. 4504 return false; 4505 } 4506 4507 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 4508 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. 4509 // FIXME: We should be able to handle f128 as well with a clever lowering. 4510 if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) 4511 return true; 4512 4513 if (VT == MVT::f64) 4514 return AArch64_AM::getFP64Imm(Imm) != -1; 4515 else if (VT == MVT::f32) 4516 return AArch64_AM::getFP32Imm(Imm) != -1; 4517 return false; 4518 } 4519 4520 //===----------------------------------------------------------------------===// 4521 // AArch64 Optimization Hooks 4522 //===----------------------------------------------------------------------===// 4523 4524 //===----------------------------------------------------------------------===// 4525 // AArch64 Inline Assembly Support 4526 //===----------------------------------------------------------------------===// 4527 4528 // Table of Constraints 4529 // TODO: This is the current set of constraints supported by ARM for the 4530 // compiler, not all of them may make sense, e.g. S may be difficult to support. 4531 // 4532 // r - A general register 4533 // w - An FP/SIMD register of some size in the range v0-v31 4534 // x - An FP/SIMD register of some size in the range v0-v15 4535 // I - Constant that can be used with an ADD instruction 4536 // J - Constant that can be used with a SUB instruction 4537 // K - Constant that can be used with a 32-bit logical instruction 4538 // L - Constant that can be used with a 64-bit logical instruction 4539 // M - Constant that can be used as a 32-bit MOV immediate 4540 // N - Constant that can be used as a 64-bit MOV immediate 4541 // Q - A memory reference with base register and no offset 4542 // S - A symbolic address 4543 // Y - Floating point constant zero 4544 // Z - Integer constant zero 4545 // 4546 // Note that general register operands will be output using their 64-bit x 4547 // register name, whatever the size of the variable, unless the asm operand 4548 // is prefixed by the %w modifier. Floating-point and SIMD register operands 4549 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 4550 // %q modifier. 4551 4552 /// getConstraintType - Given a constraint letter, return the type of 4553 /// constraint it is for this target. 4554 AArch64TargetLowering::ConstraintType 4555 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 4556 if (Constraint.size() == 1) { 4557 switch (Constraint[0]) { 4558 default: 4559 break; 4560 case 'z': 4561 return C_Other; 4562 case 'x': 4563 case 'w': 4564 return C_RegisterClass; 4565 // An address with a single base register. Due to the way we 4566 // currently handle addresses it is the same as 'r'. 4567 case 'Q': 4568 return C_Memory; 4569 } 4570 } 4571 return TargetLowering::getConstraintType(Constraint); 4572 } 4573 4574 /// Examine constraint type and operand type and determine a weight value. 
4575 /// This object must already have been set up with the operand type 4576 /// and the current alternative constraint selected. 4577 TargetLowering::ConstraintWeight 4578 AArch64TargetLowering::getSingleConstraintMatchWeight( 4579 AsmOperandInfo &info, const char *constraint) const { 4580 ConstraintWeight weight = CW_Invalid; 4581 Value *CallOperandVal = info.CallOperandVal; 4582 // If we don't have a value, we can't do a match, 4583 // but allow it at the lowest weight. 4584 if (!CallOperandVal) 4585 return CW_Default; 4586 Type *type = CallOperandVal->getType(); 4587 // Look at the constraint type. 4588 switch (*constraint) { 4589 default: 4590 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 4591 break; 4592 case 'x': 4593 case 'w': 4594 if (type->isFloatingPointTy() || type->isVectorTy()) 4595 weight = CW_Register; 4596 break; 4597 case 'z': 4598 weight = CW_Constant; 4599 break; 4600 } 4601 return weight; 4602 } 4603 4604 std::pair<unsigned, const TargetRegisterClass *> 4605 AArch64TargetLowering::getRegForInlineAsmConstraint( 4606 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 4607 if (Constraint.size() == 1) { 4608 switch (Constraint[0]) { 4609 case 'r': 4610 if (VT.getSizeInBits() == 64) 4611 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 4612 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 4613 case 'w': 4614 if (VT == MVT::f32) 4615 return std::make_pair(0U, &AArch64::FPR32RegClass); 4616 if (VT.getSizeInBits() == 64) 4617 return std::make_pair(0U, &AArch64::FPR64RegClass); 4618 if (VT.getSizeInBits() == 128) 4619 return std::make_pair(0U, &AArch64::FPR128RegClass); 4620 break; 4621 // The instructions that this constraint is designed for can 4622 // only take 128-bit registers so just use that regclass. 4623 case 'x': 4624 if (VT.getSizeInBits() == 128) 4625 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 4626 break; 4627 } 4628 } 4629 if (StringRef("{cc}").equals_lower(Constraint)) 4630 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 4631 4632 // Use the default implementation in TargetLowering to convert the register 4633 // constraint into a member of a register class. 4634 std::pair<unsigned, const TargetRegisterClass *> Res; 4635 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4636 4637 // Not found as a standard register? 4638 if (!Res.second) { 4639 unsigned Size = Constraint.size(); 4640 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 4641 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 4642 int RegNo; 4643 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); 4644 if (!Failed && RegNo >= 0 && RegNo <= 31) { 4645 // v0 - v31 are aliases of q0 - q31. 4646 // By default we'll emit v0-v31 for this unless there's a modifier where 4647 // we'll emit the correct register as well. 4648 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 4649 Res.second = &AArch64::FPR128RegClass; 4650 } 4651 } 4652 } 4653 4654 return Res; 4655 } 4656 4657 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 4658 /// vector. If it is invalid, don't add anything to Ops. 4659 void AArch64TargetLowering::LowerAsmOperandForConstraint( 4660 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 4661 SelectionDAG &DAG) const { 4662 SDValue Result; 4663 4664 // Currently only support length 1 constraints. 
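// Operands typically reach this hook from user inline asm such as
// (illustrative example, not from this file):
//   asm("add %w0, %w1, %2" : "=r"(Res) : "r"(A), "I"(4095));
// where the "I" operand arrives as a ConstantSDNode and must pass the range
// checks below before being rewritten into a plain target constant.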
4665 if (Constraint.length() != 1) 4666 return; 4667 4668 char ConstraintLetter = Constraint[0]; 4669 switch (ConstraintLetter) { 4670 default: 4671 break; 4672 4673 // This set of constraints deal with valid constants for various instructions. 4674 // Validate and return a target constant for them if we can. 4675 case 'z': { 4676 // 'z' maps to xzr or wzr so it needs an input of 0. 4677 if (!isNullConstant(Op)) 4678 return; 4679 4680 if (Op.getValueType() == MVT::i64) 4681 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 4682 else 4683 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 4684 break; 4685 } 4686 4687 case 'I': 4688 case 'J': 4689 case 'K': 4690 case 'L': 4691 case 'M': 4692 case 'N': 4693 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4694 if (!C) 4695 return; 4696 4697 // Grab the value and do some validation. 4698 uint64_t CVal = C->getZExtValue(); 4699 switch (ConstraintLetter) { 4700 // The I constraint applies only to simple ADD or SUB immediate operands: 4701 // i.e. 0 to 4095 with optional shift by 12 4702 // The J constraint applies only to ADD or SUB immediates that would be 4703 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 4704 // instruction [or vice versa], in other words -1 to -4095 with optional 4705 // left shift by 12. 4706 case 'I': 4707 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 4708 break; 4709 return; 4710 case 'J': { 4711 uint64_t NVal = -C->getSExtValue(); 4712 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 4713 CVal = C->getSExtValue(); 4714 break; 4715 } 4716 return; 4717 } 4718 // The K and L constraints apply *only* to logical immediates, including 4719 // what used to be the MOVI alias for ORR (though the MOVI alias has now 4720 // been removed and MOV should be used). So these constraints have to 4721 // distinguish between bit patterns that are valid 32-bit or 64-bit 4722 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 4723 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 4724 // versa. 4725 case 'K': 4726 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4727 break; 4728 return; 4729 case 'L': 4730 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4731 break; 4732 return; 4733 // The M and N constraints are a superset of K and L respectively, for use 4734 // with the MOV (immediate) alias. As well as the logical immediates they 4735 // also match 32 or 64-bit immediates that can be loaded either using a 4736 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 4737 // (M) or 64-bit 0x1234000000000000 (N) etc. 4738 // As a note some of this code is liberally stolen from the asm parser. 
4739 case 'M': { 4740 if (!isUInt<32>(CVal)) 4741 return; 4742 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4743 break; 4744 if ((CVal & 0xFFFF) == CVal) 4745 break; 4746 if ((CVal & 0xFFFF0000ULL) == CVal) 4747 break; 4748 uint64_t NCVal = ~(uint32_t)CVal; 4749 if ((NCVal & 0xFFFFULL) == NCVal) 4750 break; 4751 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4752 break; 4753 return; 4754 } 4755 case 'N': { 4756 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4757 break; 4758 if ((CVal & 0xFFFFULL) == CVal) 4759 break; 4760 if ((CVal & 0xFFFF0000ULL) == CVal) 4761 break; 4762 if ((CVal & 0xFFFF00000000ULL) == CVal) 4763 break; 4764 if ((CVal & 0xFFFF000000000000ULL) == CVal) 4765 break; 4766 uint64_t NCVal = ~CVal; 4767 if ((NCVal & 0xFFFFULL) == NCVal) 4768 break; 4769 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4770 break; 4771 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 4772 break; 4773 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 4774 break; 4775 return; 4776 } 4777 default: 4778 return; 4779 } 4780 4781 // All assembler immediates are 64-bit integers. 4782 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); 4783 break; 4784 } 4785 4786 if (Result.getNode()) { 4787 Ops.push_back(Result); 4788 return; 4789 } 4790 4791 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 4792 } 4793 4794 //===----------------------------------------------------------------------===// 4795 // AArch64 Advanced SIMD Support 4796 //===----------------------------------------------------------------------===// 4797 4798 /// WidenVector - Given a value in the V64 register class, produce the 4799 /// equivalent value in the V128 register class. 4800 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 4801 EVT VT = V64Reg.getValueType(); 4802 unsigned NarrowSize = VT.getVectorNumElements(); 4803 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4804 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 4805 SDLoc DL(V64Reg); 4806 4807 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 4808 V64Reg, DAG.getConstant(0, DL, MVT::i32)); 4809 } 4810 4811 /// getExtFactor - Determine the adjustment factor for the position when 4812 /// generating an "extract from vector registers" instruction. 4813 static unsigned getExtFactor(SDValue &V) { 4814 EVT EltType = V.getValueType().getVectorElementType(); 4815 return EltType.getSizeInBits() / 8; 4816 } 4817 4818 /// NarrowVector - Given a value in the V128 register class, produce the 4819 /// equivalent value in the V64 register class. 4820 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 4821 EVT VT = V128Reg.getValueType(); 4822 unsigned WideSize = VT.getVectorNumElements(); 4823 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4824 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 4825 SDLoc DL(V128Reg); 4826 4827 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 4828 } 4829 4830 // Gather data to see if the operation can be modelled as a 4831 // shuffle in combination with VEXTs. 
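// For example (illustrative), with A and B both of type v4i32:
//   (v4i32 build_vector (extract_elt A, 2), (extract_elt A, 3),
//                       (extract_elt B, 0), (extract_elt B, 1))
// can be rebuilt as (vector_shuffle A, B, <2, 3, 4, 5>), which in turn
// matches a single EXT instruction.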
4832 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 4833 SelectionDAG &DAG) const { 4834 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 4835 SDLoc dl(Op); 4836 EVT VT = Op.getValueType(); 4837 unsigned NumElts = VT.getVectorNumElements(); 4838 4839 struct ShuffleSourceInfo { 4840 SDValue Vec; 4841 unsigned MinElt; 4842 unsigned MaxElt; 4843 4844 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 4845 // be compatible with the shuffle we intend to construct. As a result 4846 // ShuffleVec will be some sliding window into the original Vec. 4847 SDValue ShuffleVec; 4848 4849 // Code should guarantee that element i in Vec starts at element "WindowBase 4850 // + i * WindowScale in ShuffleVec". 4851 int WindowBase; 4852 int WindowScale; 4853 4854 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 4855 ShuffleSourceInfo(SDValue Vec) 4856 : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), 4857 WindowScale(1) {} 4858 }; 4859 4860 // First gather all vectors used as an immediate source for this BUILD_VECTOR 4861 // node. 4862 SmallVector<ShuffleSourceInfo, 2> Sources; 4863 for (unsigned i = 0; i < NumElts; ++i) { 4864 SDValue V = Op.getOperand(i); 4865 if (V.getOpcode() == ISD::UNDEF) 4866 continue; 4867 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 4868 // A shuffle can only come from building a vector from various 4869 // elements of other vectors. 4870 return SDValue(); 4871 } 4872 4873 // Add this element source to the list if it's not already there. 4874 SDValue SourceVec = V.getOperand(0); 4875 auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); 4876 if (Source == Sources.end()) 4877 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 4878 4879 // Update the minimum and maximum lane number seen. 4880 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 4881 Source->MinElt = std::min(Source->MinElt, EltNo); 4882 Source->MaxElt = std::max(Source->MaxElt, EltNo); 4883 } 4884 4885 // Currently only do something sane when at most two source vectors 4886 // are involved. 4887 if (Sources.size() > 2) 4888 return SDValue(); 4889 4890 // Find out the smallest element size among result and two sources, and use 4891 // it as element size to build the shuffle_vector. 4892 EVT SmallestEltTy = VT.getVectorElementType(); 4893 for (auto &Source : Sources) { 4894 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 4895 if (SrcEltTy.bitsLT(SmallestEltTy)) { 4896 SmallestEltTy = SrcEltTy; 4897 } 4898 } 4899 unsigned ResMultiplier = 4900 VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); 4901 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 4902 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 4903 4904 // If the source vector is too wide or too narrow, we may nevertheless be able 4905 // to construct a compatible shuffle either by concatenating it with UNDEF or 4906 // extracting a suitable range of elements. 4907 for (auto &Src : Sources) { 4908 EVT SrcVT = Src.ShuffleVec.getValueType(); 4909 4910 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 4911 continue; 4912 4913 // This stage of the search produces a source with the same element type as 4914 // the original, but with a total width matching the BUILD_VECTOR output. 
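// E.g. (illustrative): for a v4i32 result a v2i32 source is padded with
// undef up to v4i32, while for a v4i16 result a v8i16 source is narrowed by
// extracting one half, or by an EXT window if the used lanes straddle both
// halves.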
4915 EVT EltVT = SrcVT.getVectorElementType(); 4916 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 4917 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 4918 4919 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 4920 assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); 4921 // We can pad out the smaller vector for free, so if it's part of a 4922 // shuffle... 4923 Src.ShuffleVec = 4924 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 4925 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 4926 continue; 4927 } 4928 4929 assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); 4930 4931 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 4932 // Span too large for a VEXT to cope 4933 return SDValue(); 4934 } 4935 4936 if (Src.MinElt >= NumSrcElts) { 4937 // The extraction can just take the second half 4938 Src.ShuffleVec = 4939 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4940 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 4941 Src.WindowBase = -NumSrcElts; 4942 } else if (Src.MaxElt < NumSrcElts) { 4943 // The extraction can just take the first half 4944 Src.ShuffleVec = 4945 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4946 DAG.getConstant(0, dl, MVT::i64)); 4947 } else { 4948 // An actual VEXT is needed 4949 SDValue VEXTSrc1 = 4950 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4951 DAG.getConstant(0, dl, MVT::i64)); 4952 SDValue VEXTSrc2 = 4953 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4954 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 4955 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 4956 4957 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 4958 VEXTSrc2, 4959 DAG.getConstant(Imm, dl, MVT::i32)); 4960 Src.WindowBase = -Src.MinElt; 4961 } 4962 } 4963 4964 // Another possible incompatibility occurs from the vector element types. We 4965 // can fix this by bitcasting the source vectors to the same type we intend 4966 // for the shuffle. 4967 for (auto &Src : Sources) { 4968 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 4969 if (SrcEltTy == SmallestEltTy) 4970 continue; 4971 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 4972 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 4973 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 4974 Src.WindowBase *= Src.WindowScale; 4975 } 4976 4977 // Final sanity check before we try to actually produce a shuffle. 4978 DEBUG( 4979 for (auto Src : Sources) 4980 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 4981 ); 4982 4983 // The stars all align, our next step is to produce the mask for the shuffle. 4984 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 4985 int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); 4986 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 4987 SDValue Entry = Op.getOperand(i); 4988 if (Entry.getOpcode() == ISD::UNDEF) 4989 continue; 4990 4991 auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); 4992 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 4993 4994 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 4995 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 4996 // segment. 
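// E.g. (illustrative): pulling an i8 lane into an i32 BUILD_VECTOR only
// defines the low 8 bits of that result element, so only one byte-sized
// shuffle lane receives a real index; the remaining lanes stay -1 (undef).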
4997 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
4998 int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
4999 VT.getVectorElementType().getSizeInBits());
5000 int LanesDefined = BitsDefined / BitsPerShuffleLane;
5001 
5002 // This source is expected to fill ResMultiplier lanes of the final shuffle,
5003 // starting at the appropriate offset.
5004 int *LaneMask = &Mask[i * ResMultiplier];
5005 
5006 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
5007 ExtractBase += NumElts * (Src - Sources.begin());
5008 for (int j = 0; j < LanesDefined; ++j)
5009 LaneMask[j] = ExtractBase + j;
5010 }
5011 
5012 // Final check before we try to produce nonsense...
5013 if (!isShuffleMaskLegal(Mask, ShuffleVT))
5014 return SDValue();
5015 
5016 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
5017 for (unsigned i = 0; i < Sources.size(); ++i)
5018 ShuffleOps[i] = Sources[i].ShuffleVec;
5019 
5020 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
5021 ShuffleOps[1], &Mask[0]);
5022 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
5023 }
5024 
5025 // Check if an EXT instruction can handle the shuffle mask when the
5026 // vector sources of the shuffle are the same.
5027 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5028 unsigned NumElts = VT.getVectorNumElements();
5029 
5030 // Assume that the first shuffle index is not UNDEF. Fail if it is.
5031 if (M[0] < 0)
5032 return false;
5033 
5034 Imm = M[0];
5035 
5036 // If this is a VEXT shuffle, the immediate value is the index of the first
5037 // element. The other shuffle indices must be the successive elements after
5038 // the first one.
5039 unsigned ExpectedElt = Imm;
5040 for (unsigned i = 1; i < NumElts; ++i) {
5041 // Increment the expected index. If it wraps around, just follow it
5042 // back to index zero and keep going.
5043 ++ExpectedElt;
5044 if (ExpectedElt == NumElts)
5045 ExpectedElt = 0;
5046 
5047 if (M[i] < 0)
5048 continue; // ignore UNDEF indices
5049 if (ExpectedElt != static_cast<unsigned>(M[i]))
5050 return false;
5051 }
5052 
5053 return true;
5054 }
5055 
5056 // Check if an EXT instruction can handle the shuffle mask when the
5057 // vector sources of the shuffle are different.
5058 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
5059 unsigned &Imm) {
5060 // Look for the first non-undef element.
5061 const int *FirstRealElt = std::find_if(M.begin(), M.end(),
5062 [](int Elt) {return Elt >= 0;});
5063 
5064 // Benefit from APInt to handle overflow when calculating the expected element.
5065 unsigned NumElts = VT.getVectorNumElements();
5066 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
5067 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
5068 // The following shuffle indices must be the successive elements after the
5069 // first real element.
5070 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
5071 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
5072 if (FirstWrongElt != M.end())
5073 return false;
5074 
5075 // The index of an EXT is the first element if it is not UNDEF.
5076 // Watch out for the beginning UNDEFs. The EXT index should be the expected
5077 // value of the first element. E.g.
5078 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
5079 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
5080 // ExpectedElt is the last mask index plus 1.
5081 Imm = ExpectedElt.getZExtValue();
5082 
5083 // There are two different cases that require reversing the input vectors.
5084 // For example, for vector <4 x i32> we have the following cases:
5085 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
5086 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
5087 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
5088 // reversing the two input vectors.
5089 if (Imm < NumElts)
5090 ReverseEXT = true;
5091 else
5092 Imm -= NumElts;
5093 
5094 return true;
5095 }
5096 
5097 /// isREVMask - Check if a vector shuffle corresponds to a REV
5098 /// instruction with the specified blocksize. (The order of the elements
5099 /// within each block of the vector is reversed.)
5100 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
5101 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
5102 "Only possible block sizes for REV are: 16, 32, 64");
5103 
5104 unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5105 if (EltSz == 64)
5106 return false;
5107 
5108 unsigned NumElts = VT.getVectorNumElements();
5109 unsigned BlockElts = M[0] + 1;
5110 // If the first shuffle index is UNDEF, be optimistic.
5111 if (M[0] < 0)
5112 BlockElts = BlockSize / EltSz;
5113 
5114 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5115 return false;
5116 
5117 for (unsigned i = 0; i < NumElts; ++i) {
5118 if (M[i] < 0)
5119 continue; // ignore UNDEF indices
5120 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
5121 return false;
5122 }
5123 
5124 return true;
5125 }
5126 
5127 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5128 unsigned NumElts = VT.getVectorNumElements();
5129 WhichResult = (M[0] == 0 ? 0 : 1);
5130 unsigned Idx = WhichResult * NumElts / 2;
5131 for (unsigned i = 0; i != NumElts; i += 2) {
5132 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
5133 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
5134 return false;
5135 Idx += 1;
5136 }
5137 
5138 return true;
5139 }
5140 
5141 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5142 unsigned NumElts = VT.getVectorNumElements();
5143 WhichResult = (M[0] == 0 ? 0 : 1);
5144 for (unsigned i = 0; i != NumElts; ++i) {
5145 if (M[i] < 0)
5146 continue; // ignore UNDEF indices
5147 if ((unsigned)M[i] != 2 * i + WhichResult)
5148 return false;
5149 }
5150 
5151 return true;
5152 }
5153 
5154 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5155 unsigned NumElts = VT.getVectorNumElements();
5156 WhichResult = (M[0] == 0 ? 0 : 1);
5157 for (unsigned i = 0; i < NumElts; i += 2) {
5158 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
5159 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
5160 return false;
5161 }
5162 return true;
5163 }
5164 
5165 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
5166 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5167 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
5168 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5169 unsigned NumElts = VT.getVectorNumElements();
5170 WhichResult = (M[0] == 0 ?
0 : 1); 5171 unsigned Idx = WhichResult * NumElts / 2; 5172 for (unsigned i = 0; i != NumElts; i += 2) { 5173 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 5174 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 5175 return false; 5176 Idx += 1; 5177 } 5178 5179 return true; 5180 } 5181 5182 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 5183 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5184 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 5185 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5186 unsigned Half = VT.getVectorNumElements() / 2; 5187 WhichResult = (M[0] == 0 ? 0 : 1); 5188 for (unsigned j = 0; j != 2; ++j) { 5189 unsigned Idx = WhichResult; 5190 for (unsigned i = 0; i != Half; ++i) { 5191 int MIdx = M[i + j * Half]; 5192 if (MIdx >= 0 && (unsigned)MIdx != Idx) 5193 return false; 5194 Idx += 2; 5195 } 5196 } 5197 5198 return true; 5199 } 5200 5201 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 5202 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5203 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 5204 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5205 unsigned NumElts = VT.getVectorNumElements(); 5206 WhichResult = (M[0] == 0 ? 0 : 1); 5207 for (unsigned i = 0; i < NumElts; i += 2) { 5208 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 5209 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 5210 return false; 5211 } 5212 return true; 5213 } 5214 5215 static bool isINSMask(ArrayRef<int> M, int NumInputElements, 5216 bool &DstIsLeft, int &Anomaly) { 5217 if (M.size() != static_cast<size_t>(NumInputElements)) 5218 return false; 5219 5220 int NumLHSMatch = 0, NumRHSMatch = 0; 5221 int LastLHSMismatch = -1, LastRHSMismatch = -1; 5222 5223 for (int i = 0; i < NumInputElements; ++i) { 5224 if (M[i] == -1) { 5225 ++NumLHSMatch; 5226 ++NumRHSMatch; 5227 continue; 5228 } 5229 5230 if (M[i] == i) 5231 ++NumLHSMatch; 5232 else 5233 LastLHSMismatch = i; 5234 5235 if (M[i] == i + NumInputElements) 5236 ++NumRHSMatch; 5237 else 5238 LastRHSMismatch = i; 5239 } 5240 5241 if (NumLHSMatch == NumInputElements - 1) { 5242 DstIsLeft = true; 5243 Anomaly = LastLHSMismatch; 5244 return true; 5245 } else if (NumRHSMatch == NumInputElements - 1) { 5246 DstIsLeft = false; 5247 Anomaly = LastRHSMismatch; 5248 return true; 5249 } 5250 5251 return false; 5252 } 5253 5254 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 5255 if (VT.getSizeInBits() != 128) 5256 return false; 5257 5258 unsigned NumElts = VT.getVectorNumElements(); 5259 5260 for (int I = 0, E = NumElts / 2; I != E; I++) { 5261 if (Mask[I] != I) 5262 return false; 5263 } 5264 5265 int Offset = NumElts / 2; 5266 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 5267 if (Mask[I] != I + SplitLHS * Offset) 5268 return false; 5269 } 5270 5271 return true; 5272 } 5273 5274 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 5275 SDLoc DL(Op); 5276 EVT VT = Op.getValueType(); 5277 SDValue V0 = Op.getOperand(0); 5278 SDValue V1 = Op.getOperand(1); 5279 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 5280 5281 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 5282 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 5283 return SDValue(); 5284 5285 bool SplitV0 = V0.getValueType().getSizeInBits() == 128; 5286 5287 if (!isConcatMask(Mask, VT, SplitV0)) 5288 return SDValue(); 5289 
5290 EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 5291 VT.getVectorNumElements() / 2); 5292 if (SplitV0) { 5293 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, 5294 DAG.getConstant(0, DL, MVT::i64)); 5295 } 5296 if (V1.getValueType().getSizeInBits() == 128) { 5297 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, 5298 DAG.getConstant(0, DL, MVT::i64)); 5299 } 5300 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); 5301 } 5302 5303 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 5304 /// the specified operations to build the shuffle. 5305 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 5306 SDValue RHS, SelectionDAG &DAG, 5307 SDLoc dl) { 5308 unsigned OpNum = (PFEntry >> 26) & 0x0F; 5309 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); 5310 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); 5311 5312 enum { 5313 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 5314 OP_VREV, 5315 OP_VDUP0, 5316 OP_VDUP1, 5317 OP_VDUP2, 5318 OP_VDUP3, 5319 OP_VEXT1, 5320 OP_VEXT2, 5321 OP_VEXT3, 5322 OP_VUZPL, // VUZP, left result 5323 OP_VUZPR, // VUZP, right result 5324 OP_VZIPL, // VZIP, left result 5325 OP_VZIPR, // VZIP, right result 5326 OP_VTRNL, // VTRN, left result 5327 OP_VTRNR // VTRN, right result 5328 }; 5329 5330 if (OpNum == OP_COPY) { 5331 if (LHSID == (1 * 9 + 2) * 9 + 3) 5332 return LHS; 5333 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); 5334 return RHS; 5335 } 5336 5337 SDValue OpLHS, OpRHS; 5338 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 5339 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 5340 EVT VT = OpLHS.getValueType(); 5341 5342 switch (OpNum) { 5343 default: 5344 llvm_unreachable("Unknown shuffle opcode!"); 5345 case OP_VREV: 5346 // VREV divides the vector in half and swaps within the half. 
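    // E.g. (illustrative) for v4i32, REV64 turns lanes <0, 1, 2, 3> into
    // <1, 0, 3, 2>: the two 32-bit elements within each 64-bit chunk swap.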
5347 if (VT.getVectorElementType() == MVT::i32 || 5348 VT.getVectorElementType() == MVT::f32) 5349 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); 5350 // vrev <4 x i16> -> REV32 5351 if (VT.getVectorElementType() == MVT::i16 || 5352 VT.getVectorElementType() == MVT::f16) 5353 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); 5354 // vrev <4 x i8> -> REV16 5355 assert(VT.getVectorElementType() == MVT::i8); 5356 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); 5357 case OP_VDUP0: 5358 case OP_VDUP1: 5359 case OP_VDUP2: 5360 case OP_VDUP3: { 5361 EVT EltTy = VT.getVectorElementType(); 5362 unsigned Opcode; 5363 if (EltTy == MVT::i8) 5364 Opcode = AArch64ISD::DUPLANE8; 5365 else if (EltTy == MVT::i16 || EltTy == MVT::f16) 5366 Opcode = AArch64ISD::DUPLANE16; 5367 else if (EltTy == MVT::i32 || EltTy == MVT::f32) 5368 Opcode = AArch64ISD::DUPLANE32; 5369 else if (EltTy == MVT::i64 || EltTy == MVT::f64) 5370 Opcode = AArch64ISD::DUPLANE64; 5371 else 5372 llvm_unreachable("Invalid vector element type?"); 5373 5374 if (VT.getSizeInBits() == 64) 5375 OpLHS = WidenVector(OpLHS, DAG); 5376 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); 5377 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); 5378 } 5379 case OP_VEXT1: 5380 case OP_VEXT2: 5381 case OP_VEXT3: { 5382 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); 5383 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, 5384 DAG.getConstant(Imm, dl, MVT::i32)); 5385 } 5386 case OP_VUZPL: 5387 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, 5388 OpRHS); 5389 case OP_VUZPR: 5390 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, 5391 OpRHS); 5392 case OP_VZIPL: 5393 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, 5394 OpRHS); 5395 case OP_VZIPR: 5396 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 5397 OpRHS); 5398 case OP_VTRNL: 5399 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 5400 OpRHS); 5401 case OP_VTRNR: 5402 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 5403 OpRHS); 5404 } 5405 } 5406 5407 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, 5408 SelectionDAG &DAG) { 5409 // Check to see if we can use the TBL instruction. 
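// TBL works on byte indices, so the element-level shuffle mask is expanded
// into a byte-level index vector first. E.g. (illustrative) a v4i16 mask of
// <0, 3, 5, 7> becomes the byte indices <0,1, 6,7, 10,11, 14,15>, which then
// feed a tbl1 (one table register) or tbl2 (two table registers) lookup.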
5410 SDValue V1 = Op.getOperand(0); 5411 SDValue V2 = Op.getOperand(1); 5412 SDLoc DL(Op); 5413 5414 EVT EltVT = Op.getValueType().getVectorElementType(); 5415 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 5416 5417 SmallVector<SDValue, 8> TBLMask; 5418 for (int Val : ShuffleMask) { 5419 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5420 unsigned Offset = Byte + Val * BytesPerElt; 5421 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); 5422 } 5423 } 5424 5425 MVT IndexVT = MVT::v8i8; 5426 unsigned IndexLen = 8; 5427 if (Op.getValueType().getSizeInBits() == 128) { 5428 IndexVT = MVT::v16i8; 5429 IndexLen = 16; 5430 } 5431 5432 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 5433 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 5434 5435 SDValue Shuffle; 5436 if (V2.getNode()->getOpcode() == ISD::UNDEF) { 5437 if (IndexLen == 8) 5438 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 5439 Shuffle = DAG.getNode( 5440 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5441 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 5442 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5443 makeArrayRef(TBLMask.data(), IndexLen))); 5444 } else { 5445 if (IndexLen == 8) { 5446 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 5447 Shuffle = DAG.getNode( 5448 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5449 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 5450 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5451 makeArrayRef(TBLMask.data(), IndexLen))); 5452 } else { 5453 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 5454 // cannot currently represent the register constraints on the input 5455 // table registers. 5456 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 5457 // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5458 // &TBLMask[0], IndexLen)); 5459 Shuffle = DAG.getNode( 5460 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5461 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), 5462 V1Cst, V2Cst, 5463 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5464 makeArrayRef(TBLMask.data(), IndexLen))); 5465 } 5466 } 5467 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 5468 } 5469 5470 static unsigned getDUPLANEOp(EVT EltType) { 5471 if (EltType == MVT::i8) 5472 return AArch64ISD::DUPLANE8; 5473 if (EltType == MVT::i16 || EltType == MVT::f16) 5474 return AArch64ISD::DUPLANE16; 5475 if (EltType == MVT::i32 || EltType == MVT::f32) 5476 return AArch64ISD::DUPLANE32; 5477 if (EltType == MVT::i64 || EltType == MVT::f64) 5478 return AArch64ISD::DUPLANE64; 5479 5480 llvm_unreachable("Invalid vector element type?"); 5481 } 5482 5483 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 5484 SelectionDAG &DAG) const { 5485 SDLoc dl(Op); 5486 EVT VT = Op.getValueType(); 5487 5488 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 5489 5490 // Convert shuffles that are directly supported on NEON to target-specific 5491 // DAG nodes, instead of keeping them as shuffles and matching them again 5492 // during code selection. This is more efficient and avoids the possibility 5493 // of inconsistencies between legalization and selection. 
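// Roughly, the cheap patterns are tried first: splats become DUP/DUPLANE,
// whole-element reversals become REV64/REV32/REV16, and sliding-window masks
// become EXT, before falling back to the more general expansions that
// follow.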
5494 ArrayRef<int> ShuffleMask = SVN->getMask();
5495 
5496 SDValue V1 = Op.getOperand(0);
5497 SDValue V2 = Op.getOperand(1);
5498 
5499 if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
5500 V1.getValueType().getSimpleVT())) {
5501 int Lane = SVN->getSplatIndex();
5502 // If this is an undef splat, generate it via "just" vdup, if possible.
5503 if (Lane == -1)
5504 Lane = 0;
5505 
5506 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
5507 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
5508 V1.getOperand(0));
5509 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
5510 // constant. If so, we can just reference the lane's definition directly.
5511 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
5512 !isa<ConstantSDNode>(V1.getOperand(Lane)))
5513 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
5514 
5515 // Otherwise, duplicate from the lane of the input vector.
5516 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
5517 
5518 // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
5519 // to make a vector of the same size as this SHUFFLE. We can ignore the
5520 // extract entirely, and canonicalise the concat using WidenVector.
5521 if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5522 Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
5523 V1 = V1.getOperand(0);
5524 } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
5525 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
5526 Lane -= Idx * VT.getVectorNumElements() / 2;
5527 V1 = WidenVector(V1.getOperand(Idx), DAG);
5528 } else if (VT.getSizeInBits() == 64)
5529 V1 = WidenVector(V1, DAG);
5530 
5531 return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
5532 }
5533 
5534 if (isREVMask(ShuffleMask, VT, 64))
5535 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
5536 if (isREVMask(ShuffleMask, VT, 32))
5537 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
5538 if (isREVMask(ShuffleMask, VT, 16))
5539 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
5540 
5541 bool ReverseEXT = false;
5542 unsigned Imm;
5543 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
5544 if (ReverseEXT)
5545 std::swap(V1, V2);
5546 Imm *= getExtFactor(V1);
5547 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1